Exemplo n.º 1
0
    def vannotate(self, reuse=False):
        '''
        Annotate self.vcf with the varant (GCN) annotator script.

        input: self.vcf. When self.ref_exon_only > 0 it is first masked with a
            RefGene exon BED (exons extended by 20 bp) through vcf_mask.by_bed.
        output: <self.out_dir>/divine.vcf; self.vcf points to it on return.
        reuse: when True and divine.vcf already passes
            lib_utils.check_if_file_valid, masking and annotation are skipped.
        '''
        job_name = 'vannotate'

        def announce(text):
            # each progress message is sent to lib_utils.msgout and the logger
            lib_utils.msgout('notice', text)
            self.logger.info(text)

        announce('annotating VCF file[%s;%s] ...' % (job_name, self.vcf))

        # final annotated VCF
        varant_vcf = os.path.join(self.out_dir, 'divine.vcf')

        # temporary masked VCF; stays None when no masking takes place
        coding_vcf = None
        if self.ref_exon_only > 0 and not (
                lib_utils.check_if_file_valid(varant_vcf) and reuse):
            ref_table = annotateRegion.RefGeneUcscTB(work_dir=self.out_dir,
                                                     logger=self.logger)
            coding_bed_fn = ref_table.create_bed(ext_bp=20, reuse=False)

            announce('extracting variants in coding region from [%s] @ %s ...'
                     % (self.vcf, job_name))

            coding_vcf = os.path.join(self.out_dir, 'refgene_e20.vcf')
            # from here on the masked VCF is the annotator input
            self.vcf = vcf_mask.by_bed(self.vcf,
                                       coding_bed_fn,
                                       coding_vcf,
                                       logger=self.logger)

            announce('done. @ %s' % job_name)

        if not (lib_utils.check_if_file_valid(varant_vcf) and reuse):
            self.logger.info('annotating [%s,%s] ...' % (job_name, self.vcf))

            cmd = ["python", self.entries['varant'],
                   "-i", self.vcf,
                   "-o", varant_vcf,
                   "-l", self.log_dir]
            if self.capkit:
                cmd += ["-c", self.capkit, "-e", "180"]
            if self.hgmd > 0:
                cmd.append("--hgmd")

            self.run_cmd(cmd, job_name)

        self.vcf = varant_vcf

        # the masked intermediate is no longer needed once annotation is done
        if coding_vcf:
            os.unlink(coding_vcf)

        announce('done. [%s]' % job_name)
Exemplo n.º 2
0
def split_file(file2, ncpu, min_num_lines=100):
    """Split ``file2`` into up to ``ncpu`` line-based parts.

    The number of lines per part is ``round(N / ncpu) + 1`` where ``N`` comes
    from ``lib_utils.count_num_lines``. If that is not larger than
    ``min_num_lines`` the file is only copied to ``<dir>/<base>_parts_00``.
    Otherwise a ``split -d -l`` command is handed to
    ``lib_utils.gen_msg_time`` (presumably it also executes the command --
    confirm) and the parts ``<dir>/<base>_partsNN`` that pass
    ``lib_utils.check_if_file_valid`` are collected.

    Returns the list of part file names.
    """
    print('splitting [%s] into %d pieces' % (file2, ncpu))
    parent_dir, _fname, base_name, _ext = lib_utils.separateDirFn2(file2)
    out_prefix = '%s/%s_parts' % (parent_dir, base_name)

    # lines per part
    total_lines = lib_utils.count_num_lines(file2)
    lines_per_part = round(1. * total_lines / ncpu) + 1

    if lines_per_part <= min_num_lines:
        # too small to be worth splitting: a single copy is the only part
        single_part = '%s_00' % out_prefix
        print('the input file size is too small to split.')
        shutil.copyfile(file2, single_part)
        return [single_part]

    cmd = 'split -d -l %d %s %s' % (lines_per_part, file2, out_prefix)
    lib_utils.gen_msg_time('cmd', cmd, 'split_file')  #debug

    part_fns = []
    for idx in range(ncpu):
        candidate = '%s%02d' % (out_prefix, idx)
        if not lib_utils.check_if_file_valid(candidate):
            lib_utils.gen_msg_time('error',
                                   'file[%s] does not exist' % candidate,
                                   'split_file')
            continue
        part_fns.append(candidate)
    print('done.')
    return part_fns
Exemplo n.º 3
0
    def __init__(self, uargs):
        """Copy user options onto the instance and run the Divine set-up.

        uargs: parsed command-line arguments; the attributes exp_tag, vknown,
            cadd, hgmd, indel_mode, seed_rate and ref_exon_only are read here,
            and the whole object is passed on to self._set_args.

        Raises EnvironmentError when the DIVINE environment variable is unset
        or empty, and IOError when <DIVINE>/gcn/config/divine.conf does not
        pass lib_utils.check_if_file_valid.
        """
        # options taken over unchanged from the command line
        self.exp_tag = uargs.exp_tag
        self.vknown = uargs.vknown
        self.cadd = uargs.cadd

        self.excl_non_coding = False
        self.sparser = SafeConfigParser()

        # score tables, empty at construction time
        self.pheno_dmg = {}
        self.genetic_dmg = {}
        self.gene_dmg = {}

        # file names and inputs that are not known yet
        self.hpo2disease_fn = self.pheno_dmg_fn = None
        self.hpo_query = None
        self.vcf = self.xls = None
        self.hgmd = uargs.hgmd

        lib_utils.msgout('notice', 'initializing Divine ...', 'Divine')

        # locate the installation through the DIVINE environment variable
        divine_root_dir = os.environ.get("DIVINE")
        if not divine_root_dir:
            raise EnvironmentError("set DIVINE variable properly!")

        config_fn = os.path.join(divine_root_dir, 'gcn', 'config',
                                 'divine.conf')
        config_ok = lib_utils.check_if_file_valid(config_fn)
        if not config_ok:
            raise IOError("check if the configuration file[%s] is valid!" %
                          config_fn)

        self.config_fn = config_fn
        self.entries = {'divine_root': divine_root_dir}
        self._set_args(uargs)

        # damage factor w.r.t the location of variant within the transcript
        # (self.logger is expected to have been set by _set_args)
        self.dm = damaging_model.DmgCoeff(uargs.indel_mode, uargs.seed_rate,
                                          self.logger)

        if uargs.ref_exon_only == 1:
            notice = 'VCF will be masked by RefGene coding region'
            lib_utils.msgout('notice', notice)
            self.logger.info(notice)

        self.ref_exon_only = uargs.ref_exon_only

        lib_utils.msgout('notice', 'done. initialization')
Exemplo n.º 4
0
	def _read_config(self,vcf_filter_cfg=None):
		'''
		objective: read the configuration file self.config_fn and register the
			program / config / database entries through self._set_config
		input: vcf_filter_cfg (optional); when given it must exist on disk and
			is stored as self.entries['vcf_filter_conf'] instead of the value
			from the configuration file
		output: self.entries
		raises: RuntimeError if vcf_filter_cfg does not exist; IOError if any
			value in self.entries fails lib_utils.check_if_file_valid
		'''
		job_name = '_read_config'
		msg = 'reading configuration file [%s;%s] ...'%(job_name,self.config_fn)
		lib_utils.msgout('notice',msg)
		self.logger.info(msg)

		self.sparser.read(self.config_fn)

		for program in ('varant', 'hposim', 'vcf2xls'):
			self._set_config('program_paths', program)

		self._set_config('config', 'temp_dir')

		if vcf_filter_cfg:
			# a user-supplied filter configuration overrides the configured one
			if not os.path.exists(vcf_filter_cfg):
				raise RuntimeError('check if the file [%s] is valid'%vcf_filter_cfg)
			self.entries['vcf_filter_conf'] = vcf_filter_cfg
		else:
			self._set_config('config', 'vcf_filter_conf')

		# note on 'esp_to_gene':
		# to access to UCSC mysql database(hg19)
		# select e2g.value, gtp.protein from ensGtp as gtp
		# inner join ensemblToGeneName as e2g on e2g.name=gtp.transcript;
		database_keys = (
			'ext_disease_to_gene',
			'disease_desc',
			'hpo_obo',
			'beta_fit',
			'string_link',
			'esp_to_gene',
			'kegg_hsa',
		)
		for db_key in database_keys:
			self._set_config('database', db_key)

		# check if the file or directory all exists before long journey!
		for entry_name, entry_path in self.entries.iteritems():
			if lib_utils.check_if_file_valid(entry_path):
				continue
			raise IOError('check [%s = %s] in the file [%s]' %\
									(entry_name, entry_path, self.config_fn))

		msg = 'done. [%s]' % job_name
		lib_utils.msgout('notice',msg)
		self.logger.info(msg)

		return self.entries
Exemplo n.º 5
0
	def annotate_comphet_inherit(self, reuse=False):
		'''
		objective: append inheritance-model annotation to self.vcf with genmod
		input: self.vcf and self.ped (nothing is done when self.ped is not set)
		output: self.vcf is replaced by the file name returned from
			lib_utils.file_tag2(self.vcf, 'genmod', '')
		reuse: when True and that output file already passes
			lib_utils.check_if_file_valid, the genmod run is skipped
		'''
		if not self.ped:
			return

		def announce(text):
			# progress goes to lib_utils.msgout and to the logger
			lib_utils.msgout('notice', text)
			self.logger.info(text)

		announce("for a multi-sample VCF containing parent genotypes, append inheritance model")

		# to build database for transcript exon regions if not exist
		refgene_tx_fn = fileconfig.FILECONFIG['REFGENE']
		refgene_dir = os.path.dirname(os.path.abspath(refgene_tx_fn))
		genmod_db_dir = os.path.join(refgene_dir, 'genmod_db')
		if not os.path.exists(genmod_db_dir):
			os.makedirs(genmod_db_dir)

		db_complete = all(
			os.path.exists(os.path.join(genmod_db_dir, db_fn))
			for db_fn in ('genes.db', 'exons.db'))
		if not db_complete:
			build_cmd = ["genmod", "build",
									 "-t", "gene_pred",
									 "--splice_padding", "2",
									 "-o", genmod_db_dir,
									 refgene_tx_fn]
			lib_utils.runcmd2(build_cmd, self.log_dir, self.logger,
												"annotate_inheritance.genmod_build_db")

		vcf_genmod_out = lib_utils.file_tag2(self.vcf, 'genmod', '')

		# "genmod annotate" output is piped into "genmod models"; the "|" token
		# is passed to lib_utils.runcmd2 as a plain list element
		annotate_part = ["genmod", "annotate", "-r", self.vcf]
		models_part = ["genmod", "models", "-",
									 "--family_file", self.ped,
									 "-o", vcf_genmod_out]
		cmd = annotate_part + ["|"] + models_part

		job_name = "annotate_inheritance.genmod_model"
		announce("annotating inheritance model into VCF ...")

		if not (reuse and lib_utils.check_if_file_valid(vcf_genmod_out)):
			lib_utils.runcmd2(cmd, self.log_dir, self.logger, job_name)

		announce("Done.")
		self.vcf = vcf_genmod_out
0
    def create_bed(self, ext_bp=0, reuse=False):
        """Write the exon intervals of self.refGene_fn to a BED file.

        Every exon (start, end) pair of every record is widened by ``ext_bp``
        on both sides and written to a temporary BED in self.work_dir, which
        is then handed to self.collapse_bed and removed.

        ext_bp: number of bases added to each side of an exon.
        reuse: when True and the target BED already passes
            lib_utils.check_if_file_valid, it is returned without rebuilding.

        Returns self.bed_fn (<work_dir>/refGene_e<ext_bp>_so_merged.bed).
        """
        job_name = 'RefGeneUcscTB.create_bed'

        self.bed_fn = os.path.join(self.work_dir,'refGene_e%d_so_merged.bed'%ext_bp)

        msg = 'creating a bed file[%s] containing RefGene coding region (cmpl/incmpl/unk) @ %s'%(self.bed_fn,job_name)
        lib_utils.msgout('notice',msg)
        if self.logger: self.logger.info(msg)

        if reuse and lib_utils.check_if_file_valid(self.bed_fn):
            msg = 'reuse bed file [%s] generated previously @ %s'%(self.bed_fn,job_name)
            lib_utils.msgout('notice',msg)
            if self.logger: self.logger.info(msg)
            return self.bed_fn

        # intermediate, unmerged exon list
        tmp_bed = os.path.join(self.work_dir,'refGene_e%d.bed'%ext_bp)

        # context managers close both files even if a malformed record raises
        with open(self.refGene_fn,'r') as fp, open(tmp_bed,'w') as fp2:
            for line in fp:
                if not line.strip():
                    continue  # tolerate blank lines instead of raising IndexError
                cols = line.rstrip().split('\t')
                chrom = cols[2]
                # columns 9 and 10 are comma-terminated lists, hence the [:-1]
                # (presumably exonStarts/exonEnds of the UCSC refGene layout)
                starts = cols[9].split(',')[:-1]
                ends = cols[10].split(',')[:-1]
                for e1,e2 in zip(starts,ends):
                    # BED start coordinates must not be negative
                    e1_ext = max(0,int(e1)-ext_bp)
                    e2_ext = int(e2)+ext_bp
                    fp2.write('%s\t%d\t%d\t%s;%s\n'%(chrom,e1_ext,e2_ext,cols[12],cols[1]))

        self.collapse_bed(tmp_bed,job_name,ext_bp)
        os.unlink(tmp_bed)

        return self.bed_fn
Exemplo n.º 7
0
def gcn_path(entry, section='config'):
    """Return the path configured for ``entry`` in the Divine config file.

    The file is ``<DIVINE>/gcn/config/divine.conf`` where ``DIVINE`` is an
    environment variable. A value that is not absolute is joined onto the
    ``DIVINE`` directory.

    entry: option name to look up.
    section: section of the configuration file holding the option.

    Returns the path, or None (after printing a warning) when the parser
    cannot provide the option.

    Raises EnvironmentError when ``DIVINE`` is unset or empty, and IOError
    when the configuration file fails ``lib_utils.check_if_file_valid``.
    """
    try:  # Python 2 module name
        from ConfigParser import Error as ConfigParserError
    except ImportError:  # the module was renamed in Python 3
        from configparser import Error as ConfigParserError

    divine_root_dir = os.environ.get('DIVINE')
    if not divine_root_dir:
        raise EnvironmentError("set DIVINE variable properly!")

    config_fn = os.path.join(divine_root_dir, 'gcn', 'config', 'divine.conf')
    if not lib_utils.check_if_file_valid(config_fn):
        raise IOError("check if the configuration file[%s] is valid!" %
                      config_fn)

    sparser = SafeConfigParser()
    sparser.read(config_fn)

    # Only parser errors (missing section/option, bad interpolation) mean
    # "no usable entry"; anything else, e.g. KeyboardInterrupt, must propagate.
    try:
        path2 = sparser.get(section, entry)
    except ConfigParserError:
        print('WARNING:The config file [%s] does not contain an entry [%s] in the section [%s]' % (
            config_fn, entry, section))
        return None

    if not os.path.isabs(path2):
        path2 = os.path.join(divine_root_dir, path2)
    return path2
Exemplo n.º 8
0
    def _set_args(self, uargs):
        '''
        -objective: check the input parameters, prepare the output and log
            directories, configure logging, read the configuration file and
            record the user command line
        -input: uargs (args from main()); uargs.out_dir is filled in / stripped
            of one trailing '/' in place
        -output: sets self.hpo_query, self.vcf (only when given), self.capkit,
            self.out_dir, self.rank_fn, self.log_dir and self.logger
        -raises: RuntimeError when neither input file is given or the capture
            kit symbol is unknown; IOError when a given input file fails
            lib_utils.check_if_file_valid
        '''
        job_name = '_set_args'
        lib_utils.msgout('notice', 'storing input condition ...', job_name)

        if not (uargs.hpo_query_fn or uargs.vcf):
            raise RuntimeError(
                'either VCF (-v) or query phenotype (-q) file should be provided!'
            )

        # check sanity of the input files
        if uargs.hpo_query_fn:
            if not lib_utils.check_if_file_valid(uargs.hpo_query_fn):
                raise IOError('check if [%s] is valid' % uargs.hpo_query_fn)
            self.hpo_query = uargs.hpo_query_fn

        if uargs.vcf:
            if not lib_utils.check_if_file_valid(uargs.vcf):
                raise IOError('check if [%s] is valid' % uargs.vcf)
            self.vcf = uargs.vcf

        # only these capture kit symbols are accepted
        if uargs.capkit not in ('SureSelect_V6', 'SeqCapEZ_Exome'):
            raise RuntimeError("revise capture kit symbol[%s]" % uargs.capkit)
        self.capkit = uargs.capkit

        # default output directory: 'divine' next to the VCF, else next to
        # the phenotype query file
        if uargs.out_dir is None:
            anchor_fn = self.vcf if self.vcf else self.hpo_query
            uargs.out_dir = os.path.join(os.path.dirname(anchor_fn), 'divine')

        # drop a single trailing slash, then create the directory
        if uargs.out_dir.endswith('/'):
            uargs.out_dir = uargs.out_dir[:-1]
        self.out_dir = uargs.out_dir
        lib_utils.ensure_dir(self.out_dir)

        # output file name
        self.rank_fn = self._assign_out_fn('rank', 'tsv')

        self.log_dir = os.path.join(self.out_dir, 'logs')
        lib_utils.ensure_dir(self.log_dir)

        msg = 'prepared log directory[%s]  ...' % self.log_dir
        lib_utils.msgout('notice', msg, job_name)

        # logging handler: one time-stamped log file per run
        started = datetime.datetime.fromtimestamp(time.time())
        ts = started.strftime('%Y%m%d_%H%M%S')
        log_fn = os.path.join(self.log_dir, 'divine_%s.log' % ts)
        log_format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
        logging.basicConfig(filename=log_fn,
                            filemode="w",
                            level=logging.DEBUG,
                            format=log_format)

        self.logger = logging.getLogger('divine')
        self.logger.info(msg)

        # configuration file with 3rd party s/w paths and database locations
        self._read_config(uargs.vcf_filter_cfg)

        # record user command line
        self.record_commandline()
        msg = 'Divine initialization completed [%s]' % job_name
        lib_utils.msgout('notice', msg)
        self.logger.info(msg)
Exemplo n.º 9
0
	def __init__(self, uargs):
		'''
		objective: transfer user input arguments to class member variables and
			run the Divine set-up
		input: uargs (parsed command-line arguments); passed on to
			self._set_args, which is expected to provide self.out_dir and
			self.logger used below
		raises: EnvironmentError if the DIVINE environment variable is unset or
			empty; IOError if <DIVINE>/gcn/config/divine.conf fails
			lib_utils.check_if_file_valid; RuntimeError if a ped file is given
			without a proband ID, or if a VCF without ped file holds more than
			one sample
		'''
		self.to_delete_fns = []
		self.exp_tag = uargs.exp_tag
		self.vknown = uargs.vknown
		self.cadd = uargs.cadd
		self.top_k_disease = uargs.top_k_disease

		self.excl_non_coding = False
		self.sparser = SafeConfigParser()

		self.omim = None

		self.pheno_dmg = {}
		self.gt_dmg = {}
		self.gene_dmg = {}
		self.vknown_genes = {}

		lib_utils.msgout('notice','initializing Divine ...','Divine')

		divine_root_dir = os.environ.get("DIVINE")
		if not divine_root_dir:
			raise EnvironmentError("set DIVINE variable properly!")

		config_fn = os.path.join(divine_root_dir,'gcn','config','divine.conf')

		if not lib_utils.check_if_file_valid(config_fn):
			raise IOError("check if the configuration file[%s] is valid!" % config_fn)

		self.config_fn = config_fn
		self.entries = {'divine_root':divine_root_dir}
		self._set_args(uargs)

		# phenotype-driven outputs only exist when an HPO query was given
		self.hpo_query = uargs.hpo_query
		if self.hpo_query is None:
			self.hpo2disease_fn = None
			self.pheno_dmg_fn = None
			self.disease_rank_fn = None
		else:
			self.hpo2disease_fn = self._assign_out_fn('hpo_to_diseases','tsv')
			self.pheno_dmg_fn = self._assign_out_fn('pheno_gene_rank','tsv')
			self.disease_rank_fn = self._assign_out_fn('diseases_rank','tsv')

		self.gene_rank_fn = self._assign_out_fn('gene_rank', 'tsv')
		self.vcf = uargs.vcf
		self.ped = None
		self.proband_id = None
		self.genotype = True
		# always defined, so that it can be read even when no VCF is given
		self.is_family_vcf = False

		if self.vcf:
			if uargs.ped:
				self.is_family_vcf = True
				if not uargs.proband_id:
					msg = "A family file [%s] was provided but you didn't provide a proband ID to examine. Specify the probrand ID available in the VCF [%s] using an option -p."\
						%(uargs.ped,self.vcf)
					print(msg)
					raise RuntimeError(msg)

				# the returned value is not used here; the call is kept for its
				# check (presumably it raises on a ped/VCF mismatch -- confirm)
				lib_ped.check_consistency_ped_vcf(\
					self.vcf,uargs.ped,uargs.proband_id)
				self.ped = uargs.ped
				self.proband_id = uargs.proband_id
			else:
				#get sample_ids contained into VCF file
				v = vcf.VCFParser(self.vcf)
				n_samples = len(v.samples)
				if n_samples > 1:
					# without a ped file the proband cannot be picked automatically
					raise RuntimeError('VCF file [%s] contains more than one sample. Let me know which sample is a proband to diagnose!'%self.vcf)
				elif n_samples == 1:
					#search sample_id and create a temp ped for the proband
					self.ped = os.path.join(self.out_dir,'proband_tmp.ped')
					self.proband_id = lib_ped.create_proband_ped(self.vcf,self.ped)
					self.to_delete_fns.append(self.ped)
				else:
					# no sample column at all
					self.genotype = False

		self.xls = None
		self.hgmd = uargs.hgmd
		self.cosmic = uargs.cosmic
		self.dblink = uargs.dblink

		# damage factor w.r.t the location of variant within the transcript
		self.dm = damaging_model.DmgCoeff(\
			uargs.indel_fidel,uargs.go_seed_k,self.logger)

		if uargs.ref_exon_only==1:
			msg = 'VCF is going to be masked by RefGene coding region'
			lib_utils.msgout('notice',msg)
			self.logger.info(msg)

		self.ref_exon_only = uargs.ref_exon_only

		lib_utils.msgout('notice','done. initialization')
Exemplo n.º 10
0
    def create_bed(self, ext_bp=0, reuse=False):
        """Build a sorted, merged BED file of the exons in self.refGene_fn.

        Steps: (1) write every exon interval, widened by ``ext_bp`` on both
        sides, to a temporary BED; (2) sort it with
        ``lib_utils.sort_tsv_by_col2``; (3) merge intervals of the same
        chromosome that overlap or touch, joining their annotations with '|';
        (4) delete the two temporary files.

        ext_bp: number of bases added to each side of an exon.
        reuse: when True and the target BED already passes
            ``lib_utils.check_if_file_valid``, it is returned without
            rebuilding.

        Returns self.bed_fn (<work_dir>/refGene_e<ext_bp>_so_merged.bed).

        Raises RuntimeError when the sorted exon list is empty.
        """
        job_name = 'RefGeneUcscTB.create_bed'

        def announce(text):
            lib_utils.msgout('notice', text)
            if self.logger: self.logger.info(text)

        self.bed_fn = os.path.join(self.work_dir,
                                   'refGene_e%d_so_merged.bed' % ext_bp)

        announce('creating a bed file[%s] containing RefGene coding region (cmpl/incmpl/unk) @ %s' % (
            self.bed_fn, job_name))

        if reuse and lib_utils.check_if_file_valid(self.bed_fn):
            announce('reuse bed file [%s] generated previously @ %s' % (
                self.bed_fn, job_name))
            return self.bed_fn

        # step 1: unsorted exon list
        tmp_bed = os.path.join(self.work_dir, 'refGene_e%d.bed' % ext_bp)

        # context managers close both files even if a malformed record raises
        with open(self.refGene_fn, 'r') as fp, open(tmp_bed, 'w') as fp2:
            for line in fp:
                if not line.strip():
                    continue  # tolerate blank lines instead of raising IndexError
                cols = line.rstrip().split('\t')
                chrom = cols[2]
                # columns 9 and 10 are comma-terminated lists, hence the [:-1]
                # (presumably exonStarts/exonEnds of the UCSC refGene layout)
                starts = cols[9].split(',')[:-1]
                ends = cols[10].split(',')[:-1]
                for e1, e2 in zip(starts, ends):
                    # BED start coordinates must not be negative
                    e1_ext = max(0, int(e1) - ext_bp)
                    e2_ext = int(e2) + ext_bp
                    fp2.write('%s\t%d\t%d\t%s;%s\n' %
                              (chrom, e1_ext, e2_ext, cols[12], cols[1]))

        announce('sorting bed file ... @ %s' % job_name)

        # step 2: sort by chromosome, start, end
        tmp_so_bed = os.path.join(self.work_dir, 'refGene_e%d_so.bed' % ext_bp)
        lib_utils.sort_tsv_by_col2(tmp_bed, [1, 2, 3], ['V', 'n', 'n'], True,
                                   tmp_so_bed)

        announce('merging exon coordinates overlapped each other... @ %s' % job_name)

        # step 3: single pass over the sorted intervals; the pending interval
        # (chromp, e1p, e2p, annotp) is seeded from the first record and each
        # record is consumed exactly once
        with open(tmp_so_bed, 'r') as fp:
            first = next(fp, None)
            if first is None:
                raise RuntimeError('no exon interval to merge in [%s] @ %s' %
                                   (tmp_so_bed, job_name))
            chromp, e1p, e2p, annotp = first.rstrip().split('\t')
            e1p = int(e1p)
            e2p = int(e2p)

            with open(self.bed_fn, 'w') as fp2:
                for line in fp:
                    chrom, e1, e2, annot = line.rstrip().split('\t')
                    e1 = int(e1)
                    e2 = int(e2)
                    if chrom != chromp or e2p < e1:
                        # no overlap with the pending interval: flush it
                        fp2.write('%s\t%d\t%d\t%s\n' % (chromp, e1p, e2p, annotp))
                        chromp, e1p, e2p, annotp = chrom, e1, e2, annot
                    elif e2p < e2:
                        # overlap reaching further right: extend and annotate
                        e2p = e2
                        annotp += '|%s' % annot
                # flush the last pending interval
                fp2.write('%s\t%d\t%d\t%s\n' % (chromp, e1p, e2p, annotp))

        # step 4: remove intermediates
        os.unlink(tmp_bed)
        os.unlink(tmp_so_bed)

        announce('done. @ %s' % job_name)

        return self.bed_fn
Exemplo n.º 11
0
def main():
    """Command-line driver computing GO similarity with fastsemsim.

    With ``--prep`` the query pair file (``-q``) is generated through
    ``lib_geneontology.Fastsemsim.gen_all_pairs_file``, split with
    ``split_file`` and the program exits; it refuses to run (exit status 1)
    when the pair file already passes ``lib_utils.check_if_file_valid``.

    Without ``--prep`` the pair file is split into up to ``-n`` parts,
    ``par_go_sim`` is run on every part (directly for a single part, through
    a work queue and ``run_par_defs_wait`` otherwise), the per-part results
    are combined by ``collect_outputs`` and the part files are removed.
    """
    parser = argparse.ArgumentParser(
        description=
        "build GO using fastsemsim [[email protected]]")
    parser.add_argument('--version', action='version', version='%(prog)s 0.2')
    parser.add_argument('-b',
                        action='store',
                        dest='fastsemsim_bin',
                        required=True,
                        help='')
    parser.add_argument('-G',
                        dest='resource_dir',
                        required=True,
                        help='gene ontology resource dir')
    parser.add_argument('-s',
                        action='store',
                        dest='sim_method',
                        required=False,
                        default='SimRel',
                        help='')
    parser.add_argument('-S',
                        action='store',
                        dest='method_id',
                        required=False,
                        default=1,
                        type=int,
                        help='assign an integer number for sim_method[1]')

    parser.add_argument('--prep',
                        action='store_true',
                        dest='prepare_input',
                        required=False,
                        help='prepare pairs of uniprot query files[False]')

    parser.add_argument('-q',
                        action='store',
                        dest='upair_fn',
                        required=True,
                        help='')
    parser.add_argument('-o',
                        action='store',
                        dest='out_prefix',
                        required=True,
                        help='')
    parser.add_argument('-n',
                        action='store',
                        dest='ncpu',
                        required=False,
                        type=int,
                        default=1,
                        help='specify the number of cpus to utilize')

    args = parser.parse_args()

    # split_file divides by ncpu, so reject non-positive values up front
    if args.ncpu < 1:
        parser.error('-n must be a positive integer')

    workD = '%s_work' % args.out_prefix
    if not os.path.exists(workD):
        os.makedirs(workD)

    if args.prepare_input:
        cGO = lib_geneontology.Fastsemsim(args.resource_dir, workD,
                                          args.fastsemsim_bin, False)
        if lib_utils.check_if_file_valid(args.upair_fn):
            # sys.exit(str) writes the text to stderr and exits with status 1
            sys.exit('the query pair file [%s] is already present; '
                     'remove it before running with --prep' % args.upair_fn)
        cGO.gen_all_pairs_file(args.upair_fn)
        upair_fns = split_file(args.upair_fn, args.ncpu)
        print('check %s' % upair_fns)
        print('then, run this program again to cal go sim on the splitted input files ...')
        sys.exit(0)

    # one job per part actually produced by the split
    upair_fns = split_file(args.upair_fn, args.ncpu)
    ncpu = len(upair_fns)

    work_queue = Queue() if ncpu > 1 else None

    #start for loop to dispatch jobs //
    out_pyvs = []
    for upair_fn in upair_fns:
        print('dispatching fastsemsim on [%s] ...' % upair_fn)
        out_pyv = '%s_out.pyv' % upair_fn
        # the parser defines no 'gosim_dir'; the fastsemsim location given
        # with -b is stored as 'fastsemsim_bin'
        input_vec = [
            args.fastsemsim_bin, args.resource_dir, workD, upair_fn,
            args.sim_method, out_pyv, True
        ]
        if ncpu == 1:
            par_go_sim(input_vec)
        else:
            work_queue.put(input_vec)
        out_pyvs.append(out_pyv)
        print('done')

    print('job dispatch done. now wait...')

    if ncpu > 1:
        run_par_defs_wait(par_go_sim, work_queue, ncpu)

    #collect outputs via pyv
    collect_outputs(args.out_prefix, args.sim_method, args.method_id,
                    out_pyvs)

    print('cleanup work_dir files...')
    lib_utils.unlink_fns(upair_fns)
    lib_utils.unlink_fns(out_pyvs)
    #shutil.rmtree(workD)
    print('done.')