def vannotate(self, reuse=False):
    '''
    objective: run varant (GCN) annotator
    input: self.vcf
    output: annotated vcf; self.vcf is re-pointed to <out_dir>/divine.vcf

    reuse: when True and lib_utils.check_if_file_valid() accepts an existing
        divine.vcf, both the coding-region masking and the varant run are
        skipped and the existing file is used as is.
    '''
    job_name = 'vannotate'

    def _notify(text):
        # progress messages go to both the console and the log file
        lib_utils.msgout('notice', text)
        self.logger.info(text)

    def _needs_run():
        # work is redone unless a valid previous result may be reused
        return not (lib_utils.check_if_file_valid(varant_vcf) and reuse)

    _notify('annotating VCF file[%s;%s] ...' % (job_name, self.vcf))

    # prepare output file
    varant_vcf = os.path.join(self.out_dir, 'divine.vcf')

    # if necessary, masking the raw vcf file
    coding_vcf = None
    if self.ref_exon_only > 0 and _needs_run():
        region_builder = annotateRegion.RefGeneUcscTB(work_dir=self.out_dir,
                                                      logger=self.logger)
        coding_bed_fn = region_builder.create_bed(ext_bp=20, reuse=False)

        _notify('extracting variants in coding region from [%s] @ %s ...' %
                (self.vcf, job_name))

        coding_vcf = os.path.join(self.out_dir, 'refgene_e20.vcf')
        # from here on the masked VCF is the annotator's input
        self.vcf = vcf_mask.by_bed(self.vcf, coding_bed_fn, coding_vcf,
                                   logger=self.logger)
        _notify('done. @ %s' % job_name)

    if _needs_run():
        self.logger.info('annotating [%s,%s] ...' % (job_name, self.vcf))

        cmd = ["python", self.entries['varant']]
        cmd += ["-i", self.vcf]
        cmd += ["-o", varant_vcf]
        cmd += ["-l", self.log_dir]
        if self.capkit:
            cmd += ["-c", self.capkit, "-e", "180"]
        if self.hgmd > 0:
            cmd.append("--hgmd")

        self.run_cmd(cmd, job_name)

    self.vcf = varant_vcf

    # the masked intermediate VCF is only needed as input to the annotator
    if coding_vcf:
        os.unlink(coding_vcf)

    _notify('done. [%s]' % job_name)
def split_file(file2, ncpu, min_num_lines=100):
    """Split file2 into at most ncpu line-based parts and return their paths.

    The part size is round(N / ncpu) + 1 lines, N being the line count
    reported by lib_utils.count_num_lines(). When that size does not exceed
    min_num_lines the file is copied to '<dir>/<base>_parts_00' and that
    single path is returned. Otherwise a 'split -d -l' command line is handed
    to lib_utils.gen_msg_time('cmd', ...) -- presumably that helper executes
    it; confirm in lib_utils -- and every expected part
    '<dir>/<base>_partsNN' that passes lib_utils.check_if_file_valid() is
    collected; a missing part is reported through gen_msg_time('error', ...).
    """
    print('splitting [%s] into %d pieces' % (file2, ncpu))

    parent_dir, _fname, stem, _ext = lib_utils.separateDirFn2(file2)
    out_prefix = '%s/%s_parts' % (parent_dir, stem)

    # compute the number of lines per a splitted file
    total_lines = lib_utils.count_num_lines(file2)
    lines_per_part = round((1. * total_lines) / ncpu) + 1

    if lines_per_part <= min_num_lines:
        single_part = '%s_00' % out_prefix
        print('the input file size is too small to split.')
        shutil.copyfile(file2, single_part)
        return [single_part]

    split_cmd = 'split -d -l %d %s %s' % (lines_per_part, file2, out_prefix)
    lib_utils.gen_msg_time('cmd', split_cmd, 'split_file')  # debug

    upair_fns = []
    for idx in range(ncpu):
        part_fn = '%s%02d' % (out_prefix, idx)
        if not lib_utils.check_if_file_valid(part_fn):
            lib_utils.gen_msg_time('error',
                                   'file[%s] does not exist' % part_fn,
                                   'split_file')
            continue
        upair_fns.append(part_fn)

    print('done.')
    return upair_fns
def __init__(self, uargs):
    """Set up a Divine run from the parsed command-line arguments.

    uargs: argument namespace; the attributes read here are exp_tag, vknown,
        cadd, hgmd, indel_mode, seed_rate and ref_exon_only (further ones are
        consumed by self._set_args()).

    Raises EnvironmentError when the DIVINE environment variable is not set
    and IOError when $DIVINE/gcn/config/divine.conf is rejected by
    lib_utils.check_if_file_valid().
    """
    # transferring user input arguments to class member variables
    self.exp_tag = uargs.exp_tag
    self.vknown = uargs.vknown
    self.cadd = uargs.cadd
    self.excl_non_coding = False
    self.sparser = SafeConfigParser()

    # damage score tables, filled in later
    self.pheno_dmg = dict()
    self.genetic_dmg = dict()
    self.gene_dmg = dict()

    # file names/paths; hpo_query and vcf may be overwritten by _set_args()
    self.hpo2disease_fn = None
    self.pheno_dmg_fn = None
    self.hpo_query = None
    self.vcf = None
    self.xls = None
    self.hgmd = uargs.hgmd

    lib_utils.msgout('notice', 'initializing Divine ...', 'Divine')

    divine_root_dir = os.environ.get("DIVINE")
    if not divine_root_dir:
        raise EnvironmentError("set DIVINE variable properly!")

    config_fn = os.path.join(divine_root_dir, 'gcn', 'config', 'divine.conf')
    if not lib_utils.check_if_file_valid(config_fn):
        raise IOError("check if the configuration file[%s] is valid!" %
                      config_fn)

    self.config_fn = config_fn
    self.entries = {'divine_root': divine_root_dir}

    # NOTE(review): self.logger is used below, so _set_args() is expected to
    # have created it -- confirm against _set_args().
    self._set_args(uargs)

    # damage factor w.r.t the location of variant within the transcript
    self.dm = damaging_model.DmgCoeff(uargs.indel_mode, uargs.seed_rate,
                                      self.logger)

    mask_by_refgene = (uargs.ref_exon_only == 1)
    if mask_by_refgene:
        notice = 'VCF will be masked by RefGene coding region'
        lib_utils.msgout('notice', notice)
        self.logger.info(notice)

    self.ref_exon_only = uargs.ref_exon_only
    lib_utils.msgout('notice', 'done. initialization')
def _read_config(self, vcf_filter_cfg=None):
    '''
    objective: read configuration file

    vcf_filter_cfg: optional path to a VCF filter configuration. When given
        it must exist (RuntimeError otherwise) and is stored as
        entries['vcf_filter_conf'] instead of the value from the config file.

    returns self.entries after every value in it passed
    lib_utils.check_if_file_valid(); raises IOError on the first one that
    does not.
    '''
    job_name = '_read_config'

    def _notify(text):
        # progress messages go to both the console and the log file
        lib_utils.msgout('notice', text)
        self.logger.info(text)

    _notify('reading configuration file [%s;%s] ...' %
            (job_name, self.config_fn))

    self.sparser.read(self.config_fn)

    for program in ('varant', 'hposim', 'vcf2xls'):
        self._set_config('program_paths', program)

    self._set_config('config', 'temp_dir')

    if vcf_filter_cfg:
        # a user-supplied filter configuration overrides the configured one
        if not os.path.exists(vcf_filter_cfg):
            raise RuntimeError('check if the file [%s] is valid'%vcf_filter_cfg)
        self.entries['vcf_filter_conf'] = vcf_filter_cfg
    else:
        self._set_config('config', 'vcf_filter_conf')

    for resource in ('ext_disease_to_gene', 'disease_desc', 'hpo_obo',
                     'beta_fit', 'string_link'):
        self._set_config('database', resource)

    # to access to UCSC mysql database(hg19)
    # select e2g.value, gtp.protein from ensGtp as gtp
    # inner join ensemblToGeneName as e2g on e2g.name=gtp.transcript;
    for resource in ('esp_to_gene', 'kegg_hsa'):
        self._set_config('database', resource)

    # check if the file or directory all exists before long journey!
    for key, path2 in self.entries.items():
        if lib_utils.check_if_file_valid(path2):
            continue
        raise IOError('check [%s = %s] in the file [%s]' %\
                      (key, path2, self.config_fn))

    _notify('done. [%s]' % job_name)

    return self.entries
def annotate_comphet_inherit(self, reuse=False):
    """Append inheritance-model annotation to self.vcf using genmod.

    Does nothing when self.ped is not set. Otherwise a genmod database is
    built next to the RefGene file if 'genes.db' or 'exons.db' is missing,
    then 'genmod annotate | genmod models' is run (through
    lib_utils.runcmd2) and self.vcf is re-pointed to the produced file.

    reuse: when True and the genmod output already passes
        lib_utils.check_if_file_valid(), the genmod run is skipped.
    """
    if not self.ped:
        return

    def _notify(text):
        # progress messages go to both the console and the log file
        lib_utils.msgout('notice', text)
        self.logger.info(text)

    _notify("for a multi-sample VCF containing parent genotypes, append inheritance model")

    # to build database for transcript exon regions if not exist
    refgene_tx_fn = fileconfig.FILECONFIG['REFGENE']
    genmod_db_dir = os.path.join(
        os.path.dirname(os.path.abspath(refgene_tx_fn)), 'genmod_db')

    if not os.path.exists(genmod_db_dir):
        os.makedirs(genmod_db_dir)

    genes_db = os.path.join(genmod_db_dir, 'genes.db')
    exons_db = os.path.join(genmod_db_dir, 'exons.db')
    if not (os.path.exists(genes_db) and os.path.exists(exons_db)):
        build_cmd = ["genmod", "build",
                     "-t", "gene_pred",
                     "--splice_padding", "2",
                     "-o", genmod_db_dir,
                     refgene_tx_fn]
        lib_utils.runcmd2(build_cmd, self.log_dir, self.logger,
                          "annotate_inheritance.genmod_build_db")

    vcf_genmod_out = lib_utils.file_tag2(self.vcf, 'genmod', '')

    # NOTE(review): the "|" element implies runcmd2 runs this through a
    # shell -- confirm in lib_utils.
    model_cmd = ["genmod", "annotate", "-r", self.vcf,
                 "|",
                 "genmod", "models", "-",
                 "--family_file", self.ped,
                 "-o", vcf_genmod_out]

    _notify("annotating inheritance model into VCF ...")

    if not (reuse and lib_utils.check_if_file_valid(vcf_genmod_out)):
        lib_utils.runcmd2(model_cmd, self.log_dir, self.logger,
                          "annotate_inheritance.genmod_model")

    _notify("Done.")

    self.vcf = vcf_genmod_out
def create_bed(self, ext_bp=0, reuse=False):
    """Create a BED file of RefGene exon regions padded by ext_bp bases.

    ext_bp: number of bases added on both sides of every exon.
    reuse: when True and the target BED already passes
        lib_utils.check_if_file_valid(), it is returned untouched.

    Returns self.bed_fn, i.e. <work_dir>/refGene_e<ext_bp>_so_merged.bed.
    The per-exon intermediate file is handed to self.collapse_bed(), which
    is expected to produce self.bed_fn (not visible here -- confirm), and is
    removed afterwards.
    """
    job_name = 'RefGeneUcscTB.create_bed'

    def _notify(text):
        # the logger is optional for this class
        lib_utils.msgout('notice', text)
        if self.logger:
            self.logger.info(text)

    self.bed_fn = os.path.join(self.work_dir,
                               'refGene_e%d_so_merged.bed' % ext_bp)

    _notify('creating a bed file[%s] containing RefGene coding region (cmpl/incmpl/unk) @ %s'
            % (self.bed_fn, job_name))

    if reuse and lib_utils.check_if_file_valid(self.bed_fn):
        _notify('reuse bed file [%s] generated previously @ %s'
                % (self.bed_fn, job_name))
        return self.bed_fn

    tmp_bed = os.path.join(self.work_dir, 'refGene_e%d.bed' % ext_bp)

    # context managers close both files even when a malformed row raises
    with open(self.refGene_fn, 'r') as fp:
        with open(tmp_bed, 'w') as fp2:
            for line in fp:
                cols = line.rstrip().split('\t')
                chrom = cols[2]
                # columns 9/10 are comma-terminated lists, hence the [:-1]
                starts = cols[9].split(',')[:-1]
                ends = cols[10].split(',')[:-1]
                for e1, e2 in zip(starts, ends):
                    # BED start coordinates must not be negative
                    e1_ext = max(0, int(e1) - ext_bp)
                    e2_ext = int(e2) + ext_bp
                    fp2.write('%s\t%d\t%d\t%s;%s\n'
                              % (chrom, e1_ext, e2_ext, cols[12], cols[1]))

    self.collapse_bed(tmp_bed, job_name, ext_bp)
    os.unlink(tmp_bed)

    return self.bed_fn
def gcn_path(entry, section='config'):
    """Look up a path entry in $DIVINE/gcn/config/divine.conf.

    entry: option name to read.
    section: config section holding the option (default 'config').

    Returns the configured path; a value not starting with '/' is resolved
    relative to the DIVINE root directory. Returns None, after printing a
    warning, when the section/option cannot be read from the config file.

    Raises EnvironmentError when the DIVINE environment variable is not set
    and IOError when the config file is rejected by
    lib_utils.check_if_file_valid().
    """
    # Resolve the parser's base exception on Python 2 or 3, so that only
    # configuration errors are treated as "entry not available".
    try:
        from ConfigParser import Error as _ConfigError
    except ImportError:
        from configparser import Error as _ConfigError

    divine_root_dir = os.environ.get('DIVINE')
    if not divine_root_dir:
        raise EnvironmentError("set DIVINE variable properly!")

    config_fn = os.path.join(divine_root_dir, 'gcn', 'config', 'divine.conf')
    if not lib_utils.check_if_file_valid(config_fn):
        raise IOError("check if the configuration file[%s] is valid!" %
                      config_fn)

    sparser = SafeConfigParser()
    sparser.read(config_fn)

    try:
        path2 = sparser.get(section, entry)
    except _ConfigError:
        # missing section/option (or bad interpolation): best effort, warn
        print('WARNING:The config file [%s] does not contain an entry [%s] in the section [%s]'
              % (config_fn, entry, section))
        return None

    if not path2.startswith('/'):
        path2 = os.path.join(divine_root_dir, path2)
    return path2
def _set_args(self, uargs):
    '''
    -objective: checking input parameters, reading config, and storing user command line
    -input: uargs (args from main())
    -output: class initialization (self.hpo_query, self.vcf, self.capkit,
        self.out_dir, self.rank_fn, self.log_dir and self.logger are set)
    -raises: RuntimeError when neither a VCF nor a phenotype query file is
        given or the capture kit symbol is unknown; IOError when an input
        file is rejected by lib_utils.check_if_file_valid()
    '''
    job_name = '_set_args'
    lib_utils.msgout('notice', 'storing input condition ...', job_name)

    if not (uargs.hpo_query_fn or uargs.vcf):
        raise RuntimeError(
            'either VCF (-v) or query phenotype (-q) file should be provided!'
        )

    # check sanity of the input files
    if uargs.hpo_query_fn:
        if not lib_utils.check_if_file_valid(uargs.hpo_query_fn):
            raise IOError('check if [%s] is valid' % uargs.hpo_query_fn)
        self.hpo_query = uargs.hpo_query_fn

    if uargs.vcf:
        if not lib_utils.check_if_file_valid(uargs.vcf):
            raise IOError('check if [%s] is valid' % uargs.vcf)
        self.vcf = uargs.vcf

    # NOTE(review): the capture kit check is assumed to apply regardless of
    # whether a VCF was given -- confirm against the original layout.
    if uargs.capkit not in ['SureSelect_V6', 'SeqCapEZ_Exome']:
        raise RuntimeError("revise capture kit symbol[%s]" % uargs.capkit)
    self.capkit = uargs.capkit

    # check input condition
    if uargs.out_dir is None:
        # default: a 'divine' folder next to the VCF (or the query file)
        anchor_fn = self.vcf if self.vcf else self.hpo_query
        uargs.out_dir = os.path.join(os.path.dirname(anchor_fn), 'divine')

    # create the output directory user specifies
    if uargs.out_dir.endswith('/'):
        uargs.out_dir = uargs.out_dir[:-1]

    self.out_dir = uargs.out_dir
    lib_utils.ensure_dir(self.out_dir)

    # prepare output file name
    self.rank_fn = self._assign_out_fn('rank', 'tsv')

    self.log_dir = os.path.join(self.out_dir, 'logs')
    lib_utils.ensure_dir(self.log_dir)

    log_dir_msg = 'prepared log directory[%s] ...' % self.log_dir
    lib_utils.msgout('notice', log_dir_msg, job_name)

    # prepare logging handler; one time-stamped log file per run
    ts = datetime.datetime.fromtimestamp(
        time.time()).strftime('%Y%m%d_%H%M%S')
    log_fn = os.path.join(self.log_dir, 'divine_%s.log' % ts)
    FORMAT = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
    logging.basicConfig(filename=log_fn,
                        filemode="w",
                        level=logging.DEBUG,
                        format=FORMAT)

    # ------------------------
    self.logger = logging.getLogger('divine')
    # ------------------------
    self.logger.info(log_dir_msg)

    # read configuration file containing 3rd parties s/w path and database locations
    self._read_config(uargs.vcf_filter_cfg)

    # record user command line
    self.record_commandline()

    done_msg = 'Divine initialization completed [%s]' % job_name
    lib_utils.msgout('notice', done_msg)
    self.logger.info(done_msg)
def __init__(self, uargs):
    """Set up a Divine run from the parsed command-line arguments.

    uargs: argument namespace; the attributes read here are exp_tag, vknown,
        cadd, top_k_disease, hpo_query, vcf, ped, proband_id, hgmd, cosmic,
        dblink, indel_fidel, go_seed_k and ref_exon_only (further ones are
        consumed by self._set_args()).

    Raises:
        EnvironmentError: the DIVINE environment variable is not set.
        IOError: $DIVINE/gcn/config/divine.conf is rejected by
            lib_utils.check_if_file_valid().
        RuntimeError: a pedigree file is given without a proband ID, or a
            VCF without a pedigree file contains more than one sample.
    """
    # transferring user input arguments to class member variables
    self.to_delete_fns = []
    self.exp_tag = uargs.exp_tag
    self.vknown = uargs.vknown
    self.cadd = uargs.cadd
    self.top_k_disease = uargs.top_k_disease
    self.excl_non_coding = False
    self.sparser = SafeConfigParser()
    self.omim = None

    self.pheno_dmg = {}
    self.gt_dmg = {}
    self.gene_dmg = {}
    self.vknown_genes = {}

    lib_utils.msgout('notice', 'initializing Divine ...', 'Divine')

    divine_root_dir = os.environ.get("DIVINE")
    if not divine_root_dir:
        raise EnvironmentError("set DIVINE variable properly!")

    config_fn = os.path.join(divine_root_dir, 'gcn', 'config', 'divine.conf')
    if not lib_utils.check_if_file_valid(config_fn):
        raise IOError("check if the configuration file[%s] is valid!" %
                      config_fn)

    self.config_fn = config_fn
    self.entries = {'divine_root': divine_root_dir}

    # NOTE(review): self.out_dir and self.logger are used below, so
    # _set_args() is expected to set them -- confirm against _set_args().
    self._set_args(uargs)

    self.hpo_query = uargs.hpo_query
    if self.hpo_query is None:
        self.hpo2disease_fn = None
        self.pheno_dmg_fn = None
        self.disease_rank_fn = None
    else:
        self.hpo2disease_fn = self._assign_out_fn('hpo_to_diseases', 'tsv')
        self.pheno_dmg_fn = self._assign_out_fn('pheno_gene_rank', 'tsv')
        self.disease_rank_fn = self._assign_out_fn('diseases_rank', 'tsv')

    self.gene_rank_fn = self._assign_out_fn('gene_rank', 'tsv')
    self.vcf = uargs.vcf
    self.ped = None
    self.proband_id = None
    self.genotype = True
    # always defined, so later reads cannot fail when no VCF is given
    self.is_family_vcf = False

    if self.vcf:
        if uargs.ped:
            self.is_family_vcf = True
            if not uargs.proband_id:
                msg = "A family file [%s] was provided but you didn't provide a proband ID to examine. Specify the probrand ID available in the VCF [%s] using an option -p."\
                    % (uargs.ped, self.vcf)
                print(msg)
                raise RuntimeError(msg)
            # called for its validation; the returned value is not needed
            lib_ped.check_consistency_ped_vcf(
                self.vcf, uargs.ped, uargs.proband_id)
            self.ped = uargs.ped
            self.proband_id = uargs.proband_id
        else:
            # get sample_ids contained into VCF file
            v = vcf.VCFParser(self.vcf)
            n_samples = len(v.samples)
            if n_samples > 1:
                raise RuntimeError('VCF file [%s] contains more than one sample. Let me know which sample is a proband to diagnose!' % self.vcf)
            elif n_samples == 1:
                # search sample_id and create a temp ped for the proband
                self.ped = os.path.join(self.out_dir, 'proband_tmp.ped')
                self.proband_id = lib_ped.create_proband_ped(self.vcf,
                                                             self.ped)
                self.to_delete_fns.append(self.ped)
            else:
                # NOTE(review): assumed to belong to the sample-count check
                # (VCF without sample columns) -- confirm against the
                # original layout.
                self.genotype = False

    self.xls = None
    self.hgmd = uargs.hgmd
    self.cosmic = uargs.cosmic
    self.dblink = uargs.dblink

    # damage factor w.r.t the location of variant within the transcript
    self.dm = damaging_model.DmgCoeff(
        uargs.indel_fidel, uargs.go_seed_k, self.logger)

    if uargs.ref_exon_only == 1:
        msg = 'VCF is going to be masked by RefGene coding region'
        lib_utils.msgout('notice', msg)
        self.logger.info(msg)

    self.ref_exon_only = uargs.ref_exon_only
    lib_utils.msgout('notice', 'done. initialization')
def create_bed(self, ext_bp=0, reuse=False):
    """Create a sorted, merged BED file of RefGene exon regions.

    Every exon is padded by ext_bp bases on both sides, the intervals are
    sorted with lib_utils.sort_tsv_by_col2(), and intervals on the same
    chromosome that overlap or touch are merged; annotations of merged
    intervals are joined with '|'.

    ext_bp: number of bases added on both sides of every exon.
    reuse: when True and the target BED already passes
        lib_utils.check_if_file_valid(), it is returned untouched.

    Returns self.bed_fn, i.e. <work_dir>/refGene_e<ext_bp>_so_merged.bed.
    Both intermediate files are removed before returning.
    """
    job_name = 'RefGeneUcscTB.create_bed'

    def _notify(text):
        # the logger is optional for this class
        lib_utils.msgout('notice', text)
        if self.logger:
            self.logger.info(text)

    self.bed_fn = os.path.join(self.work_dir,
                               'refGene_e%d_so_merged.bed' % ext_bp)

    _notify('creating a bed file[%s] containing RefGene coding region (cmpl/incmpl/unk) @ %s'
            % (self.bed_fn, job_name))

    if reuse and lib_utils.check_if_file_valid(self.bed_fn):
        _notify('reuse bed file [%s] generated previously @ %s'
                % (self.bed_fn, job_name))
        return self.bed_fn

    # step 1: one padded interval per exon
    tmp_bed = os.path.join(self.work_dir, 'refGene_e%d.bed' % ext_bp)
    with open(self.refGene_fn, 'r') as fp:
        with open(tmp_bed, 'w') as fp2:
            for line in fp:
                cols = line.rstrip().split('\t')
                chrom = cols[2]
                # columns 9/10 are comma-terminated lists, hence the [:-1]
                starts = cols[9].split(',')[:-1]
                ends = cols[10].split(',')[:-1]
                for e1, e2 in zip(starts, ends):
                    # BED start coordinates must not be negative
                    e1_ext = max(0, int(e1) - ext_bp)
                    e2_ext = int(e2) + ext_bp
                    fp2.write('%s\t%d\t%d\t%s;%s\n'
                              % (chrom, e1_ext, e2_ext, cols[12], cols[1]))

    # step 2: sort
    _notify('sorting bed file ... @ %s' % job_name)
    tmp_so_bed = os.path.join(self.work_dir, 'refGene_e%d_so.bed' % ext_bp)
    lib_utils.sort_tsv_by_col2(tmp_bed, [1, 2, 3], ['V', 'n', 'n'], True,
                               tmp_so_bed)

    # step 3: merge boundaries if any overlapped
    _notify('merging exon coordinates overlapped each other... @ %s'
            % job_name)

    with open(tmp_so_bed, 'r') as fp:
        with open(self.bed_fn, 'w') as fp2:
            have_pending = False
            chromp = e1p = e2p = annotp = None
            for line in fp:
                chrom, e1, e2, annot = line.rstrip().split('\t')
                e1 = int(e1)
                e2 = int(e2)

                if not have_pending:
                    # The first record only seeds the pending interval. It
                    # must not be merged with itself, which would duplicate
                    # its annotation.
                    chromp, e1p, e2p, annotp = chrom, e1, e2, annot
                    have_pending = True
                    continue

                if chrom == chromp and not e2p < e1:
                    # overlapping or touching: extend the pending interval
                    if e2p < e2:
                        e2p = e2
                    annotp += '|%s' % annot
                else:
                    # gap or new chromosome: flush and start a new interval
                    fp2.write('%s\t%d\t%d\t%s\n'
                              % (chromp, e1p, e2p, annotp))
                    chromp, e1p, e2p, annotp = chrom, e1, e2, annot

            # an empty sorted file leaves nothing pending to flush
            if have_pending:
                fp2.write('%s\t%d\t%d\t%s\n' % (chromp, e1p, e2p, annotp))

    os.unlink(tmp_bed)
    os.unlink(tmp_so_bed)

    _notify('done. @ %s' % job_name)

    return self.bed_fn
def main():
    """Command-line entry point: compute GO similarity with fastsemsim.

    With --prep, the pair file named by -q is generated via
    lib_geneontology.Fastsemsim.gen_all_pairs_file(), split with
    split_file(), and the program exits (status 1, without generating
    anything, when the -q file already passes
    lib_utils.check_if_file_valid()).

    Without --prep, the -q file is split into at most -n parts,
    par_go_sim() is run on every part (directly for a single part, through
    run_par_defs_wait() otherwise), the per-part outputs are merged with
    collect_outputs() and the intermediate part/output files are removed.
    """
    parser = argparse.ArgumentParser(
        description="build GO using fastsemsim [[email protected]]")
    parser.add_argument('--version', action='version',
                        version='%(prog)s 0.2')
    parser.add_argument('-b', action='store', dest='fastsemsim_bin',
                        required=True, help='')
    parser.add_argument('-G', dest='resource_dir', required=True,
                        help='gene ontology resource dir')
    parser.add_argument('-s', action='store', dest='sim_method',
                        required=False, default='SimRel', help='')
    parser.add_argument('-S', action='store', dest='method_id',
                        required=False, default=1, type=int,
                        help='assign an integer number for sim_method[1]')
    parser.add_argument('--prep', action='store_const', dest='prepare_input',
                        required=False, default=False, const=True,
                        help='prepare pairs of uniprot query files[False]')
    parser.add_argument('-q', action='store', dest='upair_fn', required=True,
                        help='')
    parser.add_argument('-o', action='store', dest='out_prefix',
                        required=True, help='')
    parser.add_argument('-n', action='store', dest='ncpu', required=False,
                        type=int, default=1,
                        help='specify the number of cpus to utilize')

    args = parser.parse_args()

    workD = '%s_work' % args.out_prefix
    if not os.path.exists(workD):
        os.makedirs(workD)

    if args.prepare_input:
        cGO = lib_geneontology.Fastsemsim(args.resource_dir, workD,
                                          args.fastsemsim_bin, False)
        if lib_utils.check_if_file_valid(args.upair_fn):
            # sys.exit(<str>) prints the text to stderr and exits with
            # status 1, the same status as before.
            sys.exit('the pair file [%s] already exists; nothing is generated.'
                     % args.upair_fn)
        cGO.gen_all_pairs_file(args.upair_fn)
        upair_fns = split_file(args.upair_fn, args.ncpu)
        print('check %s' % (upair_fns,))
        print('then, run this program again to cal go sim on the splitted input files ...')
        sys.exit(0)

    # split
    upair_fns = split_file(args.upair_fn, args.ncpu)

    # the number of workers follows the number of parts actually produced
    ncpu = len(upair_fns)
    work_queue = Queue() if ncpu > 1 else None

    # start for loop to dispatch jobs
    out_pyvs = []
    for upair_fn in upair_fns:
        print('dispatching fastsemsim on [%s] ...' % upair_fn)
        out_pyv = '%s_out.pyv' % upair_fn
        # The parser defines no 'gosim_dir'; the fastsemsim location is
        # stored under 'fastsemsim_bin' (option -b).
        input_vec = [
            args.fastsemsim_bin, args.resource_dir, workD, upair_fn,
            args.sim_method, out_pyv, True
        ]
        if ncpu == 1:
            par_go_sim(input_vec)
        else:
            work_queue.put(input_vec)
        out_pyvs.append(out_pyv)
        print('done')

    print('job dispatch done. now wait...')
    if ncpu > 1:
        run_par_defs_wait(par_go_sim, work_queue, ncpu)

    # collect outputs via pyv
    collect_outputs(args.out_prefix, args.sim_method, args.method_id,
                    out_pyvs)

    print('cleanup work_dir files...')
    lib_utils.unlink_fns(upair_fns)
    lib_utils.unlink_fns(out_pyvs)
    # workD itself is kept on purpose (its removal was disabled in the original)

    print('done.')