def match_fasta(self):
    files = myIO.dir_os(self.par['dir_out']).incrusive_files()
    #select a fasta file
    #wrap filter() in list(): under Python 3, filter() returns a one-shot iterator
    fa_files = list(filter(lambda x: x.endswith(('.fa', '.fasta')), files))
    self.par['match_fa'] = mySystem.system().select_key(fa_files)
    #select a gtf or gff file
    gtf_files = list(filter(lambda x: x.endswith(('.gtf', '.gff3')), files))
    self.par['match_gtf'] = mySystem.system().select_key(gtf_files)
    #match fa against gtf/gff
    #bug fix: par was undefined inside this method, so read from self.par;
    #the 'ENSEML' key is kept as spelled, since it must match the value
    #assigned to par['web_site'] elsewhere in the pipeline
    if self.par['web_site'] == 'ENSEML':
        myGenome.genome(self.par['match_fa']).match_ensembl_fa(self.par['match_gtf'])
    elif self.par['web_site'] == 'NCBI':
        myGenome.genome(self.par['match_fa']).match_ncbi_fa(self.par['match_gtf'])
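#A minimal, standalone sketch of the suffix-filtering step above, using only the
#standard library (the directory path and file names are hypothetical examples;
#the real lookup goes through myIO.dir_os and mySystem.system().select_key):
import os

def pick_by_suffix(dir_out, suffixes):
    #walk dir_out recursively and keep files whose name ends with one of suffixes
    hits = []
    for root, _dirs, names in os.walk(dir_out):
        hits.extend(os.path.join(root, n) for n in names if n.endswith(suffixes))
    return hits

#usage: pick_by_suffix('/path/to/out', ('.fa', '.fasta')) -> ['/path/to/out/genome.fa', ...]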
def init_analysis(self):
    #1: read annotation file
    if 'file_annotation' in self.par:
        self.par['annot_df'] = myDataframe.basic().annot_df(self.par['file_annotation'])
        #genome annotation: associations of protein-peptides
        self.par['dict_pro_pep'] = myCommon.basic(self.par).protein_peptides()
        #virus only
        if 'VirScan' in self.par['file_annotation']:
            #extract aa stretch
            #get dependent peptides, namely pairs of peptides sharing at least a 7-aa stretch
            self.par['dependent_pep'] = myCommon.basic(self.par).taxon_dependent_peptides()
    #2: check bowtie or build bowtie index
    myAlign.alignment(self.par).build_bowtie_index()
    #3: sample info
    self.par = myParallel.samples(self.par).export_sample_info()
    #samples of negative controls
    group1 = self.par['group1']
    if 'NC' in group1:
        self.par['NC_samples'] = group1['NC'].split(',')
        self.par['phip_samples'] = list(
            set(self.par['sample_names']) - set(self.par['NC_samples']))
        print('\nNumber of negative controls (beads only): ', len(self.par['NC_samples']))
        #count the PhIP samples just computed rather than all sample names
        print('Number of PhIP samples: ', len(self.par['phip_samples']))
    #myDict.basic(self.par['sample_dirs']).print_dict()
    #read reference sequence file (*.fa)
    ref_dict, ref_ids = myGenome.genome(self.par['file_ref_fa']).read_fa()
    self.par['ref_dict'] = ref_dict
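#For context: a minimal sketch of what a read_fa() like the call above could do,
#inferred only from its use here (it returns a sequence dict plus the ordered ids);
#this is an illustration, not the actual myGenome implementation:
def read_fa_sketch(fa_file):
    seqs, ids = {}, []
    with open(fa_file) as handle:
        name = None
        for line in handle:
            line = line.rstrip()
            if line.startswith('>'):
                name = line[1:].split()[0]  #display id up to the first whitespace
                ids.append(name)
                seqs[name] = []
            elif name:
                seqs[name].append(line)
    return {k: ''.join(v) for k, v in seqs.items()}, ids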
def init_RCdict(self):
    RC_dict = {}
    #get all ref names from the refseq file
    ref_names = myGenome.genome(self.par['file_ref_fa']).fa_displayid()
    for ref in ref_names:
        RC_dict[ref] = {'lowRC': 0, 'midRC': 0, 'highRC': 0}
    #
    return RC_dict
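#Shape of the structure returned above (the ref names here are hypothetical):
#  {'pep_00001': {'lowRC': 0, 'midRC': 0, 'highRC': 0},
#   'pep_00002': {'lowRC': 0, 'midRC': 0, 'highRC': 0}, ...}
#so a downstream counting pass can increment one bin per reference, for example:
#  RC_dict[ref_name]['lowRC'] += 1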
def download_dna(self):
    #get html
    lines = web(self.url['dna_fa']).get_html()
    chr_files = self.dna_files(lines)
    #download and decompress genome files
    local_chr_files = {}
    for key in chr_files.keys():
        #release version
        self.ver = re.sub(r"_chr.*", '', chr_files[key])
        url = self.url['dna_fa'] + chr_files[key]
        gz_file = myIO.file_os(url).download(self.out_dir)
        #decompress file
        #ungz_file=myIO.file_os(gz_file).decompress_gz()
        local_chr_files[key] = gz_file
    #combine fa files
    out_file = ''.join([self.out_dir, self.ver, '_dna.fa'])
    #print out_file
    myGenome.genome(out_file).combine_fa(local_chr_files)
    return local_chr_files, out_file
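#What the re.sub above extracts, on a hypothetical UCSC-style file name:
#  re.sub(r"_chr.*", '', 'hg38_chr1.fa.gz')  -> 'hg38'
#i.e. everything from '_chr' onward is stripped, leaving the release version.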
def download_dna(self):
    url = self.url['dna_fa']
    #get genome files
    #get html
    lines = web(url).get_html()
    chr_files = self.dna_files(lines)
    #download and decompress genome files
    local_chr_files = {}
    for key in chr_files.keys():
        #release version; this variant strips Ensembl-style '.chromosome.*' names
        self.ver = re.sub(r"\.chromosome.*", '', chr_files[key])
        gz_file = myIO.file_os(url + chr_files[key]).download(self.out_dir)
        #decompress file
        #ungz_file=myIO.file_os(gz_file).decompress_gz()
        local_chr_files[key] = gz_file
    #combine fa files
    out_file = self.out_dir + self.ver + '.fa'
    #print out_file
    myGenome.genome(out_file).combine_fa(local_chr_files)
    return local_chr_files, out_file
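#A minimal sketch of what combining the downloaded per-chromosome files could look
#like with only the standard library, assuming the inputs are still gzip-compressed
#(the real work is done by myGenome.genome(out_file).combine_fa):
import gzip
import shutil

def combine_fa_sketch(out_file, gz_files):
    #concatenate the decompressed fasta records into one output file
    with open(out_file, 'wb') as out:
        for gz in gz_files.values():
            with gzip.open(gz, 'rb') as handle:
                shutil.copyfileobj(handle, out)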
def main_loop(self):
    print("\n\n####Parameters of PHIP: \n")
    #parallel processing
    if self.par['phip_alignment'] == 'yes' or self.par['phip_counting'] == 'yes':
        sample_names = self.par['sample_names']
        print(len(sample_names), ' samples will be analyzed.\n')
        #multi-threads
        #myCommon.basic(self.par).pp_map_threads(self.phipseq_alignment, sample_names)
        #multi-processes
        myCommon.basic(self.par).pp_map_process(mp_alignment, [(self, s) for s in sample_names])
    #combine RC and statistics file
    if self.par['phip_merge'] == 'yes':
        pep_names = myGenome.genome(self.par['file_ref_fa']).read_fa()[1]
        #1: combine RC files into RC matrix
        print('\n\n\n###Combine RC files (phip_merge)\n')
        #get arguments
        args_list = []
        RC_level = 'lowRC'
        #peptide level: lowRC
        out_file = self.par['files_dict']['pep_RC']
        args_list.append(('_RC.txt', RC_level, out_file, pep_names))
        if 'file_annotation' in self.par:
            #promax level
            out_file = self.par['files_dict']['promax_RC']
            args_list.append(('_pro_maxRC.txt', RC_level, out_file, None))
            #prosum level
            out_file = self.par['files_dict']['prosum_RC']
            args_list.append(('_pro_sumRC.txt', RC_level, out_file, None))
        #multi-threads
        myCommon.basic(self.par).pp_map_threads(
            myAlign.alignment(self.par).combine_countfiles, args_list)
        #myCommon.basic(self.par).pp_apply_threads(args_list)
        #2: generate statistics.csv
        myCommon.basic(self.par).QC_statistics()
    #significance analysis using Z score
    if self.par['phip_zscores'] == 'yes':
        print('\n\n\n###normalization of RC (phip_zscores)\n')
        #peptide level
        RC_file = self.par['files_dict']['pep_RC']  #infile
        #1: scaling RCs
        sRC_file = self.par['files_dict']['pep_scalingRC']  #outfile
        myStat.normalization(self.par, RC_file, sRC_file, 'pep_id').RC_scaling()
        #2: z-scores of scaling RCs against negative controls and phipseq samples
        zfile = self.par['files_dict']['pep_NCPHIPzscores']  #outfile
        if 'file_NC' in self.par:
            myStat.normalization(self.par, sRC_file, zfile, 'pep_id').NCPHIPzscores_PN()
        else:
            myStat.normalization(self.par, sRC_file, zfile, 'pep_id').NCPHIPzscores_RLM()
        #3: collapse peptide matrix into protein matrix
        if 'file_annotation' in self.par:
            print("\t######collapse peptide matrix into protein matrix")
            pars = []
            for name in ['scalingRC', 'NCPHIPzscores']:
                pep_file = self.par['files_dict']['pep_' + name]  #infile
                sum_file = self.par['files_dict']['pep_' + name + '_prosum']  #outfile
                pars.append((pep_file, sum_file, sum))
                max_file = self.par['files_dict']['pep_' + name + '_promax']  #outfile
                pars.append((pep_file, max_file, max))
            #multi-threading
            myCommon.basic(self.par).pp_map_threads(
                myCommon.basic(self.par).collapse_matrix, pars)
    #Functional analysis after normalization and correction
    #parallel processing
    print('\n\n\n###Functional Analysis (phip_GP and phip_enrichment)\n')
    pool = mpd.Pool(processes=self.par['threads_num'])
    #set the list of parameters
    pep_zfile = self.par['files_dict']['pep_NCPHIPzscores']  #infile
    promax_zfile = self.par['files_dict']['pep_NCPHIPzscores_promax']
    prosum_zfile = self.par['files_dict']['pep_NCPHIPzscores_prosum']
    if self.par['phip_GP'] == 'yes':
        #1: polyclonal of significant peptides
        pool.apply_async(self.sig_polyclonal, args=(pep_zfile,))
        #virus only
        if 'VirScan' in self.par['file_annotation']:
            #5: inter/intra-species searching, only for the virus library
            pool.apply_async(self.taxon_spec, args=(pep_zfile, 'phip_taxon', 'pep_id'))
            #6: species alignment of virus only
            file_aln = self.par['dir_ref_seq'] + 'specie_blast.txt'
            pool.apply_async(self.taxon_blast, args=(file_aln, pep_zfile))
            #7: organism alignment of virus only
            file_aln = self.par['dir_ref_seq'] + 'organism_blast.txt'
            pool.apply_async(self.taxon_blast, args=(file_aln, pep_zfile))
        ##quality control
        #1: relationship between significant hits and raw read num
        pool.apply_async(myCommon.basic(self.par).QC_hits, args=(pep_zfile,))
        pool.apply_async(myCommon.basic(self.par).QC_hits, args=(prosum_zfile,))
        pool.apply_async(myCommon.basic(self.par).QC_hits, args=(promax_zfile,))
        #2: saturation analysis
        pool.apply_async(myCommon.basic(self.par).QC_saturation)
    if self.par['phip_enrichment'] == 'yes':
        #5: detection of enriched protein motifs
        E = myCommon.basic(self.par)
        if 'pro_motifs' in list(self.par['annot_df']):
            pool.apply_async(E.enrich_pro, args=(pep_zfile, 'pep_id', 'pro_motifs', ';', ','))
        #6: GO, loci, PPI, KEGG, InterPro, and multifunctional scaffold protein
        #enrichment analysis
        terms = set(['GO', 'map', 'PPI', 'KEGG', 'InterPro', 'MIM', 'autoantigen']) \
            & set(list(self.par['annot_df']))
        for term in terms:
            pro = self.par['protein_assoc']
            pool.apply_async(E.enrich_pro, args=(prosum_zfile, pro, term, ',', None))
            pool.apply_async(E.enrich_pro, args=(promax_zfile, pro, term, ',', None))
    pool.close()
    pool.join()
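#One caveat worth noting for the pool.apply_async calls above: an exception raised
#inside a worker is silently discarded unless the AsyncResult objects are kept and
#.get() is called on each. A minimal sketch of that pattern, assuming mpd is
#multiprocessing.dummy as the Pool usage above suggests (the task list is hypothetical):
import multiprocessing.dummy as mpd  #thread-backed Pool with the multiprocessing API

def run_all(tasks, threads_num=4):
    #tasks: list of (func, args) tuples
    pool = mpd.Pool(processes=threads_num)
    results = [pool.apply_async(func, args=args) for func, args in tasks]
    pool.close()
    pool.join()
    for r in results:
        r.get()  #re-raises any worker exception so failures become visible

#usage: run_all([(print, ('hello',)), (print, ('world',))])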
sys.exit(2)
#no return

##############################
if __name__ == "__main__":
    #home_dir=os.path.expanduser("~")+'/'
    #get parameters from command line
    par = par_command(sys.argv)
    #judge parameters
    judge_par(par)
    #combine index files if par['I1_file'] and par['I2_file'] exist
    if os.path.isfile(par['I1_file']) and os.path.isfile(par['I2_file']):
        myGenome.genome(par['I1_file']).cbind_fq(par['I2_file'], par['index_file'])
    #demultiplexing: split fastq files based on barcode
    if par['multiplexing_mode'] == 3:
        print('The split FASTQ files are stored in {}'.format(par['dir_raw_data']))
        myGenome.genome(par['fq_file']).demultiplex_fq(par)
    #trim fastq files
    if 'fq_files' in par:
        for fq in par['fq_files']:
            myGenome.genome(fq).trim_fq(par['dir_raw_data'], par['seq_start'], par['seq_end'])
    #generate sample_info file under result dir:
    if par['out'] != 'NA':
##############################
if __name__ == "__main__":
    #home_dir=os.path.expanduser("~")+'/'
    #get parameters from command line
    par = par_command(sys.argv)
    #judge parameters
    judge_par(par)
    #combine index files if par['index_file'] or par['I1_file'] and par['I2_file']
    #if os.path.isfile(par['I1_file']) and os.path.isfile(par['I2_file']):
    #    myGenome.genome(par['I1_file']).cbind_fq(par['I2_file'], par['index_file'])
    #demultiplexing: split fastq files based on barcode
    if os.path.isfile(par['index_file']):
        myGenome.genome(par['fq_file']).decompose_fq(par)
    elif os.path.isfile(par['I1_file']) and os.path.isfile(par['I2_file']):
        myGenome.genome(par['fq_file']).decompose_fq2(par)
    #trim fastq files
    if 'fq_files' in par:
        for fq in par['fq_files']:
            myGenome.genome(fq).trim_fq(par['dir_raw_data'], par['seq_start'], par['seq_end'])
    #generate sample_info file under result dir:
    if par['out'] != 'NA':
        #current dir
        par['dir_bin'] = os.path.dirname(os.path.realpath(__file__)) + '/'
        par['dir_home'] = os.path.abspath(os.path.join(par['dir_bin'], os.pardir)) + '/'
        print('Home directory of phip pipeline: ', par['dir_home'])
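#A minimal sketch of what the cbind_fq step referenced above could do: pair up the
#I1 and I2 index reads record by record and write one combined index read. This is
#an illustration inferred from the call signature, not the myGenome implementation:
def cbind_fq_sketch(i1_file, i2_file, out_file):
    with open(i1_file) as f1, open(i2_file) as f2, open(out_file, 'w') as out:
        while True:
            rec1 = [f1.readline().rstrip() for _ in range(4)]  #fastq record: id, seq, '+', qual
            rec2 = [f2.readline().rstrip() for _ in range(4)]
            if not rec1[0] or not rec2[0]:
                break  #end of either file
            #keep the I1 read id, concatenate the sequences and quality strings
            out.write('{}\n{}{}\n+\n{}{}\n'.format(
                rec1[0], rec1[1], rec2[1], rec1[3], rec2[3]))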