def aln_df(self, infile, align_score):
    print("\tRead align file ", infile)
    file_sep = '\t' if myIO.file_os(infile).name_suffix() == 'txt' else ','
    aln_df = myIO.file_os(infile, file_sep).flat_file_to_df([0, 1, 11], True)
    #convert to binary matrix
    binary_b = pd.DataFrame(np.where(aln_df >= align_score, 1, 0))
    binary_b.index = [str(x).split('_')[1] for x in list(aln_df.index)]
    binary_b.columns = [re.sub(',', ';', str(x)) for x in list(aln_df)]
    binary_b.fillna(0, inplace=True)
    '''
    ##remove subset species, of which alignment score <1
    #calculate shared probes
    taxon_sums = binary_b.apply(sum, axis=0)
    taxons = list(binary_b)
    shared_prob = pd.DataFrame(index=taxons, columns=taxons)
    for taxon, col in binary_b.iteritems():
        shared_prob[taxon] = binary_b.apply(lambda x, y=col: np.dot(y, x), axis=0)/taxon_sums
        shared_prob[taxon][taxon] = 0
    #
    tags_df = pd.DataFrame(index=taxons, columns=taxons)
    for taxon, col in shared_prob.iteritems():
        row = shared_prob.loc[taxon]
        tags_df[taxon] = [1 if a != 1 and b == 1 else 0 for a, b in zip(col, row)]
    sum_tags = tags_df.apply(max, axis=0)
    reserved_taxons = list(sum_tags[sum_tags == 0].index)
    binary_b = binary_b[reserved_taxons]
    '''
    return binary_b
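# Usage sketch (illustration only, not part of the pipeline): the core of aln_df() is
# thresholding a numeric score matrix into a 0/1 hit matrix with np.where. The toy
# scores and the cutoff of 30 below are made-up values.
import numpy as np
import pandas as pd

scores = pd.DataFrame({'taxonA': [12.0, 45.5], 'taxonB': [88.0, 7.2]},
                      index=['probe_1', 'probe_2'])
binary = pd.DataFrame(np.where(scores >= 30, 1, 0),
                      index=scores.index, columns=scores.columns)
print(binary)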
def match_ensembl_fa(self, gtf_file):
    #get order of chr
    ref_arr = self.fa_displayid()
    #export matched gtf
    file_head = myIO.file_os(self.biofile).file_prefix()
    out_gtf = file_head + '.' + myIO.file_os(gtf_file).name_suffix()
    out_obj = open(out_gtf, 'wt')  #out_gtf_file
    print('Match the first column of {} with {}, => {}'.format(gtf_file, self.biofile, out_gtf))
    for chr_id in ref_arr:
        n = 0
        #read the gtf passed as an argument (not self.gtf_file)
        in_obj = self.readonly_handle(gtf_file)
        for line in in_obj:
            if not line.startswith('#'):
                items = line.split("\t")
                seqid = items[0]
                if chr_id == seqid:
                    out_obj.write(line)
                    n += 1
        in_obj.close()
        print('{}:{}'.format(chr_id, n))
    out_obj.close()
def match_ncbi_fa(self, gtf_file):
    #read matched information from gtf_file
    seqid_chr = genome(gtf_file).read_ncbi_gff('seqid', 'chromosome')
    #out_gtf_file
    file_head = myIO.file_os(self.biofile).file_prefix()
    out_gtf = file_head + '.' + myIO.file_os(gtf_file).name_suffix()
    out_obj = open(out_gtf, 'wt')
    #get order of chr
    ref_arr = self.fa_displayid()
    #export matched gtf
    print('Match the first column of {} with {}, => {}'.format(gtf_file, self.biofile, out_gtf))
    for chr_id in ref_arr:
        chromosome = re.sub('chr', '', chr_id, flags=re.IGNORECASE)
        n = 0
        #read the annotation file (gtf), not the fasta in self.biofile
        in_obj = self.readonly_handle(gtf_file)
        for line in in_obj:
            if not line.startswith('#'):
                items = line.split("\t")
                seqid = items[0]
                if seqid in seqid_chr and seqid_chr[seqid] == chromosome:
                    items[0] = chr_id
                    myline = "\t".join(items)
                    out_obj.write(myline)
                    n += 1
        in_obj.close()
        print('{}:{}'.format(chr_id, n))
    out_obj.close()
def collapse_matrix(self, pars):
    infile, outfile, collapse_func = pars
    print('t:', outfile)
    #read counts_file: pep_df
    file_sep = '\t' if myIO.file_os(infile).name_suffix() == 'txt' else ','
    pep_df = pd.read_csv(infile, header=0, index_col=0, sep=file_sep, low_memory=False)
    sample_names = list(pep_df)
    #both column and row names should be string type
    pep_df.columns = pep_df.columns.astype(str)
    pep_df.index = pep_df.index.astype(str)
    #combine pep_df with protein annotation
    pep_pro = self.par['annot_df'][['pep_id', 'pro_id']]
    combined_df = pd.merge(pep_df, pep_pro, how='inner', left_index=True, right_on='pep_id')
    #group by protein id
    group_dict = combined_df.groupby(['pro_id'], as_index=False).groups
    collapse = dict()
    for protein_id, row_names in group_dict.items():
        subdf = pep_df.loc[row_names]  #.loc replaces the deprecated .ix indexer
        collapse[protein_id] = subdf.apply(collapse_func, axis=0)
    #convert to data frame and transpose
    cdf = pd.DataFrame(collapse).transpose()
    #keep only the sample columns and round
    cdf = np.round(cdf[sample_names], 1)
    #reorder by row names
    cdf = cdf.loc[self.par['pro_ids']]
    #export
    file_sep = '\t' if myIO.file_os(outfile).name_suffix() == 'txt' else ','
    cdf.to_csv(outfile, index_label='pro_id', sep=file_sep)
    return cdf
def phipseq_alignment(self, sample_name):
    print('\n######Analysis of {} will be triggered!#####'.format(sample_name))
    #initiate sample par
    sample_var = dict(self.par)
    sample_var['start_time'] = time.time()
    #sample name
    sample_var['sample_name'] = sample_name
    #sample directory
    sample_dir = self.par['sample_dirs'][sample_name]
    sample_var['sample_dir'] = myIO.dir_os(sample_dir).create_dir()
    print('\tSample directory: ', sample_var['sample_dir'])
    #raw data
    sample_var['sample_raw_files'] = ','.join(sample_var['sample_to_raw'][sample_name])
    print('\tRaw files: ', sample_var['sample_raw_files'])
    #export
    sample_var['file_head'] = sample_var['sample_dir'] + sample_name
    #default sam file
    sample_var['sample_sam_file'] = sample_var['file_head'] + '.sam'
    #file of read counts
    sample_var['sample_RC_file'] = sample_var['file_head'] + '_RC.txt'
    sample_var['sample_pro_sumRC_file'] = sample_var['file_head'] + '_pro_sumRC.txt'
    sample_var['sample_pro_maxRC_file'] = sample_var['file_head'] + '_pro_maxRC.txt'
    #file for saturation analysis
    sample_var['sample_saturation_file'] = sample_var['file_head'] + '_saturation.txt'
    #sample log
    sample_var['sample_log'] = sample_var['file_head'] + '.log'
    #sequence alignment
    if sample_var['phip_alignment'] == 'yes':
        print("\n###sequence alignment", sample_var['tool_aligner'])
        #output is a sam file
        if sample_var['tool_aligner'] == 'bowtie1':
            myAlign.alignment(sample_var).bowtie1_alignment()
    #count reads
    if sample_var['phip_counting'] == 'yes':
        #RC matrix by peptides
        myAlign.alignment(sample_var).count_reads()
        #RC matrix by proteins
        if 'file_annotation' in self.par.keys():
            self.combine_peptides(sample_var)
    #update sample log
    sample_times = mySystem.system().get_time(sample_var['start_time'])
    sample_times['sample_name'] = sample_name
    myIO.file_os(sample_var['sample_log'], '=').line_replace(sample_times)
def download_annot(self, genome_type):
    url = self.url[genome_type]
    #get html and the list of files
    url_dir, url_files = web(url).ls_html()
    #download and decompress genome files
    local_chr_files = {}
    for file_url in url_files.keys():
        gz_file = myIO.file_os(file_url).download(self.out_dir)
        #decompress file
        ungz_file = myIO.file_os(gz_file).decompress_gz()
        local_chr_files[file_url] = ungz_file
    return local_chr_files
def file_to_samples(self):
    #get all fastq files
    raw_files = self.seek_fq(self.par['dir_raw_data'])
    print('Number of raw files:', len(raw_files))
    #read sample info file
    print('Read sample file: ', self.par['file_sample_info'])
    in_obj = open(self.par['file_sample_info'], 'rt')
    #set connections between raw data and sample_name
    for line in in_obj:
        line = line.rstrip("\n")
        items = line.split(',')
        raw_file_name = items[0]
        sample_name = items[1]
        for raw_file in raw_files:
            file_name = myIO.file_os(raw_file).file_name()
            if file_name.find(raw_file_name) == 0:
                #dict: raw_sample
                self.raw_sample[raw_file] = sample_name
                #dict: sample_raw
                if sample_name in self.sample_raw:
                    self.sample_raw[sample_name].append(raw_file)
                else:
                    self.sample_raw[sample_name] = [raw_file]
    in_obj.close()
def protein_peptides(self):
    pro_pep = {}
    #read annotation file
    annot_dict = myIO.file_os(self.par['file_annotation'], "\t").to_dict2()
    if 'Rnl2_SPIKEIN' in annot_dict:
        annot_dict['Rnl2_SPIKEIN']['pep_rank'] = 0
    in_pro = [annot_dict[p]['pro_id'] for p in annot_dict.keys()]
    in_pro = list(set(in_pro))
    print('In proteins:{}, In peptides:{}'.format(len(in_pro), len(annot_dict)))
    #group peptide ranks by protein id
    pro_rank_pep = {}
    for pep_id in self.par['pep_ids']:
        pro_id = annot_dict[pep_id]['pro_id']
        pep_rank = annot_dict[pep_id]['pep_rank']
        #ranks read from the file are strings, so cast digits to int and default to 0
        pep_rank = int(pep_rank) if str(pep_rank).isdigit() else 0
        if pro_id in pro_rank_pep:
            pro_rank_pep[pro_id][pep_id] = pep_rank
        else:
            pro_rank_pep[pro_id] = {pep_id: pep_rank}
    #sort the peptides of each protein by rank
    pep_num = 0
    for pro_id, pep_dict in pro_rank_pep.items():
        peps = sorted(pep_dict, key=pep_dict.get)
        pep_num += len(peps)
        pro_pep[pro_id] = ','.join(peps)
    #export
    print("Number of proteins:{}\tNumber of peptides:{}.".format(len(pro_pep), pep_num))
    myDict.basic(pro_pep, self.par['pro_ids']).dict_to_file(self.par['file_pro_pep'], "\t")
    return pro_pep
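# Usage sketch (illustration only): the peptide ordering above relies on sorting dict
# keys by their values; the ranks below are made up.
ranks = {'pep_3': 2, 'pep_1': 0, 'pep_2': 1}
print(sorted(ranks, key=ranks.get))  # ['pep_1', 'pep_2', 'pep_3']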
def download_annot(self, genome_type):
    url = self.url[genome_type]
    #get html
    lines = web(url).get_html()
    #get the list of files
    chr_files = self.single_file(lines)
    print(chr_files)
    #download and decompress genome files
    local_chr_files = {}
    for key in chr_files.keys():
        gz_file = myIO.file_os(url + chr_files[key]).download(self.out_dir)
        #decompress file
        ungz_file = myIO.file_os(gz_file).decompress_gz()
        local_chr_files[key] = ungz_file
    return local_chr_files
def shrink_fq(self, par):
    #read relationship between barcode and sample from sample_file
    barcode_sample = myIO.file_os(par['barcode_file'], '\t').to_dict()
    #file handles
    m = 0
    n = 0
    F1 = self.readonly_handle(self.biofile)
    out1 = open(self.biofile + '.shrink', 'wt')
    F2 = self.readonly_handle(par['index_file'])
    out2 = open(par['index_file'] + '.shrink', 'wt')
    with F1, F2:
        #read 4 lines at a time per file
        for L1, La, L2, Lb, L3, Lc, L4, Ld in itertools.zip_longest(*[F1, F2] * 4):
            barcode = Lb.rstrip()
            #assign record based on barcode
            if barcode in barcode_sample.keys():
                #write the paired records
                out1.writelines([L1, L2, L3, L4])
                out2.writelines([La, Lb, Lc, Ld])
                m += 1
            n += 1
    out1.close()
    out2.close()
    print("{}->{}({})".format(n, m, m / n))
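# Usage sketch (illustration only): shrink_fq() walks two FASTQ files in lockstep by
# pulling 8 lines per iteration from the interleaved handles. The same trick on a single
# handle looks like this; 'reads.fastq' is a hypothetical file name.
import itertools

with open('reads.fastq', 'rt') as fq:
    for header, seq, plus, qual in itertools.zip_longest(*[fq] * 4):
        #each iteration yields one complete 4-line FASTQ record
        print(header.rstrip(), len(seq.rstrip()))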
def QC_statistics(self):
    print("###Quality control: statistics summary")
    stat_dict = collections.defaultdict(dict)
    for sample_name in self.par['sample_names']:
        sample_log = '{}{}/{}.log'.format(self.par['dir_result'], sample_name, sample_name)
        stat_dict[sample_name] = myIO.file_os(sample_log, '=').to_dict()
    #convert to data frame
    stat_df = pd.DataFrame(stat_dict)
    stat_df = stat_df.transpose()
    #1: scatter plot of raw reads vs aligned reads
    sub_df = stat_df[['raw_reads_num', 'unique_aligned_reads_num']].astype(float)/1e6
    plot_par = {'df': sub_df, 'title': 'raw_reads_vs_aligned_reads',
                'picfile': self.par['dir_QC'] + 'raw_reads_vs_aligned_reads.png',
                'pch': 'o', 'text': 'million reads'}
    myPlot.plot(plot_par).dotP()
    #2: scatter plot of aligned percentage
    stat_df['unique_aligned_percentage'] = sub_df['unique_aligned_reads_num']*100/sub_df['raw_reads_num']
    plot_par['df'] = stat_df[['raw_reads_num', 'unique_aligned_percentage']].astype(float)
    plot_par['title'] = 'percentage_aligned_reads'
    plot_par['picfile'] = self.par['dir_QC'] + 'percentage_aligned_reads.png'
    myPlot.plot(plot_par).dotP()
    #3: export to csv file
    print('\tSave statistical summary into {}.'.format(self.par['file_stat']))
    stat_df.to_csv(self.par['file_stat'], index_label='sample_names')
def sig_polyclonal(self, count_file):
    print("Polyclonal analysis of ", count_file)
    comb_df, pep_df = myCommon.basic(self.par).combine_df(count_file)

    #per-sample function applied to each protein's peptide counts
    def hits_func(x, peps, threshold, pro_id):
        #significant hits
        hits = x[x >= threshold]
        #non-overlapping peptides
        peps = [str(x) for x in peps]
        hit_peps = [str(x) for x in hits.index]
        none_overlapped_hits_num = myList.basic(peps).un_neighbours(hit_peps, return_type='hits_num')
        return len(list(hits.index)), none_overlapped_hits_num, ','.join(hit_peps)

    #collapse by protein
    hits1 = {}
    hits2 = {}
    for pro_id, row_index in comb_df.groupby('pro_id').groups.items():
        #get protein-peptide annotations
        peps_str = self.par['dict_pro_pep'][pro_id]
        peps = peps_str.split(',')
        #df by protein
        sub_df = pep_df.loc[row_index]  #.loc replaces the deprecated .ix indexer
        #hits num beyond zscore threshold
        hits_num = sub_df.apply(hits_func, axis=0,
                                args=(peps, self.par['zscore_threshold'], pro_id))
        #total number of significant hits
        num1 = [h[0] for h in hits_num]
        hits1[pro_id] = dict(zip(list(sub_df), list(num1)))
        #number of significant hits without overlapping
        num2 = [h[1] for h in hits_num]
        hits2[pro_id] = dict(zip(list(sub_df), list(num2)))
    #export
    file_head = myIO.file_os(count_file).file_prefix() + '_polyclonal'
    myDict.basic(hits1, self.par['pro_ids']).dict2_to_file(file_head + '.txt', "\t")
    myDict.basic(hits2, self.par['pro_ids']).dict2_to_file(file_head + '_nonoverlapped.txt', "\t")
def annot_df(self, infile):
    file_sep = '\t' if myIO.file_os(infile).name_suffix() == 'txt' else ','
    annot_df = pd.read_csv(infile, header=0, index_col=None, sep=file_sep, low_memory=False)
    #both column and row names should be string type
    annot_df.index = annot_df['pep_id']
    annot_df.index = annot_df.index.astype(str)
    return annot_df
def __init__(self, biofile=None, sep=None):
    self.biofile = biofile
    #separator character
    if sep is None:
        self.sep = ',' if myIO.file_os(self.biofile).name_suffix() == 'csv' else "\t"
    else:
        self.sep = sep
    self.record_num = 0
def extract_annot(self, left_column, right_column, FUN):
    annot_dict = {}
    #read annotation file
    annot_df = myIO.file_os(self.par['file_annotation'], sep="\t").to_df(header=True, rowname=False)
    for index, row in annot_df.iterrows():
        key = row[left_column]
        value = FUN(row[right_column])
        annot_dict[key] = value
    return annot_dict
def QC_hits(self, infile, threshold=None):
    print('###Relationship between significant hits and raw read num of ', infile)
    file_prefix = '{}{}_'.format(self.par['dir_QC'], myIO.file_os(infile).name_prefix())
    if threshold is None:
        threshold = float(self.par['zscore_threshold'])
    #read statistics file
    stat_df = pd.read_csv(self.par['file_stat'], sep=",", index_col=0, low_memory=False)
    stat_df.index = stat_df['sample_name']  #assign row names
    stat_df = stat_df.loc[self.par['sample_names']]  #order rows by sample_names (.loc replaces .ix)
    raw_reads = stat_df['raw_reads_num']/1e6
    #read values file
    in_df = pd.read_csv(infile, sep="\t", index_col=0, low_memory=False)  #row and column names
    order_df = in_df[self.par['sample_names']].copy()  #order columns

    #plot of raw reads vs number of hits
    def func1(x, y=threshold):
        sig = x[x >= y]
        return len(sig)
    hits_num = order_df.apply(func1, axis=0)
    #get compared df
    comp_df = pd.DataFrame({'A': raw_reads, 'B': hits_num})
    comp_df.to_csv(file_prefix + 'raw_vs_sighits.csv', sep=',')
    #plot
    plot_par = {'df': comp_df, 'legend': None,
                'title': 'Effects of sequencing depth on significant hits',
                'picfile': file_prefix + 'raw_vs_sighits.png',
                'xlabel': 'Number of raw reads (million)',
                'ylabel': 'Number of significant hits'}
    myPlot.plot(plot_par).dotP()

    #plot of raw reads vs mean values of hits
    def func2(x, y=threshold):
        x = pd.Series(x)
        sig = x[x >= y]
        sig_mean = np.mean(sig)
        return sig_mean
    hits_mean = order_df.apply(func2, axis=0)
    #get compared df
    comp_df = pd.DataFrame({'A': raw_reads, 'B': hits_mean})
    outfile = file_prefix + 'raw_vs_mean_significant_hits.csv'
    print('\texport QC to {}.'.format(outfile))
    comp_df.to_csv(outfile, sep=',')
    #plot
    plot_par = {'df': comp_df, 'legend': None,
                'title': 'Effects of sequencing depth on significant hits',
                'picfile': file_prefix + 'raw_vs_mean_significant_hits.png',
                'xlabel': 'Number of raw reads (million)',
                'ylabel': 'Mean values of significant hits'}
    myPlot.plot(plot_par).dotP()
def normalize_by_factor(self, annot_file, col_index, col_factor='pro_len'):
    #read RC file
    RC_df = myIO.file_os(self.biofile, sep=self.sep).to_df(header=True, rowname=True)
    #read annotation file
    annot_df = myIO.file_os(annot_file, sep="\t").to_df(header=True, rowname=True)
    sub_annot = annot_df[[col_index, col_factor]].drop_duplicates([col_index], keep='last')  #keep= replaces the removed take_last=
    sub_annot.index = list(sub_annot[col_index])
    sub_annot = sub_annot.iloc[:, 1:]  #remove the column with col_index
    #sort annot_df by row names of RC_df
    sub_annot = sub_annot.loc[list(RC_df.index)]
    #for missing proteins, pro_len equals the average pro_len
    ave_pro_len = np.mean(sub_annot[col_factor])
    pro_len_df = sub_annot.fillna(ave_pro_len)
    #normalization by aa length of proteins
    RC_df.insert(0, col_factor, list(pro_len_df[col_factor]))
    normRC_df = RC_df.apply(lambda x: x[1:] / x[0], axis=1)

    #scaling normalization of each sample column
    def norm_func(x):
        sum_x = np.sum(x)
        norm_x = x * 10e6 / sum_x if sum_x > 0 else x
        norm_x = np.round(norm_x)
        norm_x = norm_x.astype(int)
        return norm_x
    normRC_df = normRC_df.apply(norm_func, axis=0)
    return normRC_df
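# Usage sketch (illustration only): normalize_by_factor() first divides each row's counts
# by a per-row factor (protein length), then scales every sample column so it sums to
# 10e6 before rounding. A minimal version on made-up numbers:
import numpy as np
import pandas as pd

counts = pd.DataFrame({'s1': [10, 200], 's2': [5, 50]}, index=['proA', 'proB'])
pro_len = pd.Series([100, 400], index=['proA', 'proB'])
per_len = counts.div(pro_len, axis=0)                        #length normalization
scaled = per_len.apply(lambda x: np.round(x * 10e6 / x.sum()).astype(int)
                       if x.sum() > 0 else x, axis=0)        #per-sample scaling
print(scaled)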
def standard_df(self, infile, fill=True):
    #read txt or csv file
    sep = ',' if myIO.file_os(infile).name_suffix() == 'csv' else '\t'
    stand_df = pd.read_csv(infile, header=0, index_col=0, sep=sep, low_memory=False)
    #both row and column names should be strings
    stand_df.index = [str(x) for x in list(stand_df.index)]
    stand_df.columns = [str(x) for x in list(stand_df)]
    #replace NAN
    if fill:
        stand_df.fillna(0, inplace=True)
    return stand_df
def NC_whole_std(self):
    print('\tPolynomial regression of std~median across ALL BEADS-ONLY.')
    file_prefix = '{}{}_'.format(self.par['dir_result'],
                                 myIO.file_os(self.par['file_NC']).name_prefix())
    norm_ncfile = file_prefix + 'scalingRC.txt'
    if os.path.isfile(norm_ncfile):
        phip_nc = pd.read_csv(norm_ncfile, sep='\t', index_col=0, low_memory=False)
    else:
        phip_nc = normalization(self.par, self.par['file_NC'], norm_ncfile).RC_scaling()
    #summary of nc: mean and std
    NC = pd.DataFrame({'mean': phip_nc.mean(axis=1), 'median': phip_nc.median(axis=1),
                       'std': phip_nc.std(axis=1), 'sum': phip_nc.sum(axis=1)})
    NC.loc[NC['median'] == 0, 'median'] = np.nan
    NC.loc[NC['std'] == 0, 'std'] = np.nan
    NC['logmedian'] = np.log10(NC['median'])
    NC['logstd'] = np.log10(NC['std'])
    #initiate reg_df for regression: filter out zero medians
    reg_df = NC.loc[NC['median'] > 0, :].copy()
    #order for polynomial regression
    reg_df = reg_df.sort_values(['logmedian'], ascending=True)
    #polynomial regression
    formula = 'logstd~logmedian+I(logmedian**2)+I(logmedian**3)'
    pn_model = smf.ols(formula, data=reg_df)
    pn_fit = pn_model.fit()
    reg_df['pred_logstd'] = pn_fit.predict()
    reg_df['pred_std'] = 10**pn_fit.predict()
    NC['pred_logstd'] = pn_fit.predict({'logmedian': NC['logmedian']})
    NC['pred_std'] = 10**NC['pred_logstd']
    #export fitting of std
    NC.to_csv(file_prefix + 'polynomial_std.csv', header=True, index_label='row_names')
    #draw graph
    xm = round(np.nanmax(list(NC['logmedian'])))
    ym = round(np.nanmax(list(NC['logstd'])))
    plot_par = {'df': NC[['logmedian', 'logstd']], 'xlim': (-.5, xm), 'ylim': (-.5, ym),
                'picfile': file_prefix + 'polynomial_std.png', 'text': pn_fit.params}
    try:
        myPlot.plot(plot_par).regressionP(reg_df['logmedian'], reg_df['pred_logstd'])
    except ValueError:
        print('Failed to draw the pic and save it into {}'.format(plot_par['picfile']))
    #return fitting model object
    return NC, pn_fit
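# Usage sketch (illustration only): the regression in NC_whole_std() is an ordinary
# least-squares fit of log10(std) on a cubic polynomial of log10(median) via the
# statsmodels formula API. The simulated data below stand in for the beads-only counts.
import numpy as np
import pandas as pd
import statsmodels.formula.api as smf

rng = np.random.default_rng(0)
logmedian = np.linspace(0.5, 4, 200)
logstd = 0.2 + 0.8 * logmedian + rng.normal(0, 0.1, logmedian.size)
df = pd.DataFrame({'logmedian': logmedian, 'logstd': logstd})
fit = smf.ols('logstd ~ logmedian + I(logmedian**2) + I(logmedian**3)', data=df).fit()
df['pred_std'] = 10 ** fit.predict(df)
print(fit.params)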
def init_aligner_par(self):
    if self.par['tool_aligner'] == 'bowtie1':
        self.par['bowtie_aligner'] = self.par['dir_aligner'] + 'bowtie'
        self.par['bowtie_builder'] = self.par['dir_aligner'] + 'bowtie-build'
    elif self.par['tool_aligner'] == 'bowtie2':
        self.par['bowtie_aligner'] = self.par['dir_aligner'] + 'bowtie2'
        self.par['bowtie_builder'] = self.par['dir_aligner'] + 'bowtie2-build'
    #bowtie index
    self.par['bowtie_index_name'] = myIO.file_os(self.par['file_ref_fa']).file_prefix()
    self.par['bowtie_index'] = self.par['dir_aligner'] + self.par['bowtie_index_name']
def export_df(self, outfile, threshold=10, index_label='row_names'):
    print('\texport data frame to ', outfile)
    outsep = ',' if outfile.endswith('.csv') else '\t'
    self.df.to_csv(outfile, sep=outsep, index_label=index_label)
    #draw a horizontal bar plot of hits per sample
    counts = self.df.apply(lambda x, y=threshold: len(x[x >= y]), axis=0)
    plot_par = {'list': counts, 'ylabel': 'Sample_names', 'xlabel': 'Number of hits',
                'picfile': myIO.file_os(outfile).file_prefix() + '.png',
                'title': 'Number of hits, threshold=' + str(threshold)}
    myPlot.plot(plot_par).simple_barh()
def raw_to_samples(self):
    #get all fastq files
    raw_files = self.seek_fq(self.par['dir_raw_data'])
    #connect raw file to sample name
    for raw_file in raw_files:
        sample_name = myIO.file_os(raw_file).name_prefix()
        self.raw_sample[raw_file] = sample_name
        if sample_name in self.sample_raw:
            self.sample_raw[sample_name].append(raw_file)
        else:
            self.sample_raw[sample_name] = [raw_file]
def sample_info(self):
    sample_pairs = {}
    for raw_file, sample_name in self.raw_sample.items():
        raw_file_name = myIO.file_os(raw_file).file_name()
        group = 'NC' if 'BEADS' in raw_file_name.upper() else 'PhIP'
        if 'unassigned' not in raw_file_name:
            sample_name = re.sub('_R1', "", sample_name)
            pair = '{},{}'.format(raw_file_name, sample_name)
            sample_pairs[pair] = group
    #export dict to file
    print('Generate sample file: ', self.par['file_sample_info'])
    #order per record: fastq file name, sample_name, phip_group
    myDict.basic(sample_pairs).dict_to_file(self.par['file_sample_info'], ',')
def combine_df(self, counts_file, annot_index='pep_id'):
    #read count file
    file_sep = '\t' if myIO.file_os(counts_file).name_suffix() == 'txt' else ','
    counts_df = pd.read_csv(counts_file, sep=file_sep, index_col=0, low_memory=False)
    counts_df.index = [str(x) for x in counts_df.index]
    #read annotation file
    file_sep = '\t' if myIO.file_os(self.par['file_annotation']).name_suffix() == 'txt' else ','
    annot_df = pd.read_csv(self.par['file_annotation'], sep=file_sep, index_col=None, low_memory=False)
    annot_df.index = [str(x) for x in annot_df[annot_index]]
    #combine by rows
    comb_df = pd.merge(annot_df, counts_df, left_index=True, right_index=True, how='inner')
    comb_df.index = list(comb_df[annot_index])
    #sample df
    sample_df = comb_df[self.par['sample_names']]
    sample_df.index = list(comb_df[annot_index])
    return (comb_df, sample_df)
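# Usage sketch (illustration only): combine_df() joins the annotation table onto the
# counts table by row index with an inner merge, so only annotated peptides survive.
# The frames below are made up.
import pandas as pd

counts = pd.DataFrame({'sample1': [3, 7]}, index=['pep1', 'pep2'])
annot = pd.DataFrame({'pep_id': ['pep1', 'pep2'], 'pro_id': ['proA', 'proA']},
                     index=['pep1', 'pep2'])
comb = pd.merge(annot, counts, left_index=True, right_index=True, how='inner')
print(comb)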
def download_idmapping(self):
    #get web file list
    url_idmapping = self.url + 'knowledgebase/idmapping/by_organism/'
    web_dir, web_files = web(url_idmapping).ls_html()
    #select file
    file_names = [x for x in web_files.values() if '.dat.' in x]  #a list, so it can be sorted under Python 3
    file_names.sort()
    file_name = mySystem.system().select_key(file_names, 'Select web file')
    #download idmapping dat file
    url_file = url_idmapping + file_name
    local_file = self.out_dir + file_name
    web(url_file).download_file(local_file)
    #decompress file
    ungz_file = myIO.file_os(local_file).decompress_gz()
    print('Save ', url_file, ' as ', ungz_file)
    return ungz_file
def download_dna(self):
    url = self.url['dna_fa']
    #get the list of genome files from the html
    lines = web(url).get_html()
    chr_files = self.dna_files(lines)
    #download genome files
    local_chr_files = {}
    for key in chr_files.keys():
        #release version
        self.ver = re.sub(r"\.chromosome.*", '', chr_files[key])
        gz_file = myIO.file_os(url + chr_files[key]).download(self.out_dir)
        #decompression is skipped here; the gz file is kept
        #ungz_file = myIO.file_os(gz_file).decompress_gz()
        local_chr_files[key] = gz_file
    #combine fa files
    out_file = self.out_dir + self.ver + '.fa'
    myGenome.genome(out_file).combine_fa(local_chr_files)
    return local_chr_files, out_file
def download_dna(self):
    #get html
    lines = web(self.url['dna_fa']).get_html()
    chr_files = self.dna_files(lines)
    #download genome files
    local_chr_files = {}
    for key in chr_files.keys():
        #release version
        self.ver = re.sub(r"_chr.*", '', chr_files[key])
        url = self.url['dna_fa'] + chr_files[key]
        gz_file = myIO.file_os(url).download(self.out_dir)
        #decompression is skipped here; the gz file is kept
        #ungz_file = myIO.file_os(gz_file).decompress_gz()
        local_chr_files[key] = gz_file
    #combine fa files
    out_file = ''.join([self.out_dir, self.ver, '_dna.fa'])
    myGenome.genome(out_file).combine_fa(local_chr_files)
    return local_chr_files, out_file
def combine_countfiles(self, args_tuple):
    #row_names should be None or list type
    infile_tail, RC_level, out_file, row_names = args_tuple
    counting_dict2 = {}
    for sample_name in self.par['sample_names']:
        #get read counts of a given sample
        counting_file = '{}{}/{}{}'.format(self.par['dir_result'], sample_name,
                                           sample_name, infile_tail)
        sample_dict2 = myIO.file_os(counting_file, '\t').to_dict2()
        for ref in sample_dict2.keys():
            counts = sample_dict2[ref][RC_level]
            if ref in counting_dict2:
                counting_dict2[ref].update({sample_name: counts})
            else:
                counting_dict2[ref] = {sample_name: counts}
    #export counting_dict
    myDict.basic(counting_dict2).dict2_to_file(out_file=out_file, row_names=row_names)
def trim_fq(self, outdir, seq_start=0, seq_end=0):
    file_name = myIO.file_os(self.biofile).file_name()
    outfile = outdir + re.sub(r'\.gz$', '', file_name)
    print("Trim fastq file {}, and save the new file {}\n".format(self.biofile, outfile))
    #get the file handle of the fastq file, and open the output file
    F1 = self.readonly_handle(self.biofile)
    out_obj = open(outfile, 'wt')
    with F1:
        #read 4 lines at a time
        for L1, L2, L3, L4 in zip(*[F1] * 4):
            #trim from the 5'-end
            if seq_start > 0:
                L2 = L2[seq_start:]
                L4 = L4[seq_start:]
            #trim the longer reads from the 3'-end
            if seq_end != 0:
                L2 = L2.rstrip()
                L4 = L4.rstrip()
                L2 = L2[:seq_end] + "\n"
                L4 = L4[:seq_end] + "\n"
            #export to the output file
            out_obj.writelines([L1, L2, L3, L4])
    out_obj.close()
    '/home/yuan/results_phip',
    '/home-4/[email protected]/work/yuan/results_phip'
]
#get all variables.txt files
files_var = []
for d in dirs:
    if os.path.isdir(d):
        files_formula = os.path.join(d, '*' + file_type + '*/variables.txt')
        files_var += glob.glob(files_formula)
#revise the parameters of each variables.txt
for index, file_var in enumerate(files_var):
    print(index + 1, file_var)
    myIO.file_os(file_var, '=').line_replace(par)
#parallel processing (disabled)
#pool = mpd.Pool(processes=8)   #threads number
#pool.map(phip_thread, files_var)   #pass one argument at a time
#pool.close()
#pool.join()
print('\n\n\n\nGreat! The batch running is done!\n\n\n')
#end