示例#1
0
 def df_list0(self, col_name, sep1=None, sep2=None):
     """Return the unique values found in one column of ``self.df``.

     Without a separator, the distinct cell values are returned unchanged
     (in arbitrary ``set`` order).  With ``sep1``, every cell is converted
     to a string and split on ``sep1``; when ``sep2`` is given as well, each
     piece is split again on ``sep2`` and only its first field is kept.  The
     unique items are then handed to ``myList.basic(...).sort_list()``
     (presumably returning them sorted -- confirm against ``myList``).

     Args:
         col_name: key used to look the column up in ``self.df``.
         sep1: primary separator, or ``None`` to treat cells as atomic.
         sep2: optional secondary separator applied to each ``sep1`` piece.

     Returns:
         A list of unique items.
     """
     inlist = list(self.df[col_name])
     if sep1 is None:
         return list(set(inlist))

     # A dict is used as an insertion-ordered set so that the object handed
     # to myList.basic() stays a dict key view, as before.
     out_dict = {}
     for cell in inlist:
         # str() keeps non-string cells (NaN, numbers) from breaking
         # split(); this mirrors what df_list does in both branches.
         for piece in str(cell).split(sep1):
             item = piece if sep2 is None else piece.split(sep2)[0]
             out_dict[item] = 0
     return myList.basic(out_dict.keys()).sort_list()
示例#2
0
 def permute_taxon_blast(self, hits_num):
     """Return the per-row mean of permuted hit counts for ``hits_num`` hits.

     The permutation counts are cached in a tab-separated file named
     ``<hits_num>.txt`` inside the directory returned by
     ``myIO.dir_os(self.par['dir_out']).create_dir()``.  When that file
     exists it is read back; otherwise the counts are computed from
     ``self.par['binary_aln_df']`` and written to the file.

     Args:
         hits_num: number of hits forwarded to ``permute_list`` and used in
             the cache file name.

     Returns:
         A pandas Series holding, for every row of the counts table, the
         rounded mean of the floored counts across permutations.
     """
     print('permutation of viral blast:{}\t{}'.format(self.par['type'], hits_num))
     outfile = '{}{}.txt'.format(myIO.dir_os(self.par['dir_out']).create_dir(), hits_num)
     if os.path.isfile(outfile):
         print('Read file: ', outfile)
         counts_df = pd.read_csv(outfile, header=0, index_col=0, sep="\t", low_memory=False)
     else:
         counts_df = pd.DataFrame()
         binary_aln_df = self.par['binary_aln_df']
         # 1: permuted peptide names, one column per permutation
         pep_names = list(binary_aln_df.index)
         pep_df = myList.basic(pep_names).permute_list(self.par['permutation_times'], hits_num)
         # 2: collapse every permuted selection and count hits per column
         for col, perm_pep in pep_df.items():
             # .loc replaces the removed DataFrame.ix; the names come from
             # the frame's own index, so label-based lookup is the intent.
             perm_zb = binary_aln_df.loc[perm_pep]
             p_collapse_zb, p_sim_tag = myDataframe.basic(perm_zb).unispecie(self.par['sim_threshold'])
             counts_df[col] = p_collapse_zb.apply(sum, axis=0) + p_sim_tag
         # export so later calls with the same hits_num reuse the result
         counts_df.to_csv(outfile, sep='\t', header=True, index_label=self.par['type'])
     # combine permuted counts
     perm_mean = counts_df.apply(lambda x: np.mean(np.floor(x)), axis=1).round()
     return perm_mean
示例#3
0
 def df_list(self, col_name, sep1=None, sep2=None):
     """Return the unique values found in one column of ``self.df``.

     Without a separator, the distinct cell values are returned unchanged
     (in arbitrary ``set`` order).  With ``sep1``, every cell is converted
     to a string and split on ``sep1``; when ``sep2`` is given as well, each
     piece is split again on ``sep2`` and only its first field is kept.  The
     unique items are then handed to ``myList.basic(...).sort_list()``
     (presumably returning them sorted -- confirm against ``myList``).

     Args:
         col_name: key used to look the column up in ``self.df``.
         sep1: primary separator, or ``None`` to treat cells as atomic.
         sep2: optional secondary separator applied to each ``sep1`` piece.

     Returns:
         A list of unique items; an empty list (after printing a message)
         when the column cannot be read from ``self.df``.
     """
     try:
         inlist = list(self.df[col_name])
     except (KeyError, TypeError):
         # Missing column, or self.df is not subscriptable (e.g. None).
         print('No column name in the data frame:', col_name)
         return []

     if sep1 is None:
         return list(set(inlist))

     # A dict is used as an insertion-ordered set so that the object handed
     # to myList.basic() stays a dict key view, as before.
     out_dict = {}
     for cell in inlist:
         # str() avoids failures on null / non-string cells.
         for piece in str(cell).split(sep1):
             item = piece if sep2 is None else piece.split(sep2)[0]
             out_dict[item] = 0
     # No return-in-finally here: errors raised while collecting or sorting
     # now propagate instead of being silently discarded.
     return myList.basic(out_dict.keys()).sort_list()
 def combine_fa(self, fa_dict):
     """Write one FASTA record per entry of ``fa_dict`` into ``self.biofile``.

     Keys are ordered by ``myList.basic(...).sort_list()``.  For each key the
     sequence is obtained with ``genome(fa_file).read_fa_first()`` and written
     as ``>key`` followed by the sequence on the next line.

     Args:
         fa_dict: mapping of record name to the FASTA file it is read from.
     """
     chrs = myList.basic(fa_dict.keys()).sort_list()
     # The context manager closes the output even when reading a FASTA fails.
     with open(self.biofile, 'wt') as out_obj:
         for chr_name in chrs:
             fa_file = fa_dict[chr_name]
             seq = genome(fa_file).read_fa_first()
             out_obj.write('>{}\n{}\n'.format(chr_name, seq))
             print('\t{}:{}'.format(chr_name, fa_file))
     print('Combine fa files into ', self.biofile)
示例#5
0
 def hits_func(x, peps, threshold, pro_id):
     """Summarise the entries of ``x`` that reach ``threshold``.

     Args:
         x: indexable by a boolean mask and exposing ``.index`` (used like a
             pandas Series -- confirm against the caller).
         peps: iterable of peptide identifiers; converted to strings.
         threshold: minimum value for an entry of ``x`` to count as a hit.
         pro_id: not used by the computation (only referenced by debug
             statements that have since been removed).

     Returns:
         A 3-tuple: number of hits, the value returned by
         ``myList.basic(peps).un_neighbours(hit_peps, return_type='hits_num')``,
         and the hit identifiers joined with commas.
     """
     # significant hits
     significant = x[x >= threshold]
     # identifiers as strings, for the neighbour lookup
     all_peps = list(map(str, peps))
     hit_peps = list(map(str, significant.index))
     none_overlapped_hits_num = myList.basic(all_peps).un_neighbours(
         hit_peps, return_type='hits_num')
     hits_count = len(list(significant.index))
     joined_hits = ','.join(hit_peps)
     return hits_count, none_overlapped_hits_num, joined_hits
示例#6
0
 def __init__(self, par):
     """Store plot data and drawing options taken from ``par``.

     Data source (first match wins):
       * ``par['df']``: a data frame.  Its shape and column names are
         recorded; axis labels default to the first and second column
         names, and axis limits default to
         ``myList.basic(column).min_max()`` of those columns.  The current
         matplotlib figure is cleared.
       * ``par['list']``: stored as-is; labels default to ``'x'``/``'y'``
         and limits to ``None``.
       * neither: ``self.data`` is ``None`` and an error message is
         printed (labels and limits are not set in this case).

     Optional keys read in every case: ``picfile``, ``title``, ``text``,
     ``legend``, ``pch``, ``lty``, ``lwd``.

     Args:
         par: dict of plot parameters as described above.
     """
     if 'df' in par:
         self.data = par['df']
         self.nrow = self.data.shape[0]
         self.ncol = self.data.shape[1]
         self.colnames = self.data.columns
         # Conditional expressions (not dict.get) keep the defaults lazy:
         # they index into the frame and must only run when needed.
         self.xlabel = par['xlabel'] if 'xlabel' in par else list(self.data)[0]
         self.ylabel = par['ylabel'] if 'ylabel' in par else list(self.data)[1]
         # .iloc replaces the removed DataFrame.ix; the defaults refer to
         # the first/second column by position, like the labels above.
         self.xlim = par['xlim'] if 'xlim' in par else myList.basic(
             self.data.iloc[:, 0]).min_max()
         self.ylim = par['ylim'] if 'ylim' in par else myList.basic(
             self.data.iloc[:, 1]).min_max()
         # initiate plot window
         plt.clf()
     elif 'list' in par:
         self.data = par['list']
         self.xlabel = par['xlabel'] if 'xlabel' in par else 'x'
         self.ylabel = par['ylabel'] if 'ylabel' in par else 'y'
         self.xlim = par['xlim'] if 'xlim' in par else None
         self.ylim = par['ylim'] if 'ylim' in par else None
     else:
         self.data = None
         print('Error:No data frame input as long as drawing a plot!')
     # output file
     self.picfile = par['picfile'] if 'picfile' in par else None
     # colour codes cycled through by the plotting methods
     self.col = 'bgrcmykw'
     # remaining drawing options; line styles: solid, dashed, dotted, dashdot
     self.title = par['title'] if 'title' in par else 'Plot'
     self.text = par['text'] if 'text' in par else None
     self.legend = par['legend'] if 'legend' in par else False
     self.pch = par['pch'] if 'pch' in par else 'o'
     self.lty = par['lty'] if 'lty' in par else 'solid'
     self.lwd = par['lwd'] if 'lwd' in par else 1
示例#7
0
    def taxon_blast2(self, file_aln, zscore_file):
        """Count taxon-specific hits per sample and export binomial p-values.

        For every sample column of the z-score table, hits at or above
        ``self.par['specieZ_threshold']`` are selected, reduced with
        ``myList.basic(hits).gen_ind_hits(self.par['dependent_pep'])`` and
        passed to ``myDataframe.basic(...).binom_unispecie(...)``.  Four
        tab-separated files are written, all prefixed with
        ``<zscore file prefix>_<taxon type>_``: ``counting.txt``,
        ``peptides.txt``, ``p-values.txt`` and ``padjusted.txt``
        (Benjamini-Hochberg adjusted p-values via ``multipletests``).

        Args:
            file_aln: alignment file; its name prefix is used as taxon type.
            zscore_file: z-score table read with
                ``myDataframe.basic().standard_df``.
        """
        taxon_type = myIO.file_os(file_aln).name_prefix()
        print("\n{}:{}\n".format(taxon_type, zscore_file))
        # read zscore_df
        zdf = myDataframe.basic().standard_df(zscore_file)

        # read the alignment table, match its row order to the z-scores and
        # replace missing rows with 0
        binary_b = myDataframe.basic().aln_df(file_aln,
                                              self.par['align_score'])
        binary_b = binary_b.reindex(zdf.index).fillna(0)

        # sample names in columns, and specie in rows
        species = list(binary_b)
        samples = list(zdf)
        sum_df = pd.DataFrame(0, index=species, columns=samples)
        pep_df = pd.DataFrame(np.nan, index=species, columns=samples)
        p_df = pd.DataFrame(index=species, columns=samples)

        # .items() replaces DataFrame.iteritems(), removed in pandas 2.0
        for sample_name, column in zdf.items():
            # 1: select peptides -- first drop all non-hits
            hits = column[
                column >= self.par['specieZ_threshold']].copy()  # all hits
            hits.sort_values(axis=0, ascending=False, inplace=True)
            # remove overlapped hits
            nonoverlap_hits = myList.basic(hits).gen_ind_hits(
                self.par['dependent_pep'])
            input_num = len(nonoverlap_hits)
            print("{}:\thits={}, nonoverlapped={}".format(
                sample_name, len(hits), input_num))

            # 2: remove overlap hits between species
            if input_num > 0:
                zb_df = binary_b.loc[nonoverlap_hits.index]
                collapse_zb, sim_tag, p_series = myDataframe.basic(
                    zb_df).binom_unispecie(self.par['dir_ref_seq'], input_num,
                                           self.par['p_threshold'],
                                           self.par['x_threshold'])
                # counts of hits
                sum_df[sample_name] = collapse_zb.apply(sum, axis=0) + sim_tag
                # all peptide_id list
                pep_df[sample_name] = collapse_zb.apply(
                    lambda x: myList.basic(x).names_string(0.001), axis=0)
                p_df[sample_name] = p_series

        # export to file
        file_head = myIO.file_os(
            zscore_file).file_prefix() + '_' + taxon_type + '_'
        sum_df.to_csv(file_head + 'counting.txt',
                      sep='\t',
                      header=True,
                      index_label='Specie')
        pep_df.to_csv(file_head + 'peptides.txt',
                      sep='\t',
                      header=True,
                      index_label='Specie')
        p_df.to_csv(file_head + 'p-values.txt',
                    sep='\t',
                    header=True,
                    index_label='Specie')

        # Adjusted p-values using Benjamini-Hochberg
        padjust_df = pd.DataFrame(index=species, columns=samples)
        for i in p_df.columns:
            # Columns of samples without hits keep the frame's initial
            # object dtype; force float so isfinite() does not raise.
            pvals = np.asarray(p_df[i].values, dtype=float)
            finite = np.isfinite(pvals)
            if finite.any():
                pval_corrected = np.full(pvals.shape, np.nan)
                pval_corrected[finite] = multipletests(pvals[finite],
                                                       method='fdr_bh')[1]
                padjust_df[i] = pval_corrected
        padjust_df.to_csv(file_head + 'padjusted.txt',
                          sep='\t',
                          header=True,
                          index_label='Specie')


#end
示例#8
0
    def taxon_blast(self, file_aln, zscore_file):
        """Count taxon-specific hits per sample from a blast alignment table.

        For every sample column of the z-score table, hits at or above
        ``self.par['specieZ_threshold']`` are selected, reduced with
        ``myList.basic(hits).remove_overlap(self.par['dependent_pep'])`` and
        collapsed with ``myDataframe.basic(...).unispecie(...)``.  Two
        tab-separated files prefixed with ``<zscore file prefix>_<taxon
        type>_`` are written: ``counting.txt`` and ``peptides.txt``.  For
        each sample with hits, the overlap table returned by
        ``remove_overlap`` is also written to
        ``<dir_result><sample>/<taxon type>.csv``; when that path does not
        exist the sample is recorded through ``self.par['file_err']``.

        Args:
            file_aln: alignment file; its name prefix is used as taxon type.
            zscore_file: z-score table read with
                ``myDataframe.basic().standard_df``.
        """
        print(
            '###Signficant taxon by removing overlapped hits based on blast alignment.'
        )
        taxon_type = myIO.file_os(file_aln).name_prefix()
        print('{}: {}'.format(taxon_type, zscore_file))
        # read zscore_df
        zdf = myDataframe.basic().standard_df(zscore_file)

        # read the alignment table, match its row order to the z-scores and
        # replace missing rows with 0
        binary_b = myDataframe.basic().aln_df(file_aln,
                                              self.par['align_score'])
        binary_b = binary_b.reindex(zdf.index).fillna(0)

        # sample names in columns, and specie in rows
        species = list(binary_b)
        samples = list(zdf)
        sum_df = pd.DataFrame(0, index=species, columns=samples)
        pep_df = pd.DataFrame(np.nan, index=species, columns=samples)

        for sample_name, column in zdf.items():
            # 1: select peptides -- first drop all non-hits
            hits = column[
                column >= self.par['specieZ_threshold']].copy()  # all hits
            hits.sort_values(axis=0, ascending=False, inplace=True)
            # remove overlapped hits
            nonoverlap_hits, overlap_debug = myList.basic(hits).remove_overlap(
                self.par['dependent_pep'])
            input_num = len(nonoverlap_hits)
            print('{}: hits={}, nonoverlapped={}'.format(
                sample_name, len(hits), input_num))

            # 2: remove overlap hits between species
            if input_num > 0:
                # 2-1: export peptides
                try:
                    outfile = '{}{}/{}.csv'.format(self.par['dir_result'],
                                                   sample_name, taxon_type)
                    overlap_debug.to_csv(outfile,
                                         header=True,
                                         index_label='peptides')
                except FileNotFoundError:
                    # Best effort: note the sample in the error file and
                    # carry on with the counting.
                    myIO.file_os(self.par['file_err'], "\t").line_replace(
                        {'taxon_blast': sample_name})
                # 2-2: specie-specific hits based on non-overlapped hits.
                # .loc replaces the removed DataFrame.ix; the selection is
                # by peptide label, as in taxon_blast2.
                zb_df = binary_b.loc[nonoverlap_hits.index]
                collapse_zb, sim_tag = myDataframe.basic(zb_df).unispecie(
                    self.par['sim_threshold'])
                # counts of hits
                sum_df[sample_name] = collapse_zb.apply(sum, axis=0) + sim_tag
                # all peptide_id list
                pep_df[sample_name] = collapse_zb.apply(
                    lambda x: myList.basic(x).names_string(0.001), axis=0)

        # export to file
        file_head = '{}_{}_'.format(
            myIO.file_os(zscore_file).file_prefix(), taxon_type)
        sum_df.to_csv(file_head + 'counting.txt',
                      sep='\t',
                      header=True,
                      index_label='Specie')
        pep_df.to_csv(file_head + 'peptides.txt',
                      sep='\t',
                      header=True,
                      index_label='Specie')
示例#9
0
 def count_reads(self):
     """Count reads per reference from the sample's gzipped SAM file.

     Each line is parsed with ``self.analyze_SAM``.  Lines whose
     ``'aligned'`` field is ``'1'`` are counted per reference, ``'3'`` are
     collected per query name as multiple alignments, and everything else
     is written as FASTA to ``<sample_dir><sample_name>_unknown.fa.gz``.

     Side effects:
       * read-number statistics are appended to ``self.par['sample_log']``;
       * combined read counts go to ``self.par['sample_RC_file']``;
       * saturation snapshots go to ``self.par['sample_saturation_file']``;
       * per-reference read sequences and their frequencies go to
         ``<sample_dir>unique_aligned_reads.txt``.
     """
     # key is ref name, value is the list of uniquely aligned read sequences
     unique_seq = {a: [] for a in self.par['ref_dict'].keys()}
     unique = {}  # key is ref name, value is counts
     multiple = {}  # key is query name, value is the list of refs
     num = {}  # counts statistics
     # snapshot per raw-read number: how many refs reached 1/5/10 reads,
     # plus the largest read count seen so far
     saturation = {0: {1: 0, 5: 0, 10: 0, 'max': 0}}
     last_index = 0
     maxRC = 0

     print('\tread sam file: {}.gz'.format(self.par['sample_sam_file']))
     sam_path = self.par['sample_sam_file'] + '.gz'
     unknown_path = self.par['sample_dir'] + self.par['sample_name'] + '_unknown.fa.gz'
     # Context managers close both gzip streams even if parsing fails.
     with gzip.open(sam_path, 'rt') as IN, gzip.open(unknown_path, 'wt') as UN:
         for line in IN:
             num['raw_reads_num'] = num.get('raw_reads_num', 0) + 1
             # analyze sam line
             info = self.analyze_SAM(line)
             qname, ref = info['qname'], info['ref']
             if info['aligned'] == '1':
                 # unique alignment
                 unique[ref] = unique.get(ref, 0) + 1
                 if unique[ref] > maxRC:
                     maxRC = unique[ref]
                 # counting of saturation
                 if unique[ref] in (1, 5, 10):
                     # copy() is essential: snapshots must not share state
                     last_counts = saturation[last_index].copy()
                     last_counts[unique[ref]] += 1
                     # the maximum RC at this number of raw reads
                     last_counts['max'] = maxRC
                     saturation[num['raw_reads_num']] = last_counts
                     last_index = num['raw_reads_num']
                 # keep aligned sequences of reads
                 unique_seq[ref].append(info['seq'])
                 num['unique_aligned_reads_num'] = num.get('unique_aligned_reads_num', 0) + 1
             elif info['aligned'] == '3':
                 # multiple alignment
                 multiple.setdefault(qname, []).append(ref)
                 num['multialigned_reads_num'] = num.get('multialigned_reads_num', 0) + 1
             else:
                 # unaligned
                 UN.write('>' + qname + '\n' + info['seq'] + '\n')
                 num['unaligned_reads_num'] = num.get('unaligned_reads_num', 0) + 1

     # Final saturation snapshot; .get() keeps an empty SAM file from
     # raising KeyError here.
     raw_total = num.get('raw_reads_num', 0)
     if raw_total > last_index:
         saturation[raw_total] = saturation[last_index].copy()

     # update num statistics
     myIO.file_os(self.par['sample_log'], '=').line_add(num)

     print('\tcombine RCs from unique and multiple alignments of ', self.par['sample_name'])
     # reversed multiple
     rev_multiple = myDict.basic(multiple).counting_reversed_dict()
     RC_dict = self.multiple_counts(unique, rev_multiple)
     # export
     print('\tSave read counts into ', self.par['sample_RC_file'])
     myDict.basic(RC_dict).dict2_to_file(self.par['sample_RC_file'], pattern='\t')
     myDict.basic(saturation).dict2_to_file(self.par['sample_saturation_file'], pattern='\t')
     # per-reference summary of the uniquely aligned read sequences
     seq_counts = {}
     for ref, reads_list in unique_seq.items():
         key = ref + '\t' + self.par['ref_dict'][ref] + '\t' + str(len(reads_list))
         if len(reads_list) > 0:
             freq_dict = myList.basic(reads_list).elements_frequency0()
             seq_counts[key] = ';'.join(str(a) + ':' + str(b) for a, b in freq_dict.items())
         else:
             seq_counts[key] = 'NA'
     myDict.basic(seq_counts).dict_to_file(self.par['sample_dir'] + 'unique_aligned_reads.txt', pattern='\t')
示例#10
0
 def __init__(self, dictionary=None, dict_keys=None):
     """Keep a reference to ``dictionary`` and prepare an empty result dict.

     Args:
         dictionary: the mapping to work on; stored as ``self.dict``.
         dict_keys: key order to use.  When omitted, the order comes from
             ``myList.basic(keys).sort_list()``.

     Note:
         ``self.dict_keys`` is only assigned when ``dictionary`` is a
         ``dict`` instance.
     """
     self.dict = dictionary
     if isinstance(dictionary, dict):
         if dict_keys is None:
             # no explicit order supplied: derive one from the keys
             dict_keys = myList.basic(dictionary.keys()).sort_list()
         self.dict_keys = dict_keys
     self.out_dict = {}
示例#11
0
 def __init__(self, dictionary=None):
     """Keep a reference to ``dictionary`` and prepare an empty result dict.

     Args:
         dictionary: the mapping to work on; stored as ``self.dict``.

     Note:
         ``self.sorted_keys`` (the result of
         ``myList.basic(keys).sort_list()``) is only assigned when
         ``dictionary`` is a ``dict`` instance.
     """
     self.dict = dictionary
     if isinstance(dictionary, dict):
         key_view = dictionary.keys()
         self.sorted_keys = myList.basic(key_view).sort_list()
     self.out_dict = {}
示例#12
0
    def QC_saturation(self):
        """Plot saturation and read-count dynamics curves for all samples.

        For every name in ``self.par['sample_names']`` the file
        ``<dir_result><sample>/QC_saturation.txt`` is read.  Two per-sample
        line plots are drawn (reference counts at cutoffs 1/5/10, and the
        ``max`` column), followed by combined plots across samples that are
        saved under ``self.par['dir_QC']``.  The ``row_name`` column is
        divided by 1e6 before plotting (axis label: million raw reads).
        """

        def _shrink_in_millions(frame):
            # Keep the rows chosen by myList's interval_list() and rescale
            # the raw-read column.  .loc replaces the removed DataFrame.ix;
            # .copy() makes the column assignment act on an independent
            # frame rather than a possible view.
            keep = myList.basic(list(frame.index)).interval_list()
            shrunk = frame.loc[keep].copy()
            # 'row_name' is the first column of both frames passed in
            shrunk['row_name'] = shrunk['row_name'] / 1e6
            return shrunk

        print("###saturation analysis\n")
        cutoffs = ['1', '5', '10']
        combined_df = {}
        combined_dynamics = {}
        # plot saturation curve per sample
        for sample_name in self.par['sample_names']:
            file_head = '{}{}/'.format(self.par['dir_result'], sample_name)
            # read saturation file
            df = pd.read_table(file_head + 'QC_saturation.txt',
                               sep="\t",
                               index_col=False)

            # saturation curves
            sample_df = _shrink_in_millions(df[['row_name'] + cutoffs])
            plot_par = {
                'df': sample_df,
                'legend': 'upper left',
                'title': 'Saturation analysis (Sequencing depth)',
                'picfile': file_head + 'QC_saturation_analysis.png',
                'xlabel': 'Number of raw reads (million)',
                'ylabel': 'Number of references'
            }
            myPlot.plot(plot_par).lineP()
            # combine data frames: renumber rows so samples merge by position
            sample_df.index = range(sample_df.shape[0])
            for cutoff in cutoffs:
                sub_df = sample_df[['row_name', cutoff]].copy()
                sub_df.columns = ['raw_reads:' + sample_name, sample_name]
                if cutoff in combined_df:
                    combined_df[cutoff] = pd.merge(combined_df[cutoff],
                                                   sub_df,
                                                   left_index=True,
                                                   right_index=True,
                                                   how='outer')
                else:
                    combined_df[cutoff] = sub_df.copy()

            # dynamics analysis
            sample_df = _shrink_in_millions(df[['row_name', 'max']])
            sample_df.reset_index(drop=True, inplace=True)
            combined_dynamics[sample_name] = sample_df
            plot_par = {
                'df': sample_df,
                'legend': 'upper left',
                'title': 'Saturation analysis:dynamics of read conts',
                'picfile': file_head + 'QC_read_counts_dynamics.png',
                'xlabel': 'Number of raw reads (million)',
                'ylabel': 'Maximum read counts'
            }
            myPlot.plot(plot_par).lineP()

        # export saturated curves
        for cutoff in cutoffs:
            plot_par = {
                'df': combined_df[cutoff],
                'legend': None,
                'title': 'samples={}, RC-cutoff={}'.format(
                    len(self.par['sample_names']), cutoff),
                'picfile': '{}saturation_cuttoff_{}.png'.format(
                    self.par['dir_QC'], cutoff),
                'xlabel': 'Number of raw reads (million)',
                'ylabel': 'Number of references'
            }
            myPlot.plot(plot_par).lineP(x_value=1)

        # export dynamics curves: one (sample, column) pair per column,
        # flattened to "sample:column" names
        combined_dynamics = pd.concat(combined_dynamics, axis=1)
        combined_dynamics.columns = [
            ':'.join(x) for x in list(combined_dynamics)
        ]
        plot_par = {
            'df': combined_dynamics,
            'legend': None,
            'title': 'Sequencing depth,sample={}'.format(
                len(self.par['sample_names'])),
            'picfile': '{}saturation_dynamics.png'.format(self.par['dir_QC']),
            'xlabel': 'Number of raw reads (million)',
            'ylabel': 'Maximum read counts'
        }
        myPlot.plot(plot_par).lineP(x_value=1)