Example #1
 def aln_df(self, infile, align_score):
     print("\tRead align file ", infile)
     file_sep = '\t' if myIO.file_os(infile).name_suffix() == 'txt' else ','
     aln_df = myIO.file_os(infile, file_sep).flat_file_to_df([0,1,11], True)
     #print aln_df
     #convert to binary matrix
     binary_b = pd.DataFrame(np.where(aln_df >= align_score, 1, 0))
     #print np.sum(binary_b)
     binary_b.index = [str(x).split('_')[1] for x in list(aln_df.index)]
     binary_b.columns = [re.sub(',', ';', str(x)) for x in list(aln_df)]
     binary_b.fillna(0, inplace=True)
     #print binary_b.ix['Rnl2_SPIKEIN']
     #print list(binary_b.index)[:10]
     '''
     ##remove subset species, of which alignment score <1
     #calculate shared probes
     taxon_sums = binary_b.apply(sum, axis=0)
     taxons = list(binary_b)
     shared_prob=pd.DataFrame(index=taxons, columns=taxons)
     for taxon, col in binary_b.iteritems():
         shared_prob[taxon]=binary_b.apply(lambda x, y=col: np.dot(y,x), axis=0)/taxon_sums
         shared_prob[taxon][taxon]=0
     #
     tags_df=pd.DataFrame(index=taxons, columns=taxons)
     for taxon, col in shared_prob.iteritems():
         row=shared_prob.loc[taxon]
         tags_df[taxon]=[1 if a!=1 and b==1 else 0 for a, b in zip(col,row)]
     sum_tags=tags_df.apply(max, axis=0)
     reserved_taxons=list(sum_tags[sum_tags==0].index)
     binary_b=binary_b[reserved_taxons]
     '''
     return binary_b
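
A minimal stand-alone sketch of the binarization step above, using made-up scores in place of the myIO alignment table (the real method also rewrites the row and column labels afterwards):

    import numpy as np
    import pandas as pd

    #toy probe-by-taxon alignment scores
    scores = pd.DataFrame({'taxonA': [5.0, 0.0, 2.5], 'taxonB': [1.0, 3.0, 0.0]},
                          index=['probe_1', 'probe_2', 'probe_3'])
    #np.where returns a bare array, so the labels have to be put back explicitly
    binary_b = pd.DataFrame(np.where(scores >= 2, 1, 0),
                            index=scores.index, columns=scores.columns)
    print(binary_b)
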
Example #2
    def match_ensembl_fa(self, gtf_file):
        #get order of chr
        ref_arr = self.fa_displayid()
        #print ref_arr

        #export matched gtf
        file_head = myIO.file_os(self.biofile).file_prefix()
        out_gtf = file_head + '.' + myIO.file_os(gtf_file).name_suffix()
        out_obj = open(out_gtf, 'wt')  #out_gtf_file
        print('Match the first column of {} with {}, => {}'.format(
            gtf_file, self.biofile, out_gtf))
        for chr_id in ref_arr:
            n = 0
            #read gtf
            in_obj = self.readonly_handle(gtf_file)
            for line in in_obj:
                if not line.startswith('#'):
                    items = line.split("\t")
                    seqid = items[0]
                    if chr_id == seqid:
                        out_obj.write(line)
                        n += 1
            in_obj.close()
            print('{}:{}'.format(chr_id, n))
        out_obj.close()
Example #3
    def match_ncbi_fa(self, gtf_file):
        #read matched information from gtf_file
        seqid_chr = genome(gtf_file).read_ncbi_gff('seqid', 'chromosome')
        #out_gtf_file
        file_head = myIO.file_os(self.biofile).file_prefix()
        out_gtf = file_head + '.' + myIO.file_os(gtf_file).name_suffix()
        out_obj = open(out_gtf, 'wt')
        #get order of chr
        ref_arr = self.fa_displayid()

        #export matched gtf
        print('Match the first column of {} with {}, => {}'.format(
            gtf_file, self.biofile, out_gtf))
        for chr_id in ref_arr:
            chromosome = re.sub('chr', '', chr_id, flags=re.IGNORECASE)
            n = 0
            #read gtf
            in_obj = self.readonly_handle(gtf_file)
            for line in in_obj:
                if not line.startswith('#'):
                    items = line.split("\t")
                    seqid = items[0]
                    if seqid in seqid_chr and seqid_chr[seqid] == chromosome:
                        items[0] = chr_id
                        myline = "\t".join(items)
                        out_obj.write(myline)
                        n += 1
            in_obj.close()
            print('{}:{}'.format(chr_id, n))
        out_obj.close()
Example #4
 def collapse_matrix(self, pars):
     infile, outfile, collapse_func=pars
     print('t:', outfile)
     #read counts_file: pep_df
     file_sep = '\t' if myIO.file_os(infile).name_suffix() == 'txt' else ','
     pep_df = pd.read_csv(infile, header=0, index_col=0, sep=file_sep, low_memory=False)
     #pep_df.index=pep_df.index.astype(str)
     sample_names = list(pep_df)
     #print(sample_names)
     #both column and row names should be string type
     pep_df.columns = pep_df.columns.astype(str)
     pep_df.index = pep_df.index.astype(str)
     #combined pep_df with protein annotation
     pep_pro = self.par['annot_df'][['pep_id', 'pro_id']]
     #pep_pro['pep_id']=pep_pro['pep_id'].astype(str)
     combined_df = pd.merge(pep_df, pep_pro, how='inner', left_index=True, right_on='pep_id')
     #group by protein id
     group_dict = combined_df.groupby(['pro_id'], as_index=False).groups
     collapse = dict()
     for protein_id, row_names in group_dict.items():
         #row_names index combined_df, which also carries the sample columns
         subdf = combined_df.loc[row_names, sample_names]
         collapse[protein_id] = subdf.apply(collapse_func, axis=0)
         #if protein_id =='A0A126':
             #print subdf[['CTLA4.BEADS_ONLY.BEADS_ONLY.BEADS_ONLY.20A20G.1', 'CTLA4.BEADS_ONLY.BEADS_ONLY.BEADS_ONLY.20A20G.2']]
     #convert to data frame and transpose
     cdf = pd.DataFrame(collapse).transpose()
     #keep the sample columns in their original order and round to one decimal
     cdf = np.round(cdf[sample_names], 1)
     #reorder rows by the full protein id list
     cdf = cdf.loc[self.par['pro_ids']]
     #print cdf
     #export
     file_sep = '\t' if myIO.file_os(outfile).name_suffix() == 'txt' else ','
     cdf.to_csv(outfile, index_label='pro_id', sep=file_sep)
     return cdf
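
A compact sketch of the merge-and-collapse pattern above, with toy frames standing in for pep_df and the annotation and with sum standing in for collapse_func:

    import pandas as pd

    pep_df = pd.DataFrame({'s1': [3, 5, 2], 's2': [1, 0, 4]},
                          index=['pep1', 'pep2', 'pep3'])         #peptide x sample counts
    pep_pro = pd.DataFrame({'pep_id': ['pep1', 'pep2', 'pep3'],
                            'pro_id': ['proA', 'proA', 'proB']})  #peptide-to-protein map
    combined = pd.merge(pep_df, pep_pro, how='inner',
                        left_index=True, right_on='pep_id')
    #collapse each protein's peptide rows column-wise
    collapsed = combined.groupby('pro_id')[['s1', 's2']].sum()
    print(collapsed)
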
Example #5
    def phipseq_alignment(self, sample_name):
        print('\n######Analysis of {} will be triggered!#####'.format(
            sample_name))
        #initiate sample par
        sample_var = dict(self.par)
        sample_var['start_time'] = time.time()
        #sample name
        sample_var['sample_name'] = sample_name
        #sample directory
        sample_dir = self.par['sample_dirs'][sample_name]
        sample_var['sample_dir'] = myIO.dir_os(sample_dir).create_dir()
        print('\tSample directory: ', sample_var['sample_dir'])
        #raw data
        sample_var['sample_raw_files'] = ','.join(
            sample_var['sample_to_raw'][sample_name])
        print('\tRaw files: ', sample_var['sample_raw_files'])
        #export
        sample_var['file_head'] = sample_var['sample_dir'] + sample_name
        #default same file
        sample_var['sample_sam_file'] = sample_var['file_head'] + '.sam'
        #file of read counts
        sample_var['sample_RC_file'] = sample_var['file_head'] + '_RC.txt'
        sample_var['sample_pro_sumRC_file'] = sample_var[
            'file_head'] + '_pro_sumRC.txt'
        sample_var['sample_pro_maxRC_file'] = sample_var[
            'file_head'] + '_pro_maxRC.txt'
        #file for saturation analysis
        sample_var['sample_saturation_file'] = sample_var[
            'file_head'] + '_saturation.txt'
        #sample log
        sample_var['sample_log'] = sample_var['file_head'] + '.log'

        #sequence alignment
        if sample_var['phip_alignment'] == 'yes':
            print("\n###sequence alignment", sample_var['tool_aligner'])
            #output is sam file
            if sample_var['tool_aligner'] == 'bowtie1':
                myAlign.alignment(sample_var).bowtie1_alignment()

        #counts reads
        if sample_var['phip_counting'] == 'yes':
            #RC matrix by peptides
            myAlign.alignment(sample_var).count_reads()
            #RC matrix by proteins
            if 'file_annotation' in self.par.keys():
                self.combine_peptides(sample_var)

        #update sample log
        sample_times = mySystem.system().get_time(sample_var['start_time'])
        sample_times['sample_name'] = sample_name
        myIO.file_os(sample_var['sample_log'], '=').line_replace(sample_times)
Example #6
 def download_annot(self,genome_type):
     #print genome_type
     url = self.url[genome_type]
     #get html and the list of files
     url_dir, url_files = web(url).ls_html()
     #print url_files
     #download and decompress genome files
     local_chr_files = {}
     for file_url in url_files.keys():
         gz_file = myIO.file_os(file_url).download(self.out_dir)
         #decompress file
         ungz_file = myIO.file_os(gz_file).decompress_gz()
         local_chr_files[file_url]=ungz_file
     return local_chr_files
Example #7
    def file_to_samples(self):
        #get all fastq files
        raw_files = self.seek_fq(self.par['dir_raw_data'])
        print('Number of raw files:', len(raw_files))
        #read sample info file
        print('Read sample file: ', self.par['file_sample_info'])
        in_obj = open(self.par['file_sample_info'], 'rt')

        #set connections between raw data and sample_name
        for line in in_obj:
            line = line.rstrip("\n")
            items = line.split(',')
            raw_file_name = items[0]
            sample_name = items[1]
            #print prefix
            for raw_file in raw_files:
                file_name = myIO.file_os(raw_file).file_name()
                #print file_name
                if file_name.find(raw_file_name) == 0:
                    #dict: raw_sample
                    self.raw_sample[raw_file] = sample_name
                    #dict: sample_raw
                    if sample_name in self.sample_raw:
                        self.sample_raw[sample_name].append(raw_file)
                    else:
                        self.sample_raw[sample_name] = [raw_file]   
        in_obj.close()
Example #8
 def protein_peptides(self):
     pro_pep = {}
     #read annotation file
     annot_dict = myIO.file_os(self.par['file_annotation'], "\t").to_dict2()
     if 'Rnl2_SPIKEIN' in annot_dict: 
         annot_dict['Rnl2_SPIKEIN']['pep_rank'] = 0
     in_pro = [annot_dict[p]['pro_id'] for p in annot_dict.keys()]
     in_pro = list(set(in_pro))
     print('In proteins:{}, In peptides:{}'.format(len(in_pro), len(annot_dict)))
         
     ##
     pro_rank_pep = {}
     for pep_id in self.par['pep_ids']:
         pro_id = annot_dict[pep_id]['pro_id']
         pep_rank = annot_dict[pep_id]['pep_rank']
         #ranks come from a text file as strings, so coerce to int when possible
         try:
             pep_rank = int(pep_rank)
         except (TypeError, ValueError):
             pep_rank = 0
         if pro_id in pro_rank_pep:
             pro_rank_pep[pro_id][pep_id] = pep_rank
         else:
             pro_rank_pep[pro_id] = {pep_id:pep_rank}
             #print pro_rank_pep[pro_id]
     #
     pep_num = 0
     for pro_id, pep_dict in pro_rank_pep.items():
         #print sorted(pep_dict.keys())
         peps = sorted(pep_dict, key=pep_dict.get)
         pep_num += len(peps)
         pro_pep[pro_id] = ','.join(peps)
     #export
     print("Number of protein:{}\tNumber of peptides:{}.".format(len(pro_pep.keys()), pep_num))
     myDict.basic(pro_pep, self.par['pro_ids']).dict_to_file(self.par['file_pro_pep'], "\t")
     #
     return pro_pep
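
A toy run of the rank-then-join step, with invented peptide ranks in place of the annotation values:

    pro_rank_pep = {'proA': {'pep3': 2, 'pep1': 1}, 'proB': {'pep9': 1}}
    pro_pep = {}
    for pro_id, pep_dict in pro_rank_pep.items():
        #sort peptide ids by rank, then join them into one comma-separated string
        peps = sorted(pep_dict, key=pep_dict.get)
        pro_pep[pro_id] = ','.join(peps)
    print(pro_pep)   #{'proA': 'pep1,pep3', 'proB': 'pep9'}
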
Example #9
 def download_annot(self,genome_type):
     #print genome_type
     url = self.url[genome_type]
     #get html
     lines = web(url).get_html()
     #get the list of files
     chr_files = self.single_file(lines)
     print(chr_files)
     #download and decompress genome files
     local_chr_files = {}
     for key in chr_files.keys():
         gz_file = myIO.file_os(url+chr_files[key]).download(self.out_dir)
         #decompress file
         ungz_file = myIO.file_os(gz_file).decompress_gz()
         local_chr_files[key] = ungz_file
     return local_chr_files
Example #10
    def shrink_fq(self, par):
        #read relationship between barcode vs sample from sample_file
        barcode_sample = myIO.file_os(par['barcode_file'], '\t').to_dict()
        #print barcode_sample

        #file handle
        m = 0
        n = 0
        F1 = self.readonly_handle(self.biofile)
        out1 = open(self.biofile + '.shrink', 'wt')
        F2 = self.readonly_handle(par['index_file'])
        out2 = open(par['index_file'] + '.shrink', 'wt')
        with F1, F2:
            #read 4 lines at a time per file
            for L1, La, L2, Lb, L3, Lc, L4, Ld in itertools.zip_longest(
                    *[F1, F2] * 4):
                barcode = Lb.rstrip()
                #assign record based on barcode
                if barcode in barcode_sample.keys():
                    #output file handle
                    out1.writelines([L1, L2, L3, L4])
                    out2.writelines([La, Lb, Lc, Ld])
                    m += 1
                n += 1
        F1.close()
        F2.close()
        out1.close()
        out2.close()
        print("{}->{}({})".format(n, m, m / n))
Example #11
 def QC_statistics(self):
     print("###Quality control: statistics summary")
     #print(self.par['sample_names'])
     #print(self.par['dir_result'])
     stat_dict = collections.defaultdict(dict)
     for sample_name in self.par['sample_names']:
         sample_log = '{}{}/{}.log'.format(self.par['dir_result'], sample_name, sample_name)
         stat_dict[sample_name] = myIO.file_os(sample_log, '=').to_dict()
     #convert to data frame
     stat_df = pd.DataFrame(stat_dict)
     stat_df = stat_df.transpose()
     
     #1: scatter plot 1
     sub_df = stat_df[['raw_reads_num', 'unique_aligned_reads_num']].astype(float)/1e6
     #print sub_df
     plot_par={'df':sub_df, 'title':'raw_reads_vs_aligned_reads', 
               'picfile':self.par['dir_QC'] + 'raw_reads_vs_aligned_reads.png',
               'pch':'o', 'text':'million reads'}
     myPlot.plot(plot_par).dotP()
     #2: scatter plot 2
     stat_df['unique_aligned_percentage'] = sub_df['unique_aligned_reads_num']*100/sub_df['raw_reads_num']
     plot_par['df'] = stat_df[['raw_reads_num','unique_aligned_percentage']].astype(float)
     plot_par['title'] = 'percentage_aligned_reads'
     plot_par['picfile'] = self.par['dir_QC'] + 'percentage_aligned_reads.png'
     myPlot.plot(plot_par).dotP()
     #3: export to csv file
     print('\tSave statistical summary into {}.'.format(self.par['file_stat']))
     stat_df.to_csv(self.par['file_stat'], index_label='sample_names')
Example #12
    def sig_polyclonal(self, count_file):
        #count_file = args_tuple
        print("Polyclonal analysis of ", count_file)
        comb_df, pep_df = myCommon.basic(self.par).combine_df(count_file)

        #functions
        def hits_func(x, peps, threshold, pro_id):
            #significant hits
            hits = x[x >= threshold]
            #non_overlapping peptides
            peps = [str(x) for x in peps]
            hit_peps = [str(x) for x in hits.index]
            none_overlapped_hits_num = myList.basic(peps).un_neighbours(
                hit_peps, return_type='hits_num')
            #if none_overlapped_hits_num>1: print "%d,%d" %(len(list(hits.index)), none_overlapped_hits_num)
            #if len(hit_peps)>0: print pro_id, peps, hit_peps
            #if pro_id == 'Q9YLJ1': print pro_id, peps, hit_peps
            return len(list(
                hits.index)), none_overlapped_hits_num, ','.join(hit_peps)

        #collapse by protein
        hits1 = {}
        hits2 = {}
        #n = 1
        for pro_id, row_index in comb_df.groupby('pro_id').groups.items():
            #row is protein id
            ##get protein-peptides annotations
            peps_str = self.par['dict_pro_pep'][pro_id]
            peps = peps_str.split(',')
            #df by protein
            sub_df = pep_df.loc[row_index]
            #print("{}\t{}".format(pro_id, list(sub_df.index)) )
            #hits num beyond zscore threshold
            hits_num = sub_df.apply(hits_func,
                                    axis=0,
                                    args=(peps, self.par['zscore_threshold'],
                                          pro_id))
            #if pro_id == 'Q9YLJ1': print hits_num
            #all number of significant hits
            num1 = [h[0] for h in hits_num]
            hits1[pro_id] = dict(zip(list(sub_df), list(num1)))
            #number of sig hits without overlapping
            num2 = [h[1] for h in hits_num]
            hits2[pro_id] = dict(zip(list(sub_df), list(num2)))
            #if (np.sum(num1))>10:
            #pd.set_option('display.max_columns', None)
            #pd.set_option('display.max_rows', None)
            #print np.matrix(np.round(sub_df))
            #print num1
            #print num2
            #n+ = 1
            #if n == 10: break

        #export
        file_head = myIO.file_os(count_file).file_prefix() + '_polyclonal'
        myDict.basic(hits1, self.par['pro_ids']).dict2_to_file(
            file_head + '.txt', "\t")
        myDict.basic(hits2, self.par['pro_ids']).dict2_to_file(
            file_head + '_nonoverlapped.txt', "\t")
Example #13
 def annot_df(self, infile):
     file_sep = '\t' if myIO.file_os(infile).name_suffix() == 'txt' else ','
     annot_df = pd.read_csv(infile, header=0, index_col=None, sep=file_sep, low_memory=False)  
     #both column and row names should be string type
     annot_df.index = annot_df['pep_id']
     annot_df.index = annot_df.index.astype(str)
     #annot_df.columns=self.par['annot_df'].columns.astype(str)
     return annot_df
Example #14
 def __init__(self, biofile=None, sep=None):
     self.biofile = biofile
     #separator character
     if sep is None:
         self.sep = ',' if myIO.file_os(
             self.biofile).name_suffix() == 'csv' else "\t"
     else:
         self.sep = sep
     self.record_num = 0
Example #15
 def extract_annot(self, left_column, right_column, FUN):
     annot_dict = {}
     #read annotation file
     annot_df = myIO.file_os(self.par['file_annotation'], sep="\t").to_df(header=True, rowname=False)
     for index, row in annot_df.iterrows():
         key = row[left_column]
         value = FUN(row[right_column])
         annot_dict[key] = value
         #print "%s:%s" % (key, value)
     return annot_dict
Example #16
 def QC_hits(self, infile, threshold=None):
     print('###Relationship between significant hits and raw read num of ', infile)
     file_prefix = '{}{}_'.format(self.par['dir_QC'], myIO.file_os(infile).name_prefix())
     if threshold is None: threshold = float(self.par['zscore_threshold'])
     #read statistics file
     stat_df = pd.read_table(self.par['file_stat'], sep=",", index_col=0, low_memory=False)
     stat_df.index = stat_df['sample_name'] #assign row names
     stat_df = stat_df.reindex(self.par['sample_names'])  #order rows by sample_names
     raw_reads = stat_df['raw_reads_num']/1e6
     #print stat_df[['sample_name','raw_reads_num']]
     #read values file
     in_df = pd.read_table(infile, sep="\t", index_col=0, low_memory=False)#rownames and colnames
     order_df = in_df[self.par['sample_names']].copy()#order columns
     #print(order_df.shape)
     
     #plot of raw reads vs number of hits
     #print list(order_df)
     def func1(x,y=threshold):
         sig = x[x>=y]
         return len(sig)
     hits_num = order_df.apply(func1, axis=0)
     #get compared df
     comp_df = pd.DataFrame({'A': raw_reads, 'B': hits_num})
     comp_df.to_csv(file_prefix+'raw_vs_sighits.csv', sep=',')
     #plot
     plot_par={'df':comp_df, 'legend':None,
               'title': 'Effects of sequencing depth on significant hits',
               'picfile': file_prefix + 'raw_vs_sighits.png',
               'xlabel':'Number of raw reads (million)',
               'ylabel':'Number of significant hits'}
     myPlot.plot(plot_par).dotP()
     
     #plot of raw reads vs mean values of hits
     #print list(order_df)
     def func2(x,y=threshold):
         x = pd.Series(x)
         #print list(x)
         sig = x[x>=y]
         #print list(sig)
         sig_mean = np.mean(sig)
         return sig_mean
     hits_mean = order_df.apply(func2, axis=0)
     #print hits_mean
     #get compared df
     comp_df = pd.DataFrame({'A': raw_reads, 'B': hits_mean})
     outfile=file_prefix+'raw_vs_mean_significant_hits.csv'
     print('\texport QC to {}.'.format(outfile))
     comp_df.to_csv(outfile, sep=',')
     #plot
     plot_par={'df':comp_df, 'legend':None,
               'title': 'Effects of sequencing depth on significant hits',
               'picfile': file_prefix + 'raw_vs_mean_significant_hits.png',
               'xlabel':'Number of raw reads (million)',
               'ylabel':'Mean values of significant hits'}
     myPlot.plot(plot_par).dotP()
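
The per-column hit counting done by func1 above, in isolation on a made-up zscore table:

    import pandas as pd

    zscores = pd.DataFrame({'s1': [0.5, 12.0, 3.1], 's2': [9.9, 0.0, 15.2]})
    threshold = 10
    #count, for every sample column, how many values reach the threshold
    hits_num = zscores.apply(lambda x: (x >= threshold).sum(), axis=0)
    print(hits_num)   #s1: 1, s2: 1
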
Example #17
    def normalize_by_factor(self, annot_file, col_index, col_factor='pro_len'):
        #read RC file
        #RC_df=pd.read_table(self.biofile, sep=self.sep, index_col=True)
        RC_df = myIO.file_os(self.biofile, sep=self.sep).to_df(header=True,
                                                               rowname=True)
        #print(RC_df.shape)

        #read annotation file
        #annot_df=pd.read_table(annot_file, sep="\t", index_col=True)
        annot_df = myIO.file_os(annot_file, sep="\t").to_df(header=True,
                                                            rowname=True)
        #print list(annot_df)
        sub_annot = annot_df[[col_index,
                              col_factor]].drop_duplicates([col_index],
                                                           keep='last')
        sub_annot.index = list(sub_annot[col_index])
        sub_annot = sub_annot.iloc[:, 1:]  #remove the column with col_index
        #print sub_annot

        #sort annot_df by row names of RC_df
        sub_annot = sub_annot.reindex(list(RC_df.index))
        #for missing proteins, pro_len equal ave-pro_len
        ave_pro_len = np.mean(sub_annot[col_factor])
        pro_len_df = sub_annot.fillna(ave_pro_len)
        #print(pro_len_df.shape)

        #normalization by aa length of proteins
        RC_df.insert(0, col_factor, list(pro_len_df[col_factor]))
        normRC_df = RC_df.apply(lambda x: x[1:] / x[0], axis=1)

        #scaling normalization by million reads
        def norm_func(x):
            sum_x = np.sum(x)
            norm_x = x * 10e6 / sum_x if sum_x > 0 else x
            norm_x = np.round(norm_x)
            norm_x = norm_x.astype(int)
            return norm_x

        normRC_df = normRC_df.apply(norm_func, axis=0)
        #print normRC_df

        return normRC_df
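
A condensed numeric sketch of the two normalization steps, with toy counts and a 'len' column standing in for col_factor (the per-row division is written with .iloc here):

    import numpy as np
    import pandas as pd

    RC_df = pd.DataFrame({'len': [100, 200], 's1': [10, 40], 's2': [0, 8]},
                         index=['proA', 'proB'])
    #step 1: divide each sample count by the length factor in column 0
    norm = RC_df.apply(lambda x: x.iloc[1:] / x.iloc[0], axis=1)
    #step 2: scale every sample column to a fixed total (10e6, as above)
    def norm_func(x):
        sum_x = np.sum(x)
        return np.round(x * 10e6 / sum_x).astype(int) if sum_x > 0 else x
    norm = norm.apply(norm_func, axis=0)
    print(norm)
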
Example #18
 def standard_df(self, infile, fill=True):
     #read txt file
     sep = ',' if myIO.file_os(infile).name_suffix() == 'csv' else '\t'
     stand_df = pd.read_table(infile,header=0, index_col=0,sep=sep,low_memory=False)
     #string of row names
     stand_df.index = [str(x) for x in list(stand_df.index)]
     stand_df.columns = [str(x) for x in list(stand_df)]
     #replace NAN
     if fill == True:
         stand_df.fillna(0, inplace=True)
     return stand_df
Example #19
    def NC_whole_std(self):
        print('\tPolynomial regression of std~median across ALL BEADS-ONLY.')
        file_prefix = '{}{}_'.format(self.par['dir_result'], myIO.file_os(self.par['file_NC']).name_prefix())
        norm_ncfile = file_prefix+'scalingRC.txt'
        if os.path.isfile(norm_ncfile):
            phip_nc = pd.read_csv(norm_ncfile, sep='\t', index_col=0, low_memory=False)
        else:
            phip_nc = normalization(self.par, self.par['file_NC'], norm_ncfile).RC_scaling()
        #print(phip_nc.shape)
        
        #summary of nc: mean and std
        NC=pd.DataFrame({'mean':phip_nc.mean(axis=1), 'median':phip_nc.median(axis=1), \
                            'std':phip_nc.std(axis=1), 'sum':phip_nc.sum(axis=1)})
        NC.loc[NC['median']==0, 'median'] = np.nan
        NC.loc[NC['std']==0, 'std'] = np.nan
        NC['logmedian'] = np.log10(NC['median'])
        NC['logstd'] = np.log10(NC['std'])
        #NC=NC.replace([np.inf, -np.inf], -10) #an extreme small value
        #
        #initiate reg_df for regression
        #fill out outliers
        reg_df = NC.loc[(NC['median']>0),:].copy()
        #order for polynomial regression
        reg_df = reg_df.sort_values(['logmedian'], ascending=True)

        #polynomial regression
        formula = 'logstd~logmedian+I(logmedian**2)+I(logmedian**3)'
        pn_model = smf.ols(formula, data=reg_df)
        pn_fit = pn_model.fit()
        #print(pn_fit.params)
        reg_df['pred_logstd'] = pn_fit.predict()
        reg_df['pred_std'] = 10**pn_fit.predict()
        NC['pred_logstd'] = pn_fit.predict({'logmedian':NC['logmedian']})
        NC['pred_std'] = 10**NC['pred_logstd']
         
        #refresh total log
        #params=dict(pn_fit.params)
        #NC_dict = dict([('polynomial_NC_std:' + x, params[x]) for x in params.keys()])
        #myIO.file_os(self.par['file_total_log'], '=').line_replace(NC_dict)
        #export fitting of std
        NC.to_csv(file_prefix+'polynomial_std.csv', header=True, index_label='row_names')
        #draw graph
        xm=round(np.nanmax(list(NC['logmedian'])))
        ym=round(np.nanmax(list(NC['logstd'])))
        plot_par={'df': NC[['logmedian','logstd']], 'xlim':(-.5,xm), 'ylim':(-.5,ym),\
                  'picfile':file_prefix+'polynomial_std.png', 'text':pn_fit.params }
        try:
            myPlot.plot(plot_par).regressionP(reg_df['logmedian'], reg_df['pred_logstd'])
        except ValueError:
            print('Failed to draw the plot and save it to {}'.format(plot_par['picfile']))
        
        #return fitting model object
        return NC, pn_fit
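
A stripped-down sketch of the cubic std~median fit, using synthetic numbers in place of the scaled BEADS-ONLY counts:

    import numpy as np
    import pandas as pd
    import statsmodels.formula.api as smf

    rng = np.random.default_rng(0)
    median = rng.uniform(1, 1000, size=200)
    std = median**0.7 * rng.uniform(0.8, 1.2, size=200)   #made-up mean/variance trend
    reg_df = pd.DataFrame({'logmedian': np.log10(median), 'logstd': np.log10(std)})

    #cubic polynomial on the log scale, same formula as above
    pn_fit = smf.ols('logstd ~ logmedian + I(logmedian**2) + I(logmedian**3)',
                     data=reg_df).fit()
    reg_df['pred_std'] = 10**pn_fit.predict(reg_df[['logmedian']])
    print(pn_fit.params)
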
Example #20
    def init_aligner_par(self):
        if self.par['tool_aligner'] == 'bowtie1':
            self.par['bowtie_aligner'] = self.par['dir_aligner'] + 'bowtie'
            self.par['bowtie_builder'] = self.par['dir_aligner'] + 'bowtie-build'
        elif self.par['tool_aligner'] == 'bowtie2':
            self.par['bowtie_aligner'] = self.par['dir_aligner'] + 'bowtie2'
            self.par['bowtie_builder'] = self.par['dir_aligner'] + 'bowtie2-build'
        #print self.par['bowtie_aligner'], self.par['bowtie_builder']

        #bowtie index
        self.par['bowtie_index_name'] = myIO.file_os(self.par['file_ref_fa']).file_prefix()
        self.par['bowtie_index'] = self.par['dir_aligner'] + self.par['bowtie_index_name']
Example #21
 def export_df(self, outfile, threshold=10, index_label='row_names'):
     print('\texport data frame to ', outfile)
     outsep = ',' if outfile.endswith('.csv') else '\t'
     self.df.to_csv(outfile, sep=outsep, index_label=index_label) 
     
     #draw a scatterplot
     counts = self.df.apply(lambda x, y=threshold: len(x[x>=y]), axis=0)
     #print counts
     plot_par={'list':counts, 'ylabel':'Sample_names', 'xlabel':'Number of hits', 
               'picfile': myIO.file_os(outfile).file_prefix()+'.png',
               'title': 'Number of hits, threshold='+str(threshold) }
     myPlot.plot(plot_par).simple_barh()
Example #22
 def raw_to_samples(self):
     #get all fastq files
     raw_files = self.seek_fq(self.par['dir_raw_data'])
     #print raw_files
     
     #connect raw file to sample name
     for raw_file in raw_files:
         sample_name = myIO.file_os(raw_file).name_prefix()
         self.raw_sample[raw_file] = sample_name
         if sample_name in self.sample_raw:
             self.sample_raw[sample_name].append(raw_file)
         else:
             self.sample_raw[sample_name] = [raw_file]
Example #23
 def sample_info(self):
     sample_pairs = {}
     for raw_file, sample_name in self.raw_sample.items():
         raw_file_name = myIO.file_os(raw_file).file_name()
         group = 'NC' if 'BEADS' in raw_file_name.upper() else 'PhIP'
         if not 'unassigned' in raw_file_name:
             sample_name = re.sub('_R1', "", sample_name)
             pair = '{},{}'.format(raw_file_name, sample_name)
             sample_pairs[pair]=group
     #export dict to file
     print('Generate sample file: ', self.par['file_sample_info'])
     #order per record: fastq file name, sample_name, phip_group
     myDict.basic(sample_pairs).dict_to_file(self.par['file_sample_info'], ',')
Example #24
    def combine_df(self, counts_file, annot_index='pep_id'):
        #read count file
        file_sep = '\t' if myIO.file_os(
            counts_file).name_suffix() == 'txt' else ','
        counts_df = pd.read_table(counts_file,
                                  sep=file_sep,
                                  index_col=0,
                                  low_memory=False)
        counts_df.index = [str(x) for x in counts_df.index]
        #print 'counts:', counts_df.shape
        #print list(counts_df.index)[:20]

        #read annotation file
        file_sep = '\t' if myIO.file_os(
            self.par['file_annotation']).name_suffix() == 'txt' else ','
        annot_df = pd.read_table(self.par['file_annotation'],
                                 sep=file_sep,
                                 index_col=None,
                                 low_memory=False)
        annot_df.index = [str(x) for x in annot_df[annot_index]]
        #print 'annot:', annot_df.shape
        #print list(annot_df[annot_index])[:20]

        #combine by rows
        comb_df = pd.merge(annot_df,
                           counts_df,
                           left_index=True,
                           right_index=True,
                           how='inner')
        comb_df.index = list(comb_df[annot_index])
        #comb_df=comb_df.rename(columns={self.par['protein_assoc']:'pro_id'})
        #print comb_df[['pep_id','row_name']]
        #print comb_df.shape
        #sample df
        sample_df = comb_df[self.par['sample_names']]
        sample_df.index = list(comb_df[annot_index])
        return (comb_df, sample_df)
Example #25
 def download_idmapping(self):
     #get web file list
     url_idmapping = self.url+'knowledgebase/idmapping/by_organism/'
     web_dir, web_files = web(url_idmapping).ls_html()
     #print web_files
     
     #select file
     #filter() returns an iterator in Python 3, so build a sorted list explicitly
     file_names = sorted(x for x in web_files.values() if '.dat.' in x)
     file_name = mySystem.system().select_key(file_names, 'Select web file')
     #download idmapping dat file
     url_file = url_idmapping + file_name
     local_file = self.out_dir + file_name
     web(url_file).download_file(local_file)
     #decompress file
     ungz_file = myIO.file_os(local_file).decompress_gz()
     print('Save ', url_file, ' as ', ungz_file)
     return ungz_file
Example #26
 def download_dna(self):
     url = self.url['dna_fa']
     #get genome files
     #get html
     lines = web(url).get_html()
     chr_files = self.dna_files(lines)
     
     #download and decompress genome files
     local_chr_files = {}
     for key in chr_files.keys():
         self.ver = re.sub(r"\.chromosome.*", '', chr_files[key])
         gz_file = myIO.file_os(url+chr_files[key]).download(self.out_dir)
         #decompress file
         #ungz_file=myIO.file_os(gz_file).decompress_gz()
         local_chr_files[key] = gz_file
     #combine fa files
     out_file = self.out_dir+self.ver+'.fa'
     #print out_file
     myGenome.genome(out_file).combine_fa(local_chr_files)
     return local_chr_files, out_file
Example #27
 def download_dna(self):
     #get html
     lines = web(self.url['dna_fa']).get_html()
     chr_files = self.dna_files(lines)
     
     #download and decompress genome files
     local_chr_files = {}
     for key in chr_files.keys():
         #release version
         self.ver = re.sub(r"_chr.*", '', chr_files[key])
         url = self.url['dna_fa']+chr_files[key]
         gz_file = myIO.file_os(url).download(self.out_dir)
         #decompress file
         #ungz_file=myIO.file_os(gz_file).decompress_gz()
         local_chr_files[key] = gz_file
     #combine fa files
     out_file = ''.join([self.out_dir, self.ver,'_dna.fa'])
     #print out_file
     myGenome.genome(out_file).combine_fa(local_chr_files)
     return local_chr_files, out_file
Example #28
 def combine_countfiles(self, args_tuple):
     #row_names should be None or list type
     infile_tail, RC_level, out_file, row_names = args_tuple
     #
     counting_dict2 = {}
     for sample_name in self.par['sample_names']:
         #get read counts of a given sample
         counting_file = '{}{}/{}{}'.format(self.par['dir_result'], sample_name, sample_name, infile_tail)
         sample_dict2 = myIO.file_os(counting_file, '\t').to_dict2()
         for ref in sample_dict2.keys():
             #print ref
             counts = sample_dict2[ref][RC_level]
             if ref in counting_dict2:
                 counting_dict2[ref].update({sample_name:counts})
                 #print '=='+ref+'=='
             else:
                 counting_dict2[ref] = {sample_name:counts}
             #print sample_name, ref,counting_dict2[ref]
     #export counting_dict
     myDict.basic(counting_dict2).dict2_to_file(out_file=out_file, row_names=row_names)
Example #29
 def trim_fq(self, outdir, seq_start=0, seq_end=0):
     file_name = myIO.file_os(self.biofile).file_name()
     outfile = outdir + re.sub(r'\.gz$', '', file_name)
     print("Trim fastq files {}, and save new file {}\n".format(
         self.biofile, outfile))
     #get file handles of the two fastq files, and the output file
     F1 = self.readonly_handle(self.biofile)
     out_obj = open(outfile, 'wt')
     with F1:
         #read 4 lines at a time per file
         for L1, L2, L3, L4 in zip(*[F1] * 4):
             if seq_start > 0:
                 L2 = L2[seq_start:]
                 L4 = L4[seq_start:]
             #trim the longer reads from 3-end
             if seq_end != 0:
                 L2 = L2.rstrip()
                 L4 = L4.rstrip()
                 L2 = L2[:seq_end] + "\n"
                 L4 = L4[:seq_end] + "\n"
             #export to the output file
             out_obj.writelines([L1, L2, L3, L4])
     out_obj.close()
Example #30
        '/home/yuan/results_phip',
        '/home-4/[email protected]/work/yuan/results_phip'
    ]
    #get all variables.txt
    files_var = []
    for d in dirs:
        if os.path.isdir(d):
            files_formula = os.path.join(d,
                                         '*' + file_type + '*/variables.txt')
            #print files_formula
            files_var += glob.glob(files_formula)
            #sub=myIO.dir_os(d).recrusive_files('variables.txt')
            #for s in sub:
            #    if file_type in s: files_var.append(s)

    #get all command lines
    for index, file_var in enumerate(files_var):
        print(index + 1, file_var)
        #revise the parameters of variables.txt
        myIO.file_os(file_var, '=').line_replace(par)

    #parallel processing
    #threads number
    #pool=mpd.Pool(processes=8)
    #pass one argument at a time
    #pool.map(phip_thread, files_var)
    #pool.close()
    #pool.join()

    print('\n\n\n\nGreat! The batch running is done!\n\n\n')
#end