Example #1
def QC_statistics(self):
    print("###Quality control: statistics summary")
    #collect key=value pairs from each per-sample log file
    stat_dict = collections.defaultdict(dict)
    for sample_name in self.par['sample_names']:
        sample_log = '{}{}/{}.log'.format(self.par['dir_result'], sample_name, sample_name)
        stat_dict[sample_name] = myIO.file_os(sample_log, '=').to_dict()
    #convert to data frame, one row per sample
    stat_df = pd.DataFrame(stat_dict)
    stat_df = stat_df.transpose()

    #1: scatter plot of raw reads vs uniquely aligned reads
    sub_df = stat_df[['raw_reads_num', 'unique_aligned_reads_num']].astype(float)/1e6
    plot_par = {'df': sub_df, 'title': 'raw_reads_vs_aligned_reads',
                'picfile': self.par['dir_QC'] + 'raw_reads_vs_aligned_reads.png',
                'pch': 'o', 'text': 'million reads'}
    myPlot.plot(plot_par).dotP()
    #2: scatter plot of raw reads vs the percentage of aligned reads
    stat_df['unique_aligned_percentage'] = sub_df['unique_aligned_reads_num']*100/sub_df['raw_reads_num']
    plot_par['df'] = stat_df[['raw_reads_num', 'unique_aligned_percentage']].astype(float)
    plot_par['title'] = 'percentage_aligned_reads'
    plot_par['picfile'] = self.par['dir_QC'] + 'percentage_aligned_reads.png'
    myPlot.plot(plot_par).dotP()
    #3: export to csv file
    print('\tSave statistical summary into {}.'.format(self.par['file_stat']))
    stat_df.to_csv(self.par['file_stat'], index_label='sample_names')
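myPlot.plot(...).dotP() is a project-specific helper whose source is not shown here. As a rough orientation only, the first scatter plot above presumably reduces to something like this plain-matplotlib sketch (the sample values are invented):

# Hypothetical stand-in for the dotP() scatter plot above; data are made up.
import pandas as pd
import matplotlib.pyplot as plt

sub_df = pd.DataFrame(
    {'raw_reads_num': [12.4, 9.8, 15.1],
     'unique_aligned_reads_num': [10.1, 8.2, 13.0]},
    index=['s1', 's2', 's3'])  # per-sample read counts, in millions

fig, ax = plt.subplots()
ax.scatter(sub_df['raw_reads_num'], sub_df['unique_aligned_reads_num'], marker='o')
ax.set_title('raw_reads_vs_aligned_reads')
ax.set_xlabel('raw reads (million)')
ax.set_ylabel('unique aligned reads (million)')
fig.savefig('raw_reads_vs_aligned_reads.png')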
Example #2
def main():
    # prepareInput.createInput(logName)

    # scores=[]
    # #----------start Trace2Vec
    # Trace2Vec.learn(logName,vectorsize)
    # y=Trace2Vec.getY(logName)
    # vectors, corpus=Trace2Vec.startCluster(logName, vectorsize)
    # printMatrix(vectors, "Trace2Vec", "vectors")
    # for alg in clustering:
    #     assigned_clusters=cluster(alg, vectors, y)
    #     print(np.amax(assigned_clusters))
    #     printVector(assigned_clusters, "Trace2Vec", "clusters", alg)
    #     Trace2Vec.endCluster(logName, assigned_clusters, vectorsize, alg, corpus)

    # scores.append(get_scores("Trace2Vec"))
    # #----------end Trace2Vec

    # #----------start Node2Vec
    # args=Node2Vec.parse_args()
    # args.input="input/"+logName+".graph"
    # args.output="output/"+logName+"N2VVS"+str(vectorsize)+".node2vec"
    # nx_G = Node2Vec.read_graph(args)
    # G = node2vec.Graph(nx_G, True, args.p, args.q)
    # G.preprocess_transition_probs()
    # walks = G.simulate_walks(args.num_walks, args.walk_length)
    # Node2Vec.learn_embeddings(args, logName, vectorsize, walks)
    # Node2Vec.extract(logName, vectorsize)

    # y=Node2Vec.getY(logName)
    # vectors, corpus=Node2Vec.startCluster(logName, vectorsize)
    # printMatrix(vectors, "Node2Vec", "vectors")
    # for alg in clustering:
    #     assigned_clusters=cluster(alg, vectors, y)
    #     print(np.amax(assigned_clusters))
    #     printVector(assigned_clusters, "Node2Vec", "clusters", alg)
    #     Node2Vec.endCluster(logName, assigned_clusters, vectorsize, alg, corpus)

    # scores.append(get_scores("Node2Vec"))
    # #----------end Node2Vec

    # #----------start NGrams
    # vectors, y=NGrams.ngrams_BPI_2015(logName, vectorsize)
    # printMatrix(vectors, "NGrams", "vectors")
    # for alg in clustering:
    #     assigned_clusters=cluster(alg, vectors, y)
    #     print(np.amax(assigned_clusters))
    #     printVector(assigned_clusters, "NGrams", "clusters", alg)
    #     NGrams.endCluster(logName, assigned_clusters, vectorsize, alg, [0]*len(vectors))

    # scores.append(get_scores("NGrams"))
    # #----------end NGrams

    # for score in scores:
    #     print_scores(score)

    for emb in embed:
        myPlot.plot(emb)
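In this variant the whole embedding/clustering pipeline is commented out, so only the final plotting loop runs (Example #4 below shows the active version). When stages need toggling often, explicit flags are less error-prone than comment blocks; a minimal sketch, with hypothetical stage names and print statements standing in for the real Trace2Vec/Node2Vec/NGrams calls:

def run_pipeline(stages=('plot',)):
    # Each branch would hold the corresponding block from Example #4;
    # the prints are placeholders, not the project's actual calls.
    if 'trace2vec' in stages:
        print('run Trace2Vec: learn, cluster, score')
    if 'node2vec' in stages:
        print('run Node2Vec: simulate walks, embed, cluster, score')
    if 'ngrams' in stages:
        print('run NGrams: vectorize, cluster, score')
    if 'plot' in stages:
        print('plot each embedding via myPlot.plot(emb)')

run_pipeline()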
Example #3
def QC_hits(self, infile, threshold=None):
    print('###Relationship between significant hits and raw read number of ', infile)
    file_prefix = '{}{}_'.format(self.par['dir_QC'], myIO.file_os(infile).name_prefix())
    if threshold is None:
        threshold = float(self.par['zscore_threshold'])
    #read statistics file
    stat_df = pd.read_csv(self.par['file_stat'], sep=',', index_col=0, low_memory=False)
    stat_df.index = stat_df['sample_name']  #assign row names
    stat_df = stat_df.loc[self.par['sample_names']]  #order rows by sample_names
    raw_reads = stat_df['raw_reads_num']/1e6
    #read values file: row names and column names
    in_df = pd.read_csv(infile, sep='\t', index_col=0, low_memory=False)
    order_df = in_df[self.par['sample_names']].copy()  #order columns

    #plot of raw reads vs number of hits
    def func1(x, y=threshold):
        #count values at or above the threshold
        sig = x[x >= y]
        return len(sig)
    hits_num = order_df.apply(func1, axis=0)
    #get compared df
    comp_df = pd.DataFrame({'A': raw_reads, 'B': hits_num})
    comp_df.to_csv(file_prefix + 'raw_vs_sighits.csv', sep=',')
    #plot
    plot_par = {'df': comp_df, 'legend': None,
                'title': 'Effects of sequencing depth on significant hits',
                'picfile': file_prefix + 'raw_vs_sighits.png',
                'xlabel': 'Number of raw reads (million)',
                'ylabel': 'Number of significant hits'}
    myPlot.plot(plot_par).dotP()

    #plot of raw reads vs mean values of hits
    def func2(x, y=threshold):
        #mean of values at or above the threshold
        sig = x[x >= y]
        return np.mean(sig)
    hits_mean = order_df.apply(func2, axis=0)
    #get compared df
    comp_df = pd.DataFrame({'A': raw_reads, 'B': hits_mean})
    outfile = file_prefix + 'raw_vs_mean_significant_hits.csv'
    print('\texport QC to {}.'.format(outfile))
    comp_df.to_csv(outfile, sep=',')
    #plot
    plot_par = {'df': comp_df, 'legend': None,
                'title': 'Effects of sequencing depth on significant hits',
                'picfile': file_prefix + 'raw_vs_mean_significant_hits.png',
                'xlabel': 'Number of raw reads (million)',
                'ylabel': 'Mean values of significant hits'}
    myPlot.plot(plot_par).dotP()
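The two apply() helpers above count, and then average, the values at or above the threshold in each column. The same results can be had without apply(); a self-contained check with fabricated data:

import numpy as np
import pandas as pd

# Fabricated stand-in for order_df: 100 rows x 4 samples.
order_df = pd.DataFrame(np.random.default_rng(0).normal(3, 2, (100, 4)),
                        columns=['s1', 's2', 's3', 's4'])
threshold = 3.5
hits_num = (order_df >= threshold).sum(axis=0)             # same as func1
hits_mean = order_df[order_df >= threshold].mean(axis=0)   # same as func2
print(hits_num)
print(hits_mean)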
Example #4
def main():
    prepareInput.createInput(logName)

    scores=[]
    #----------start Trace2Vec
    Trace2Vec.learn(logName,vectorsize)
    y=Trace2Vec.getY(logName)
    vectors, corpus=Trace2Vec.startCluster(logName, vectorsize)
    printMatrix(vectors, "Trace2Vec", "vectors")
    for alg in clustering:
        assigned_clusters=cluster(alg, vectors, y)
        printVector(assigned_clusters, "Trace2Vec", "clusters", alg)
        Trace2Vec.endCluster(logName, assigned_clusters, vectorsize, alg, corpus)
    #----------end Trace2Vec

    #----------start Node2Vec
    args=Node2Vec.parse_args()
    args.input="input/"+logName+".graph"
    args.output="output/"+logName+"N2VVS"+str(vectorsize)+".node2vec"
    nx_G = Node2Vec.read_graph(args)
    G = node2vec.Graph(nx_G, True, args.p, args.q)
    G.preprocess_transition_probs()
    walks = G.simulate_walks(args.num_walks, args.walk_length)
    Node2Vec.learn_embeddings(args, logName, vectorsize, walks)
    Node2Vec.extract(logName, vectorsize)
    
    y=Node2Vec.getY(logName)
    vectors, corpus=Node2Vec.startCluster(logName, vectorsize)
    printMatrix(vectors, "Node2Vec", "vectors")
    for alg in clustering:
        assigned_clusters=cluster(alg, vectors, y)
        printVector(assigned_clusters, "Node2Vec", "clusters", alg)
        Node2Vec.endCluster(logName, assigned_clusters, vectorsize, alg, corpus)
    #----------end Node2Vec

    #----------start NGrams
    vectors, y=NGrams.ngrams_BPI_2015(logName, vectorsize)
    printMatrix(vectors, "NGrams", "vectors")
    for alg in clustering:
        assigned_clusters=cluster(alg, vectors, y)
        printVector(assigned_clusters, "NGrams", "clusters", alg)
        NGrams.endCluster(logName, assigned_clusters, vectorsize, alg, [0]*len(vectors))
    #----------end NGrams

    scores.append(get_scores("Trace2Vec"))
    scores.append(get_scores("Node2Vec"))
    scores.append(get_scores("NGrams"))

    for score in scores:
        print_scores(score)
    
    if vectorsize==2:
        for emb in embed:
            myPlot.plot(emb)
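cluster(alg, vectors, y) is a helper defined elsewhere in this project. A plausible minimal version, shown only as an assumption about its shape, maps an algorithm name to a scikit-learn clusterer and returns integer labels:

import numpy as np
from sklearn.cluster import AgglomerativeClustering, KMeans

def cluster(alg, vectors, y):
    # Guess at the helper's contract: one cluster per distinct label in y.
    n_clusters = len(set(y))
    if alg == 'kmeans':
        model = KMeans(n_clusters=n_clusters, n_init=10, random_state=0)
    elif alg == 'hierarchical':
        model = AgglomerativeClustering(n_clusters=n_clusters)
    else:
        raise ValueError('unknown clustering algorithm: {}'.format(alg))
    return model.fit_predict(np.asarray(vectors))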
Example #5
    def NC_whole_std(self):
        print('\tPolynomial regression of std~median across ALL BEADS-ONLY.')
        file_prefix = '{}{}_'.format(self.par['dir_result'], myIO.file_os(self.par['file_NC']).name_prefix())
        norm_ncfile = file_prefix + 'scalingRC.txt'
        if os.path.isfile(norm_ncfile):
            phip_nc = pd.read_csv(norm_ncfile, sep='\t', index_col=0, low_memory=False)
        else:
            phip_nc = normalization(self.par, self.par['file_NC'], norm_ncfile).RC_scaling()

        #summary of nc: per-row mean, median, std and sum
        NC = pd.DataFrame({'mean': phip_nc.mean(axis=1), 'median': phip_nc.median(axis=1),
                           'std': phip_nc.std(axis=1), 'sum': phip_nc.sum(axis=1)})
        #mask zeros so the log10 transform stays finite
        NC.loc[NC['median'] == 0, 'median'] = np.nan
        NC.loc[NC['std'] == 0, 'std'] = np.nan
        NC['logmedian'] = np.log10(NC['median'])
        NC['logstd'] = np.log10(NC['std'])
        #NC = NC.replace([np.inf, -np.inf], -10)  #an extremely small value

        #initiate reg_df for regression: filter out outliers,
        #then order rows for the polynomial fit
        reg_df = NC.loc[NC['median'] > 0, :].copy()
        reg_df = reg_df.sort_values(['logmedian'], ascending=True)

        #polynomial regression
        formula = 'logstd~logmedian+I(logmedian**2)+I(logmedian**3)'
        pn_model = smf.ols(formula, data=reg_df)
        pn_fit = pn_model.fit()
        reg_df['pred_logstd'] = pn_fit.predict()
        reg_df['pred_std'] = 10**pn_fit.predict()
        NC['pred_logstd'] = pn_fit.predict({'logmedian': NC['logmedian']})
        NC['pred_std'] = 10**NC['pred_logstd']

        #refresh total log
        #params = dict(pn_fit.params)
        #NC_dict = dict([('polynomial_NC_std:' + x, params[x]) for x in params.keys()])
        #myIO.file_os(self.par['file_total_log'], '=').line_replace(NC_dict)
        #export fitting of std
        NC.to_csv(file_prefix + 'polynomial_std.csv', header=True, index_label='row_names')
        #draw graph
        xm = round(np.nanmax(list(NC['logmedian'])))
        ym = round(np.nanmax(list(NC['logstd'])))
        plot_par = {'df': NC[['logmedian', 'logstd']], 'xlim': (-.5, xm), 'ylim': (-.5, ym),
                    'picfile': file_prefix + 'polynomial_std.png', 'text': pn_fit.params}
        try:
            myPlot.plot(plot_par).regressionP(reg_df['logmedian'], reg_df['pred_logstd'])
        except ValueError:
            print('Failed to draw the plot and save it to {}'.format(plot_par['picfile']))

        #return the summary table and the fitted model object
        return NC, pn_fit
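The cubic fit at the heart of NC_whole_std can be exercised on its own; a minimal sketch with synthetic data, using the same statsmodels formula:

import numpy as np
import pandas as pd
import statsmodels.formula.api as smf

rng = np.random.default_rng(0)
logmedian = np.sort(rng.uniform(0.1, 3.0, 200))     # fabricated predictor
logstd = 0.5 + 0.8*logmedian - 0.1*logmedian**2 + rng.normal(0, 0.05, 200)
reg_df = pd.DataFrame({'logmedian': logmedian, 'logstd': logstd})

pn_fit = smf.ols('logstd~logmedian+I(logmedian**2)+I(logmedian**3)',
                 data=reg_df).fit()
reg_df['pred_logstd'] = pn_fit.predict()
print(pn_fit.params)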
Example #6
def export_df(self, outfile, threshold=10, index_label='row_names'):
    print('\texport data frame to ', outfile)
    outsep = ',' if outfile.endswith('.csv') else '\t'
    self.df.to_csv(outfile, sep=outsep, index_label=index_label)

    #draw a horizontal bar chart of per-column counts of values >= threshold
    counts = self.df.apply(lambda x, y=threshold: len(x[x >= y]), axis=0)
    plot_par = {'list': counts, 'ylabel': 'Sample_names', 'xlabel': 'Number of hits',
                'picfile': myIO.file_os(outfile).file_prefix() + '.png',
                'title': 'Number of hits, threshold=' + str(threshold)}
    myPlot.plot(plot_par).simple_barh()
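simple_barh() is again a project helper; its output presumably resembles this plain-matplotlib horizontal bar chart (sample names and counts invented):

import pandas as pd
import matplotlib.pyplot as plt

counts = pd.Series({'sample_A': 120, 'sample_B': 85, 'sample_C': 240})
fig, ax = plt.subplots()
ax.barh(counts.index, counts.values)
ax.set_xlabel('Number of hits')
ax.set_ylabel('Sample_names')
ax.set_title('Number of hits, threshold=10')
fig.savefig('hits_barh.png')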
Example #7
def export_regress(self, file_prefix):
    #export the fitted data and draw the regression plot
    plot_par = {
        'df': self.data[[self.x_name, self.y_name]],
        'xlim': (-.5, np.nanmax(self.data[self.x_name])),
        'ylim': (-.5, np.nanmax(self.data[self.y_name])),
        'text': self.lm['params']
    }
    if self.outdir is not None:
        file_prefix = '{}{}_{}'.format(file_prefix, self.x_name,
                                       self.y_name)
        self.data.to_csv(file_prefix + '.csv',
                         header=True,
                         index_label='row_names')
        plot_par['picfile'] = file_prefix + '.png'
    #draw graph
    try:
        myPlot.plot(plot_par).regressionP(self.reg_df[self.x_name],
                                          self.reg_df[self.py_name])
    except ValueError:
        print('Failed to draw {}'.format(file_prefix))
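The method assumes self.reg_df already carries a column of fitted predictions (self.py_name). One way such a column might be produced, sketched with numpy.polyfit on fabricated data (column names are hypothetical):

import numpy as np
import pandas as pd

data = pd.DataFrame({'x': np.linspace(0.1, 3.0, 50)})
data['y'] = 1.0 + 0.5*data['x'] + np.random.default_rng(0).normal(0, 0.05, 50)
coefs = np.polyfit(data['x'], data['y'], deg=1)    # fit y ~ x
reg_df = data.sort_values('x').copy()
reg_df['pred_y'] = np.polyval(coefs, reg_df['x'])  # the predicted column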
Example #8
    def QC_saturation(self):
        print("###saturation analysis\n")
        combined_df = {}
        combined_dynamics = {}
        #plot saturation curve per sample
        for sample_name in self.par['sample_names']:
            file_head = '{}{}/'.format(self.par['dir_result'], sample_name)
            #read saturation file
            df = pd.read_table(file_head + 'QC_saturation.txt',
                               sep="\t",
                               index_col=False)

            #saturation curves
            saturation_df = df[['row_name', '1', '5', '10']]
            #thin the row index so long curves stay readable
            shrinked_index = myList.basic(list(
                saturation_df.index)).interval_list()
            sample_df = saturation_df.loc[shrinked_index].copy()  #select rows
            sample_df.iloc[:, 0] = sample_df.iloc[:, 0] / 1e6  #raw reads in millions
            #line plot of the saturation curves
            plot_par = {
                'df': sample_df,
                'legend': 'upper left',
                'title': 'Saturation analysis (Sequencing depth)',
                'picfile': file_head + 'QC_saturation_analysis.png',
                'xlabel': 'Number of raw reads (million)',
                'ylabel': 'Number of references'
            }
            myPlot.plot(plot_par).lineP()
            #combine data frame
            sample_df.index = range(sample_df.shape[0])
            for cutoff in ['1', '5', '10']:
                sub_df = sample_df[['row_name', cutoff]].copy()
                sub_df.columns = ['raw_reads:' + sample_name, sample_name]
                if cutoff in combined_df:
                    combined_df[cutoff] = pd.merge(combined_df[cutoff],
                                                   sub_df,
                                                   left_index=True,
                                                   right_index=True,
                                                   how='outer')
                else:
                    combined_df[cutoff] = sub_df.copy()

            #dynamics analysis
            dynamics_df = df[['row_name', 'max']]  #select columns
            #thin the row index
            shrinked_index = myList.basic(list(
                dynamics_df.index)).interval_list()
            sample_df = dynamics_df.loc[shrinked_index].copy()  #select rows
            sample_df.iloc[:, 0] = sample_df.iloc[:, 0] / 1e6  #raw reads in millions
            sample_df.reset_index(drop=True, inplace=True)
            #combined
            combined_dynamics[sample_name] = sample_df
            #plot
            plot_par = {
                'df': sample_df,
                'legend': 'upper left',
                'title': 'Saturation analysis: dynamics of read counts',
                'picfile': file_head + 'QC_read_counts_dynamics.png',
                'xlabel': 'Number of raw reads (million)',
                'ylabel': 'Maximum read counts'
            }
            myPlot.plot(plot_par).lineP()
        #export saturation curves
        for cutoff in ['1', '5', '10']:
            plot_par = {
                'df': combined_df[cutoff],
                'legend': None,
                'title': 'samples={}, RC-cutoff={}'.format(
                    len(self.par['sample_names']), cutoff),
                'picfile': '{}saturation_cutoff_{}.png'.format(
                    self.par['dir_QC'], cutoff),
                'xlabel': 'Number of raw reads (million)',
                'ylabel': 'Number of references'
            }
            myPlot.plot(plot_par).lineP(x_value=1)

        #export dynamics curves
        combined_dynamics = pd.concat(combined_dynamics, axis=1)
        combined_dynamics.columns = [':'.join(x) for x in list(combined_dynamics)]
        plot_par = {
            'df': combined_dynamics,
            'legend': None,
            'title': 'Sequencing depth, samples={}'.format(
                len(self.par['sample_names'])),
            'picfile': '{}saturation_dynamics.png'.format(self.par['dir_QC']),
            'xlabel': 'Number of raw reads (million)',
            'ylabel': 'Maximum read counts'
        }
        myPlot.plot(plot_par).lineP(x_value=1)
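myList.basic(...).interval_list() is used above to thin the row index before plotting. Its implementation is not shown; a guess at the behavior, offered purely as an assumption, is an even subsampling of the index:

# Assumed behavior of interval_list(): keep roughly evenly spaced entries
# so long saturation curves plot a manageable number of points.
def interval_list(items, max_points=100):
    step = max(1, len(items) // max_points)
    return items[::step]

print(interval_list(list(range(1000)), max_points=10))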