def setUp(self,):
    """Create the four elution fixtures used by the tests.

    Sets ``self.elution``, ``self.elution2``, ``self.elution3`` and
    ``self.elution4``.  Each is an ``eff.Elut`` whose ``df`` attribute is a
    float DataFrame indexed by ``ids`` with columns ``frac1`` .. ``frac4``.
    """

    def make_elution(ids, fractions):
        # Build the frame from the numeric columns only and attach the ids
        # as a named index.  Passing the string ids through the constructor
        # together with dtype=float (and calling set_index afterwards) relies
        # on pandas silently skipping the failed cast, which newer pandas
        # releases are expected to reject.
        columns = ['frac%d' % (i + 1) for i in range(len(fractions))]
        df = pd.DataFrame(dict(zip(columns, fractions)),
                          index=pd.Index(ids, name='ids'),
                          columns=columns,
                          dtype=float)
        elution = eff.Elut()
        elution.df = df
        return elution

    self.elution = make_elution(
        ['a', 'b', 'c', 'd', 'e'],
        [
            [1.0, 2.0, 0.0, 0.0, 0.0],
            [0.0, 0.0, 3.0, 4.0, 0.0],
            [2.0, 1.0, 0.0, 0.0, 7.0],
            [0.0, 0.0, 6.0, 8.0, 10.0],
        ])

    self.elution2 = make_elution(
        ['a', 'b', 'c', 'd'],
        [
            [1.0, 1.0, 0.0, 2.0],
            [0.0, 0.0, 1.0, 4.0],
            [2.0, 1.0, 0.0, 0.0],
            [0.0, 0.0, 6.0, 10.0],
        ])

    self.elution3 = make_elution(
        ['a', 'b', 'c', 'd'],
        [
            [1.0, 1.0, 0.0, 1.0],
            [0.0, 0.0, 0.0, 0.0],
            [0.0, 0.0, 1.0, 0.0],
            [0.0, 0.0, 0.0, 0.0],
        ])

    # Same shape as elution2 but with disjoint ids and larger counts.
    self.elution4 = make_elution(
        ['e', 'f', 'g', 'h'],
        [
            [10.0, 10.0, 0.0, 20.0],
            [0.0, 0.0, 10.0, 40.0],
            [20.0, 10.0, 0.0, 0.0],
            [0.0, 0.0, 60.0, 10.0],
        ])
def _build_arg_parser():
    """Return the argparse parser describing this script's command line."""
    parser = argparse.ArgumentParser(
        description=
        "Calculate difference features between two fractionation experiments")
    parser.add_argument("--elution_files",
                        action="store",
                        nargs='+',
                        dest="elution_files",
                        required=True,
                        help="Elution files (.elut)")
    parser.add_argument(
        "--features",
        action="store",
        nargs='+',
        dest="features",
        required=False,
        default=['diffrac'],
        help=
        "Features to calculate: diffrac (L1-norm of difference) diffrac_percent diffrac_normalized pearsonr poisson mean_abundance emd zscore sliding_zscore fdr_correct sliding_fdr_correct"
    )
    parser.add_argument(
        "--annotated_list",
        action="store",
        dest="annotated_list",
        required=False,
        default=None,
        help=
        "Filename of annotated ids, used for calculating zscores from compliment of list, default=None"
    )
    parser.add_argument("--contaminate_tag",
                        action="store",
                        dest="contaminate_tag",
                        required=False,
                        default='CONTAMINANT',
                        help="Filters entries with tag, default=CONTAMINANT")
    parser.add_argument(
        "--use_gmm",
        action="store_true",
        dest="use_gmm",
        required=False,
        default=False,
        help=
        "Fit sliding window distributions to Gaussian Mixture Model and use largest gaussian for calculating zscore, default=False"
    )
    parser.add_argument(
        "--log_transform",
        action="store_true",
        dest="log_transform",
        required=False,
        default=False,
        help=
        "Use the log transform of the diffrac score to calculate sliding zscore, default=False"
    )
    parser.add_argument(
        "--window_size",
        action="store",
        type=int,
        dest="window_size",
        required=False,
        default=100,
        help="Window size to use for calculating sliding zscore, default=100")
    parser.add_argument(
        "--output_file",
        action="store",
        dest="out_filename",
        required=False,
        default=None,
        help="Filename of output file, default=None which prints to stdout")
    return parser


def main():
    """Command-line entry point: compute difference features for two elutions.

    Loads every file given via ``--elution_files`` (thresholded at 1),
    computes the requested features between the first two elutions, optionally
    labels rows found in ``--annotated_list``, drops rows whose index contains
    ``--contaminate_tag``, then adds the z-score / FDR features that are
    derived from the feature table itself.  The table is written to
    ``--output_file`` as CSV, or printed to stdout when no file is given.
    """
    import sys  # local import: only needed for the stderr warning below

    parser = _build_arg_parser()
    args = parser.parse_args()

    # Every feature compares elutions[0] with elutions[1]; with fewer files
    # nothing can be computed, so report it as a usage error up front.
    if len(args.elution_files) < 2:
        parser.error("--elution_files requires at least two elution files")

    requested = set(args.features)

    elutions = []
    for efile in args.elution_files:
        elut = eff.Elut()
        elut.load(efile, format='tsv')
        elut.threshold(thresh=1)
        elutions.append(elut)
    first, second = elutions[0], elutions[1]

    # Ordered (feature name, calculator) table; order fixes the column order.
    # A calculator of None marks a feature that is accepted on the command
    # line but has no implementation yet.
    pairwise_features = [
        ('diffrac',
         lambda a, b: calc_diffrac(a, b, normalize_totalCounts=False)),
        ('diffrac_percent',
         lambda a, b: calc_diffrac(a, b, percent_totalCounts=True)),
        ('diffrac_normalized',
         lambda a, b: calc_diffrac(a, b, normalize_totalCounts=True)),
        ('emd', lambda a, b: calc_emd(a, b)),
        ('pearsonr',
         lambda a, b: calc_correlation(
             a, b, correlation_func=lambda x, y: stats.pearsonr(x, y)[0])),
        ('poisson', None),
        ('mean_abundance', lambda a, b: calc_mean_abundance(a, b)),
    ]

    feature_df = pd.DataFrame()
    for name, calculate in pairwise_features:
        if name not in requested:
            continue
        if calculate is None:
            print("WARNING: poisson not implemented")
            continue
        feature_series = calculate(first, second)
        feature_series.name = name
        feature_df = join_feature(feature_df, feature_series)

    if args.annotated_list is not None:
        #kdrew: add in training labels
        annotated_df = pd.read_table(args.annotated_list,
                                     header=None,
                                     names=['annotated'])
        # Build the lookup once instead of scanning the array for every row.
        annotated_ids = set(annotated_df['annotated'].values)
        feature_df['annotated'] = [
            i in annotated_ids for i in feature_df.index
        ]

    print(len(feature_df))
    try:
        # Honour --contaminate_tag (it was previously parsed but ignored) and
        # match it literally rather than as a regular expression.
        is_contaminant = feature_df.index.str.contains(args.contaminate_tag,
                                                       regex=False)
    except AttributeError:
        # Raised when the index does not support the .str accessor.
        print("No contaminants")
    else:
        feature_df = feature_df[~is_contaminant]
    print(len(feature_df))

    if 'zscore' in requested:
        if 'diffrac_normalized' not in requested:
            #kdrew: calculating diffrac_normalized
            # NOTE(review): the guard and the original comment both refer to
            # diffrac_normalized, but the code used to compute the
            # un-normalized 'diffrac' here.  Presumably calc_zscore reads the
            # 'diffrac_normalized' column -- confirm against calc_zscore.
            feature_series = calc_diffrac(first,
                                          second,
                                          normalize_totalCounts=True)
            feature_series.name = 'diffrac_normalized'
            feature_df = join_feature(feature_df, feature_series)

        feature_series = calc_zscore(feature_df)
        feature_series.name = 'zscore'
        feature_df = join_feature(feature_df, feature_series)

    if 'sliding_zscore' in requested:
        feature_series = calc_sliding_zscore(feature_df,
                                             window=args.window_size,
                                             use_gmm=args.use_gmm,
                                             log_transform=args.log_transform)
        feature_series.name = 'sliding_zscore'
        feature_df = join_feature(feature_df, feature_series)

    if 'fdr_correct' in requested:
        fdr_df = calc_fdr_correct(feature_df)
        feature_df = join_feature(feature_df, fdr_df)

    if 'sliding_fdr_correct' in requested:
        sliding_fdr_df = calc_sliding_fdr_correct(feature_df)
        feature_df = join_feature(feature_df, sliding_fdr_df)

    # Sort by the first requested feature when it produced a column of that
    # name; some features (e.g. 'poisson') do not, and sorting by a missing
    # column would raise KeyError and lose the computed results.
    sort_column = args.features[0]
    if sort_column in feature_df.columns:
        feature_df = feature_df.sort_values(sort_column, ascending=False)
    else:
        sys.stderr.write(
            "WARNING: no '%s' column to sort by, output left unsorted\n" %
            sort_column)

    if args.out_filename is not None:
        feature_df.to_csv(args.out_filename)
    else:
        print(feature_df)