def read(self, src=None, verbose=True, cid=None, rid=None, col_inds=None,
         row_inds=None, matrix_only=False, frame=True,
         convert_to_double=False):
    '''
    Read data from src into the metadata tables and the data matrix.

    src: path to a .gct or .gctx file; self.src is used when src is falsy.
    verbose: forwarded to the .gct reader and to the full .gctx reader.
    cid, rid: column / row ids restricting a .gctx read. Either may instead
        be the path of an existing .grp file, in which case the ids are
        loaded from it with grp.read_grp. Ignored for .gct sources.
    col_inds, row_inds: forwarded unchanged to the .gctx readers.
    matrix_only: for .gctx sources, call read_gctx_matrix instead of
        _read_gctx.
    frame: forwarded to _read_gct / _read_gctx.
    convert_to_double: forwarded to read_gctx_matrix only.

    A GCTException raised while reading (including the one raised here for
    an unsupported file extension) is caught and its message is printed;
    it is not re-raised, so the method returns None either way.
    '''
    def _resolve_ids(ids):
        # A string that names an existing .grp file is replaced by the ids
        # stored in that file; anything else is passed through untouched.
        if (isinstance(ids, str) and re.match(r'.*\.grp$', ids)
                and os.path.exists(ids)):
            return grp.read_grp(ids)
        return ids

    # determine file type
    if not src:
        src = self.src
    extension = os.path.splitext(src)[1]

    try:
        if extension == '.gct':
            self._read_gct(src, verbose, frame=frame)
        elif extension == '.gctx':
            cid = _resolve_ids(cid)
            rid = _resolve_ids(rid)
            if matrix_only:
                # NOTE(review): src and verbose are not forwarded on this
                # path; presumably read_gctx_matrix reads self.src - confirm.
                self.read_gctx_matrix(cid=cid, rid=rid, col_inds=col_inds,
                                      row_inds=row_inds,
                                      convert_to_double=convert_to_double)
            else:
                self._read_gctx(src, verbose=verbose, cid=cid, rid=rid,
                                col_inds=col_inds, row_inds=row_inds,
                                frame=frame)
        else:
            raise GCTException("source file must be .gct or .gctx")
    except GCTException as instance:
        # Best effort by design: report the problem instead of propagating.
        # Fall back to the exception itself if it carries no .message.
        print(getattr(instance, 'message', instance))
# read the full data file GCTObject = gct.GCT(path_to_gctx_file) GCTObject.read() print(GCTObject.matrix) # read the first 100 rows and 10 columns of the data GCTObject = gct.GCT(path_to_gctx_file) GCTObject.read(row_inds=range(100), col_inds=range(10)) print(GCTObject.matrix) # read the first 10 columns of the data, identified by their # column ids, stored in a grp file given below path_to_column_ids = "/xchip/cogs/l1ktools/data/cids_n10.grp" # read the column ids as a list column_ids = grp.read_grp(path_to_column_ids) GCTObject = gct.GCT(path_to_gctx_file) # extract only the specified columns from the matrix GCTObject.read(cid=column_ids) print(GCTObject.matrix) # get the available meta data headers for data columns and row column_headers = GCTObject.get_chd() row_headers = GCTObject.get_rhd() # get the perturbagen description meta data field from the column data inames = GCTObject.get_column_meta("pert_iname") # get the gene symbol meta data field from the row data symbols = GCTObject.get_row_meta("pr_gene_symbol")
# read the full data file GCTObject = gct.GCT(path_to_gctx_file) GCTObject.read() print(GCTObject.matrix) # read the first 100 rows and 10 columns of the data GCTObject = gct.GCT(path_to_gctx_file) GCTObject.read(row_inds=range(100), col_inds=range(10)) print(GCTObject.matrix) # read the first 10 columns of the data, identified by their # column ids, stored in a grp file given below path_to_column_ids = '/cmap/tools/l1ktools/data/cids_n10.grp' # read the column ids as a list column_ids = grp.read_grp(path_to_column_ids) GCTObject = gct.GCT(path_to_gctx_file) # extract only the specified columns from the matrix GCTObject.read(cid=column_ids) print(GCTObject.matrix) # get the available meta data headers for data columns and row column_headers = GCTObject.get_chd() row_headers = GCTObject.get_rhd() # get the perturbagen description meta data field from the column data inames = GCTObject.get_column_meta('pert_iname') # get the gene symbol meta data field from the row data symbols = GCTObject.get_row_meta('pr_gene_symbol')
def _build_option_parser():
    """Return the OptionParser holding every command-line option of main()."""
    opt_parser = OptionParser()

    # Required options must default to None so check_required() can tell
    # that they were not supplied.
    opt_parser.add_option("--sig_info",
                          dest="sig_info",
                          type="string",
                          help=("sig info file with gene information and "
                                "distil information"),
                          default=None)
    opt_parser.add_option("--gctx",
                          dest="gctx",
                          type="string",
                          help=("GCTX file of pairwise similarity. Designed "
                                "to use connectivity score, but any "
                                "measurement of similarity can be used "
                                "(e.g., pearson correlation)"),
                          default=None)
    opt_parser.add_option("--allele_col",
                          dest="allele_col",
                          type="string",
                          help=("Column name that indicates the allele "
                                "names. DEF=%s") % DEF_ALLELE_COL,
                          default=DEF_ALLELE_COL)
    opt_parser.add_option("-o",
                          dest="output_file_prefix",
                          type="string",
                          help=("Prefix of output files of mutation impact "
                                "data. Includes figures of p-value "
                                "distribution."),
                          default=None)
    opt_parser.add_option("-r",
                          dest="reference_test_file",
                          type="string",
                          help=("File explicitly indicating which "
                                "comparisons to do. Assumes the file has a "
                                "header and it is ignored. The first column "
                                "is the reference allele and second column "
                                "is test allele. If this file is not given, "
                                "then the reference alleles are assumed to "
                                "be WT and inferred from the allele names."),
                          default=None)
    opt_parser.add_option("-c",
                          dest="controls_file",
                          type="string",
                          help=(".grp file containing allele names of "
                                "control perturbations. If this file is "
                                "given, a null will be calculated from "
                                "these"),
                          default=None)
    opt_parser.add_option("-i",
                          dest="num_iterations",
                          type="int",
                          help=("Number of iterations to run. DEF=%d"
                                % NUM_ITERATIONS),
                          default=NUM_ITERATIONS)
    opt_parser.add_option("--conn_null",
                          dest="conn_null_input",
                          type="string",
                          help=("Optional file containing connectvity null "
                                "values from a previous run. Should end in "
                                "_conn_null.txt"),
                          default=None)
    opt_parser.add_option("--ie_col",
                          dest="ie_col",
                          type="string",
                          help=("Name of the column with infection "
                                "efficiency information. DEF=%s")
                          % DEF_IE_COL,
                          default=DEF_IE_COL)
    opt_parser.add_option("--ie_filter",
                          dest="ie_filter",
                          type="float",
                          help=("Threshold for infection efficiency. Any "
                                "wildtype or mutant alleles having an ie "
                                "below this threshold, will be removed"),
                          default=None)
    opt_parser.add_option("--num_reps",
                          dest="num_reps",
                          type="int",
                          help=("Number of replicates expected for each "
                                "allele. DEF=%d") % NUM_REPS,
                          default=NUM_REPS)
    opt_parser.add_option("--cell_id",
                          dest="cell_id",
                          type="string",
                          help=("Optional: Will only look at signatures "
                                "from this cell line. Helps to filter "
                                "sig_info file."),
                          default=None)
    opt_parser.add_option("--plate_id",
                          dest="plate_id",
                          type="string",
                          help=("Optional: Will only look at signatures "
                                "from this plate."),
                          default=None)
    return opt_parser


def _write_distribution(path, values):
    """Write each value of *values* to *path*, one "%f" formatted per line."""
    with open(path, "w") as handle:
        for value in values:
            handle.write("%f\n" % value)


def main():
    """Command-line entry point: compare each allele with its reference.

    Steps, in order:
      1. Parse the options; --sig_info, --gctx, -c and -o are required.
      2. Load the control alleles (-c), the optional reference/test pairs
         (-r), the GCTX similarity matrix and, if given, a precomputed
         connectivity null (--conn_null).
      3. Parse the sig_info file and build the replicate and connectivity
         null distributions from the controls present in it. A supplied
         connectivity null replaces the computed one.
      4. For every allele that is not its own reference, compute the
         replicate, connectivity and three-way comparison p-values, then
         adjust each p-value series with R's p.adjust(method="BH").

    Files written (``<o>`` is the -o prefix):
      ``<o>.txt``            one tab-delimited row per compared allele
      ``<o>_rep_null.txt``   the replicate null distribution
      ``<o>_conn_null.txt``  the connectivity null, only when --conn_null
                             was not supplied

    Values that were computed but never written anywhere (null percentiles,
    the mutant-vs-replicate-null p-value, adjusted mut/wt replicate
    p-values) are no longer computed; the files produced are unchanged.

    Exits the interpreter with status 0 on success (sys.exit).
    """
    opt_parser = _build_option_parser()
    (options, args) = opt_parser.parse_args()

    # validate the command line arguments
    opt_parser.check_required("--sig_info")
    opt_parser.check_required("--gctx")
    opt_parser.check_required("-c")
    opt_parser.check_required("-o")

    prefix = options.output_file_prefix
    num_reps = options.num_reps
    conn_null_input = options.conn_null_input

    # Both handles are opened up front (as before) so a bad path fails
    # before the expensive GCTX read, and are now guaranteed to be closed.
    with open(options.sig_info) as sig_info_file, \
            open(prefix + ".txt", "w") as out_file:
        controls = grp.read_grp(options.controls_file)

        ref2test_allele = None
        if options.reference_test_file:
            ref2test_allele = parseRefTestFile(options.reference_test_file)

        this_gctx = gct.GCT(options.gctx)
        this_gctx.read()

        conn_nulls_from_input = None
        if conn_null_input:
            # A real list, not map(): map() is a one-shot iterator on
            # Python 3 and the null is consulted once per allele.
            conn_nulls_from_input = [
                float(value) for value in grp.read_grp(conn_null_input)]

        (allele2distil_id,
         allele2WT,
         allele2gene,
         allele2cell_id,
         WT_alleles) = parse_sig_info(sig_info_file,
                                      ref2test_allele,
                                      options.allele_col,
                                      options.ie_col,
                                      options.ie_filter,
                                      options.cell_id,
                                      options.plate_id)

        # Only controls that survived sig_info parsing can be used.
        clean_controls = [control for control in controls
                          if control in allele2distil_id]

        # Always computed: the replicate null is needed even when the
        # connectivity null comes from a previous run.
        replicate_null_dist, connectivity_null_dist = getNullDist(
            this_gctx, allele2distil_id, clean_controls,
            options.num_iterations, num_reps)

        if conn_null_input:
            connectivity_null_dist = conn_nulls_from_input

        _write_distribution(prefix + "_rep_null.txt", replicate_null_dist)
        if not conn_null_input:
            _write_distribution(prefix + "_conn_null.txt",
                                connectivity_null_dist)

        # WT_dict maps each WT allele to a dict; the keys read below are
        # "wt_rep" and "wt_rep_dist". The other two return values are not
        # needed for the current output columns.
        WT_dict, _wt_rep_pvals, _wt_ordered = buildWT_dict(
            this_gctx, allele2distil_id, WT_alleles,
            replicate_null_dist, num_reps)

        # Print header to output file
        out_file.write("\t".join(["gene", "mut", "mut_rep", "wt_rep",
                                  "mut_wt_connectivity", "wt", "cell_line",
                                  "mut_wt_rep_pval",
                                  "mut_wt_conn_null_pval",
                                  "wt_mut_rep_vs_wt_mut_conn_pval",
                                  "mut_wt_rep_c_pval",
                                  "mut_wt_conn_null_c_pval",
                                  "wt_mut_rep_vs_wt_mut_conn_c_pval"])
                       + "\n")

        mut_wt_rep_pvals = []
        mut_wt_conn_pvals = []
        mut_wt_rep_vs_wt_mut_conn_pvals = []
        outlines = []

        # Build comparison
        for allele in allele2WT:
            wt_allele = allele2WT[allele]

            # Don't calculate for the WT allele
            if allele == wt_allele:
                continue

            wt_entry = WT_dict[wt_allele]

            mut_rankpt, mut_rankpt_dist = getSelfConnectivity(
                this_gctx, allele2distil_id[allele], num_reps)

            mut_wt_conn_rankpt, mut_wt_conn_dist = getConnectivity(
                this_gctx, allele2distil_id[allele],
                allele2distil_id[wt_allele], num_reps)

            conn_pval = getPairwiseComparisons(mut_wt_conn_dist,
                                               connectivity_null_dist)
            mut_wt_conn_pvals.append(conn_pval)

            mut_wt_rep_pval = getPairwiseComparisons(
                mut_rankpt_dist, wt_entry["wt_rep_dist"])
            mut_wt_rep_pvals.append(mut_wt_rep_pval)

            # Per the original author's note: through visualization, it
            # makes more sense to do a Kruskal-Wallis test to determine if
            # the three categories are significantly different.
            wt_mut_rep_vs_wt_mut_conn_pval = getKruskal(
                wt_entry["wt_rep_dist"], mut_rankpt_dist, mut_wt_conn_dist)
            mut_wt_rep_vs_wt_mut_conn_pvals.append(
                wt_mut_rep_vs_wt_mut_conn_pval)

            outlines.append("\t".join([
                allele2gene[allele],
                allele,
                "%f" % mut_rankpt,
                "%f" % wt_entry["wt_rep"],
                "%f" % mut_wt_conn_rankpt,
                wt_allele,
                allele2cell_id[allele],
                "%f" % mut_wt_rep_pval,
                "%f" % conn_pval,
                "%f" % wt_mut_rep_vs_wt_mut_conn_pval]))

        # Calculate corrected pvalues
        p_adjust = robjects.r['p.adjust']
        mut_wt_rep_c_pvals = p_adjust(
            robjects.FloatVector(mut_wt_rep_pvals), "BH")
        mut_wt_conn_c_pvals = p_adjust(
            robjects.FloatVector(mut_wt_conn_pvals), "BH")
        mut_wt_rep_vs_wt_mut_conn_c_pvals = p_adjust(
            robjects.FloatVector(mut_wt_rep_vs_wt_mut_conn_pvals), "BH")

        # Write to file: each row gets its three corrected p-values appended.
        for idx, outline in enumerate(outlines):
            out_file.write("%s\t%f\t%f\t%f\n" % (
                outline,
                mut_wt_rep_c_pvals[idx],
                mut_wt_conn_c_pvals[idx],
                mut_wt_rep_vs_wt_mut_conn_c_pvals[idx]))

    sys.exit(0)
def run_main(sig_info=None, gctx=None, allele_col=None, o=None, r=None,
             c=None, i=None, conn_null=None, ie_col=None, ie_filter=None,
             num_reps=None, cell_id=None, plate_id=None):
    """Programmatic counterpart of main(): compare alleles to references.

    Parameters
      sig_info:   path of the sig_info file.
      gctx:       path of the GCTX similarity matrix.
      allele_col: allele column name; passed to parse_sig_info unchanged
                  (no default is substituted here).
      o:          output prefix; ``<o>.txt`` is always written.
      r:          optional reference/test file for parseRefTestFile.
      c:          .grp file listing the control alleles.
      i:          number of iterations; 1000 when None.
      conn_null:  optional file with a precomputed connectivity null. When
                  absent, the computed null is written to
                  ``<o>_conn_null.txt``.
      ie_col:     infection-efficiency column; "x_ie_a549" when None.
      ie_filter:  infection-efficiency threshold; 0.0 when None.
      num_reps:   replicates per allele; 3 when None.
      cell_id, plate_id: optional filters passed to parse_sig_info.

    Prints the allele-to-reference mapping and then every allele as it is
    visited. Returns None.

    The mutant-vs-replicate-null p-value, which was computed but never
    written, is no longer computed; the files produced are unchanged.
    """
    # default values
    num_iterations = int(i) if i is not None else 1000
    # The original default was the bare name x_ie_a549 (a NameError); the
    # column name string is what was meant.
    ie_col = str(ie_col) if ie_col is not None else "x_ie_a549"
    ie_filter = float(ie_filter) if ie_filter is not None else 0.0
    num_reps = int(num_reps) if num_reps is not None else 3

    # Opened up front (as before) and now guaranteed to be closed, so the
    # result file is complete on disk when this function returns.
    with open(sig_info) as sig_info_file, open(o + ".txt", "w") as out_file:
        controls = grp.read_grp(c)

        ref2test_allele = None
        if r:
            ref2test_allele = parseRefTestFile(r)

        this_gctx = gct.GCT(gctx)
        this_gctx.read()

        conn_nulls_from_input = None
        if conn_null:
            # A real list, not map(): map() is a one-shot iterator on
            # Python 3 and the null is consulted once per allele.
            conn_nulls_from_input = [
                float(value) for value in grp.read_grp(conn_null)]

        (allele2distil_id,
         allele2WT,
         allele2gene,
         allele2cell_id,
         WT_alleles) = parse_sig_info(sig_info_file,
                                      ref2test_allele,
                                      allele_col,
                                      ie_col,
                                      ie_filter,
                                      cell_id,
                                      plate_id)

        # Only controls that survived sig_info parsing can be used.
        clean_controls = [control for control in controls
                          if control in allele2distil_id]

        # Always computed: the replicate null is needed even when the
        # connectivity null is supplied.
        replicate_null_dist, connectivity_null_dist = getNullDist(
            this_gctx, allele2distil_id, clean_controls,
            num_iterations, num_reps)

        # overwrites conn_null_dist if it is an input
        if conn_null:
            connectivity_null_dist = conn_nulls_from_input
        else:
            with open(o + "_conn_null.txt", "w") as conn_null_dist_out:
                for value in connectivity_null_dist:
                    conn_null_dist_out.write("%f\n" % value)

        # WT_dict maps each WT allele to a dict; the keys read below are
        # "wt_rep" and "wt_rep_dist". The other two return values are not
        # needed for the current output columns.
        WT_dict, _wt_rep_pvals, _wt_ordered = buildWT_dict(
            this_gctx, allele2distil_id, WT_alleles,
            replicate_null_dist, num_reps)

        # Print header to output file
        out_file.write("\t".join(["gene", "mut", "mut_rep", "wt_rep",
                                  "mut_wt_connectivity", "wt", "cell_line",
                                  "mut_wt_rep_pval",
                                  "mut_wt_conn_null_pval",
                                  "wt_mut_rep_vs_wt_mut_conn_pval",
                                  "mut_wt_rep_c_pval",
                                  "mut_wt_conn_null_c_pval",
                                  "wt_mut_rep_vs_wt_mut_conn_c_pval"])
                       + "\n")

        mut_wt_rep_pvals = []
        mut_wt_conn_pvals = []
        mut_wt_rep_vs_wt_mut_conn_pvals = []
        outlines = []

        # Single-argument print(...) emits the same text on Python 2 and 3.
        print(allele2WT)

        # Build comparison
        for allele in allele2WT:
            print(allele)
            wt_allele = allele2WT[allele]

            # Don't calculate for the WT allele
            if allele == wt_allele:
                continue

            wt_entry = WT_dict[wt_allele]

            mut_rankpt, mut_rankpt_dist = getSelfConnectivity(
                this_gctx, allele2distil_id[allele], num_reps)

            mut_wt_conn_rankpt, mut_wt_conn_dist = getConnectivity(
                this_gctx, allele2distil_id[allele],
                allele2distil_id[wt_allele], num_reps)

            conn_pval = getPairwiseComparisons(mut_wt_conn_dist,
                                               connectivity_null_dist)
            mut_wt_conn_pvals.append(conn_pval)

            mut_wt_rep_pval = getPairwiseComparisons(
                mut_rankpt_dist, wt_entry["wt_rep_dist"])
            mut_wt_rep_pvals.append(mut_wt_rep_pval)

            wt_mut_rep_vs_wt_mut_conn_pval = getKruskal(
                wt_entry["wt_rep_dist"], mut_rankpt_dist, mut_wt_conn_dist)
            mut_wt_rep_vs_wt_mut_conn_pvals.append(
                wt_mut_rep_vs_wt_mut_conn_pval)

            outlines.append("\t".join([
                allele2gene[allele],
                allele,
                "%f" % mut_rankpt,
                "%f" % wt_entry["wt_rep"],
                "%f" % mut_wt_conn_rankpt,
                wt_allele,
                allele2cell_id[allele],
                "%f" % mut_wt_rep_pval,
                "%f" % conn_pval,
                "%f" % wt_mut_rep_vs_wt_mut_conn_pval]))

        # Calculate corrected pvalues
        p_adjust = robjects.r['p.adjust']
        mut_wt_rep_c_pvals = p_adjust(
            robjects.FloatVector(mut_wt_rep_pvals), "BH")
        mut_wt_conn_c_pvals = p_adjust(
            robjects.FloatVector(mut_wt_conn_pvals), "BH")
        mut_wt_rep_vs_wt_mut_conn_c_pvals = p_adjust(
            robjects.FloatVector(mut_wt_rep_vs_wt_mut_conn_pvals), "BH")

        # Write to file: each row gets its three corrected p-values appended.
        for idx, outline in enumerate(outlines):
            out_file.write("%s\t%f\t%f\t%f\n" % (
                outline,
                mut_wt_rep_c_pvals[idx],
                mut_wt_conn_c_pvals[idx],
                mut_wt_rep_vs_wt_mut_conn_c_pvals[idx]))