Пример #1
0
 def read(self,src=None,verbose=True,cid=None,rid=None,
         col_inds=None, row_inds=None, matrix_only=False,
         frame=True, convert_to_double=False):
     '''
     Read a .gct or .gctx file into this object.

     src: path of the file to read; when empty, self.src is used.
     verbose: forwarded to the .gct / .gctx readers.
     cid, rid: column / row ids to load from a .gctx file. Each may be
         given directly, or as the path of an existing .grp file, in
         which case the ids are read from that file with grp.read_grp.
     col_inds, row_inds: forwarded to the .gctx readers.
     matrix_only: when true, a .gctx file is loaded through
         read_gctx_matrix instead of _read_gctx.
     frame: forwarded to _read_gct / _read_gctx.
     convert_to_double: forwarded to read_gctx_matrix.

     A GCTException raised while reading (including the one raised here
     for an unsupported extension) is not propagated; its message is
     printed instead.
     '''
     # fall back to the path stored on the instance
     if not src:
         src = self.src

     # the file type is decided purely by the extension
     extension = os.path.splitext(src)[1]
     try:
         if extension == '.gct':
             self._read_gct(src,verbose,frame=frame)
         elif extension == '.gctx':
             # an id argument naming an existing .grp file is replaced by
             # the list of ids stored in that file
             if (isinstance(cid, str) and re.match(r'.*\.grp$', cid)
                     and os.path.exists(cid)):
                 cid = grp.read_grp(cid)
             if (isinstance(rid, str) and re.match(r'.*\.grp$', rid)
                     and os.path.exists(rid)):
                 rid = grp.read_grp(rid)

             if matrix_only:
                 # NOTE(review): src is not forwarded on this path;
                 # presumably read_gctx_matrix uses self.src -- confirm
                 self.read_gctx_matrix(cid=cid,rid=rid,col_inds=col_inds,
                                         row_inds=row_inds,
                                         convert_to_double=convert_to_double)
             else:
                 self._read_gctx(src,verbose=verbose,cid=cid,rid=rid,col_inds=col_inds,
                             row_inds=row_inds, frame=frame)
         else:
             raise GCTException("source file must be .gct or .gctx")
     except GCTException as instance:
         # best effort: report the problem instead of propagating it
         print(instance.message)
Пример #2
0
 def read(self,src=None,verbose=True,cid=None,rid=None,
         col_inds=None, row_inds=None, matrix_only=False,
         frame=True, convert_to_double=False):
     '''
     Read a .gct or .gctx file into this object.

     src: path of the file to read; when empty, self.src is used.
     verbose: forwarded to the .gct / .gctx readers.
     cid, rid: column / row ids to load from a .gctx file. Each may be
         given directly, or as the path of an existing .grp file, in
         which case the ids are read from that file with grp.read_grp.
     col_inds, row_inds: forwarded to the .gctx readers.
     matrix_only: when true, a .gctx file is loaded through
         read_gctx_matrix instead of _read_gctx.
     frame: forwarded to _read_gct / _read_gctx.
     convert_to_double: forwarded to read_gctx_matrix.

     A GCTException raised while reading (including the one raised here
     for an unsupported extension) is not propagated; its message is
     printed instead.
     '''
     # fall back to the path stored on the instance
     if not src:
         src = self.src

     # the file type is decided purely by the extension
     extension = os.path.splitext(src)[1]
     try:
         if extension == '.gct':
             self._read_gct(src,verbose,frame=frame)
         elif extension == '.gctx':
             # an id argument naming an existing .grp file is replaced by
             # the list of ids stored in that file
             if (isinstance(cid, str) and re.match(r'.*\.grp$', cid)
                     and os.path.exists(cid)):
                 cid = grp.read_grp(cid)
             if (isinstance(rid, str) and re.match(r'.*\.grp$', rid)
                     and os.path.exists(rid)):
                 rid = grp.read_grp(rid)

             if matrix_only:
                 # NOTE(review): src is not forwarded on this path;
                 # presumably read_gctx_matrix uses self.src -- confirm
                 self.read_gctx_matrix(cid=cid,rid=rid,col_inds=col_inds,
                                         row_inds=row_inds,
                                         convert_to_double=convert_to_double)
             else:
                 self._read_gctx(src,verbose=verbose,cid=cid,rid=rid,col_inds=col_inds,
                             row_inds=row_inds, frame=frame)
         else:
             raise GCTException("source file must be .gct or .gctx")
     except GCTException as instance:
         # best effort: report the problem instead of propagating it
         print(instance.message)
Пример #3
0
# --- Whole file ---------------------------------------------------------
# Load every row and column of the GCTX file and show the matrix.
GCTObject = gct.GCT(path_to_gctx_file)
GCTObject.read()
print(GCTObject.matrix)

# --- Subset by position -------------------------------------------------
# Load only rows 0-99 and columns 0-9, selected by index.
GCTObject = gct.GCT(path_to_gctx_file)
GCTObject.read(
    row_inds=range(100),
    col_inds=range(10),
)
print(GCTObject.matrix)

# --- Subset by column id ------------------------------------------------
# The wanted column ids (10 of them, per the file name) live in a grp
# file; load that file into a list first.
path_to_column_ids = '/xchip/cogs/l1ktools/data/cids_n10.grp'
column_ids = grp.read_grp(path_to_column_ids)
# Then restrict the read to exactly those columns.
GCTObject = gct.GCT(path_to_gctx_file)
GCTObject.read(cid=column_ids)
print(GCTObject.matrix)

# --- Metadata -----------------------------------------------------------
# Names of the metadata fields available for columns and for rows.
column_headers = GCTObject.get_chd()
row_headers = GCTObject.get_rhd()

# Perturbagen description field, taken from the column metadata.
inames = GCTObject.get_column_meta('pert_iname')

# Gene symbol field, taken from the row metadata.
symbols = GCTObject.get_row_meta('pr_gene_symbol')
Пример #4
0
# --- Whole file ---------------------------------------------------------
# Load every row and column of the GCTX file and show the matrix.
GCTObject = gct.GCT(path_to_gctx_file)
GCTObject.read()
print(GCTObject.matrix)

# --- Subset by position -------------------------------------------------
# Load only rows 0-99 and columns 0-9, selected by index.
GCTObject = gct.GCT(path_to_gctx_file)
GCTObject.read(
    row_inds=range(100),
    col_inds=range(10),
)
print(GCTObject.matrix)

# --- Subset by column id ------------------------------------------------
# The wanted column ids (10 of them, per the file name) live in a grp
# file; load that file into a list first.
path_to_column_ids = "/cmap/tools/l1ktools/data/cids_n10.grp"
column_ids = grp.read_grp(path_to_column_ids)
# Then restrict the read to exactly those columns.
GCTObject = gct.GCT(path_to_gctx_file)
GCTObject.read(cid=column_ids)
print(GCTObject.matrix)

# --- Metadata -----------------------------------------------------------
# Names of the metadata fields available for columns and for rows.
column_headers = GCTObject.get_chd()
row_headers = GCTObject.get_rhd()

# Perturbagen description field, taken from the column metadata.
inames = GCTObject.get_column_meta("pert_iname")

# Gene symbol field, taken from the row metadata.
symbols = GCTObject.get_row_meta("pr_gene_symbol")
Пример #5
0
def main():
    """Command-line entry point for the mutation impact comparison.

    Parses the command-line options, checks that --sig_info, --gctx, -c
    and -o were supplied, runs the comparison and exits with status 0.

    Files written (all derived from the -o prefix):
      <prefix>.txt            one tab-separated row per non-WT allele
      <prefix>_rep_null.txt   replicate null distribution
      <prefix>_conn_null.txt  connectivity null distribution (only when
                              --conn_null was not given)
    """
    opt_parser = OptionParser()

    # Add Options. Required options should have default=None
    opt_parser.add_option("--sig_info",
                          dest="sig_info",
                          type="string",
                          help="""sig info file with gene information and distil
                                  information""",
                          default=None)
    opt_parser.add_option("--gctx",
                          dest="gctx",
                          type="string",
                          help="""GCTX file of pairwise similarity. Designed to
                                  use connectivity score, but any measurement of
                                  similarity can be used (e.g., pearson
                                  correlation)""",
                          default=None)
    opt_parser.add_option("--allele_col",
                          dest="allele_col",
                          type="string",
                          help="""Column name that indicates the allele names.
                                  DEF=%s""" % DEF_ALLELE_COL,
                          default=DEF_ALLELE_COL)
    opt_parser.add_option("-o",
                          dest="output_file_prefix",
                          type="string",
                          help="""Prefix of output files of mutation impact data.
                                  Includes figures of p-value distribution.""",
                          default=None)
    opt_parser.add_option("-r",
                          dest="reference_test_file",
                          type="string",
                          help="""File explicitly indicating which comparisons
                                  to do. Assumes the file has a header and it is
                                  ignored. The first column is the reference
                                  allele and second column is test allele. If
                                  this file is not given, then the reference
                                  alleles are assumed to be WT and inferred from
                                  the allele names.""",
                          default=None)
    opt_parser.add_option("-c",
                          dest="controls_file",
                          type="string",
                          help=""".grp file containing allele names of control
                                  perturbations. If this file is given, a null
                                  will be calculated from these""",
                          default=None)
    opt_parser.add_option("-i",
                          dest="num_iterations",
                          type="int",
                          help="Number of iterations to run. DEF=%d" % NUM_ITERATIONS,
                          default=NUM_ITERATIONS)
    opt_parser.add_option("--conn_null",
                          dest="conn_null_input",
                          type="string",
                          help="""Optional file containing connectvity null values from a previous
                                  run. Should end in _conn_null.txt""",
                          default=None)
    opt_parser.add_option("--ie_col",
                          dest="ie_col",
                          type="string",
                          help="""Name of the column with infection efficiency
                                  information. DEF=%s""" % DEF_IE_COL,
                          default=DEF_IE_COL)
    opt_parser.add_option("--ie_filter",
                          dest="ie_filter",
                          type="float",
                          help="""Threshold for infection efficiency. Any wildtype
                                  or mutant alleles having an ie below this
                                  threshold, will be removed""",
                          default=None)
    opt_parser.add_option("--num_reps",
                          dest="num_reps",
                          type="int",
                          help="""Number of replicates expected for each allele.
                                  DEF=%d""" % NUM_REPS,
                          default=NUM_REPS)
    opt_parser.add_option("--cell_id",
                          dest="cell_id",
                          type="string",
                          help="""Optional: Will only look at signatures from this cell
                                  line. Helps to filter sig_info file.""",
                          default=None)
    opt_parser.add_option("--plate_id",
                          dest="plate_id",
                          type="string",
                          help="""Optional: Will only look at signatures from
                                  this plate.""",
                          default=None)

    (options, args) = opt_parser.parse_args()

    # validate the command line arguments
    opt_parser.check_required("--sig_info")
    opt_parser.check_required("--gctx")
    opt_parser.check_required("-c")
    opt_parser.check_required("-o")

    # Both files are opened up front so that a bad path fails before any
    # heavy work starts; the context manager closes (and flushes) them on
    # every exit path, including errors.
    with open(options.sig_info) as sig_info_file, \
            open(options.output_file_prefix + ".txt", "w") as output_file:
        _run_mutation_impact(options, sig_info_file, output_file)

    sys.exit(0)


def _bh_adjust(pvals):
    """Return the "BH"-adjusted p-values computed by R's p.adjust."""
    return robjects.r['p.adjust'](robjects.FloatVector(pvals), "BH")


def _write_values(path, values):
    """Write every value on its own line, formatted with "%f", to path."""
    with open(path, "w") as handle:
        for value in values:
            handle.write("%f\n" % value)


def _run_mutation_impact(options, sig_info_file, output_file):
    """Compute the per-allele comparisons and write all result files.

    options: parsed command-line options (see main).
    sig_info_file: open, readable sig info file; handed to parse_sig_info.
    output_file: open, writable file that receives the result table.
    """
    controls = grp.read_grp(options.controls_file)

    ref2test_allele = None
    if options.reference_test_file:
        ref2test_allele = parseRefTestFile(options.reference_test_file)

    this_gctx = gct.GCT(options.gctx)
    this_gctx.read()

    num_reps = options.num_reps

    # A connectivity null from a previous run may be supplied. It is
    # materialised as a list because it is reused for every allele below.
    conn_null_input = options.conn_null_input
    conn_nulls_from_input = None
    if conn_null_input:
        conn_nulls_from_input = [float(value)
                                 for value in grp.read_grp(conn_null_input)]

    (allele2distil_id,
     allele2WT,
     allele2gene,
     allele2cell_id,
     WT_alleles) = parse_sig_info(sig_info_file,
                                  ref2test_allele,
                                  options.allele_col,
                                  options.ie_col, options.ie_filter,
                                  options.cell_id,
                                  options.plate_id)

    # Only controls that are present in allele2distil_id can be used.
    clean_controls = [this_control for this_control in controls
                      if this_control in allele2distil_id]

    replicate_null_dist, connectivity_null_dist = getNullDist(this_gctx,
                                                              allele2distil_id,
                                                              clean_controls,
                                                              options.num_iterations,
                                                              num_reps)
    if conn_null_input:
        connectivity_null_dist = conn_nulls_from_input

    # Persist the null distributions; the connectivity null is only
    # written when it was computed here rather than supplied.
    _write_values(options.output_file_prefix + "_rep_null.txt",
                  replicate_null_dist)
    if not conn_null_input:
        _write_values(options.output_file_prefix + "_conn_null.txt",
                      connectivity_null_dist)

    # WT null data
    # {WT_allele:{"wt_rep": med_wt_rep,
    #             "wt_rep_dist":[]
    #             "wt_rep_pval": p_val vs null}
    # The other two returned values are not used by this report.
    WT_dict, _wt_rep_pvals, _wt_ordered = buildWT_dict(this_gctx,
                                                       allele2distil_id,
                                                       WT_alleles,
                                                       replicate_null_dist,
                                                       num_reps)

    # Print header to output file
    output_file.write("gene\tmut\tmut_rep\twt_rep\tmut_wt_connectivity\t"
                      "wt\tcell_line\t"
                      "mut_wt_rep_pval\tmut_wt_conn_null_pval\twt_mut_rep_vs_wt_mut_conn_pval\t"
                      "mut_wt_rep_c_pval\tmut_wt_conn_null_c_pval\t"
                      "wt_mut_rep_vs_wt_mut_conn_c_pval\n")

    # NOTE(review): mut_rep_pvals is collected but not written anywhere;
    # kept so the getPairwiseComparisons call sequence is unchanged.
    mut_rep_pvals = []
    mut_wt_rep_pvals = []
    mut_wt_conn_pvals = []
    mut_wt_rep_vs_wt_mut_conn_pvals = []

    outlines = []

    # Build comparison
    for allele in allele2WT:
        wt_allele = allele2WT[allele]

        # Don't calculate for the WT allele
        if allele == wt_allele:
            continue

        mut_rankpt, mut_rankpt_dist = getSelfConnectivity(this_gctx,
                                                          allele2distil_id[allele],
                                                          num_reps)

        self_pval = getPairwiseComparisons(mut_rankpt_dist,
                                           replicate_null_dist)
        mut_rep_pvals.append(self_pval)

        mut_wt_conn_rankpt, mut_wt_conn_dist = getConnectivity(this_gctx,
                                                               allele2distil_id[allele],
                                                               allele2distil_id[wt_allele],
                                                               num_reps)

        conn_pval = getPairwiseComparisons(mut_wt_conn_dist,
                                           connectivity_null_dist)
        mut_wt_conn_pvals.append(conn_pval)

        wt_info = WT_dict[wt_allele]
        mut_wt_rep_pval = getPairwiseComparisons(mut_rankpt_dist,
                                                 wt_info["wt_rep_dist"])
        mut_wt_rep_pvals.append(mut_wt_rep_pval)

        # Through visualization, it makes more sense to do a Kruskal-Wallis
        # test to determine if the three categories are significantly
        # different.
        wt_mut_rep_vs_wt_mut_conn_pval = getKruskal(wt_info["wt_rep_dist"],
                                                    mut_rankpt_dist,
                                                    mut_wt_conn_dist)
        mut_wt_rep_vs_wt_mut_conn_pvals.append(wt_mut_rep_vs_wt_mut_conn_pval)

        out_elems = [allele2gene[allele],
                     allele,
                     "%f" % mut_rankpt,
                     "%f" % wt_info["wt_rep"],
                     "%f" % mut_wt_conn_rankpt,
                     wt_allele,
                     allele2cell_id[allele],
                     "%f" % mut_wt_rep_pval,
                     "%f" % conn_pval,
                     "%f" % wt_mut_rep_vs_wt_mut_conn_pval]
        outlines.append("\t".join(out_elems))

    # Calculate corrected pvalues
    mut_wt_rep_c_pvals = _bh_adjust(mut_wt_rep_pvals)
    mut_wt_conn_c_pvals = _bh_adjust(mut_wt_conn_pvals)
    mut_wt_rep_vs_wt_mut_conn_c_pvals = _bh_adjust(mut_wt_rep_vs_wt_mut_conn_pvals)

    # Write to file: each row is followed by its three corrected p-values
    for idx, outline in enumerate(outlines):
        output_file.write("%s\t%f\t%f\t%f\n" % (outline,
                                                mut_wt_rep_c_pvals[idx],
                                                mut_wt_conn_c_pvals[idx],
                                                mut_wt_rep_vs_wt_mut_conn_c_pvals[idx]))
Пример #6
0
def run_main(sig_info=None, gctx = None, allele_col = None, o = None, r = None,
             c = None, i = None, conn_null = None, ie_col = None,
             ie_filter = None, num_reps = None, cell_id = None, plate_id = None):
    """Run the mutation impact comparison from Python rather than the CLI.

    sig_info: path of the sig info file (opened for reading).
    gctx: path handed to gct.GCT and read in full.
    allele_col: allele column name, passed on to parse_sig_info.
    o: output prefix; results go to o + ".txt" and, when conn_null is
       not given, the connectivity null goes to o + "_conn_null.txt".
    r: optional reference/test file, parsed with parseRefTestFile.
    c: path of the controls .grp file.
    i: number of iterations (default 1000).
    conn_null: optional .grp-readable file of connectivity null values
       that replaces the computed connectivity null.
    ie_col: infection efficiency column name (default "x_ie_a549").
    ie_filter: infection efficiency threshold (default 0.0).
    num_reps: number of replicates (default 3).
    cell_id, plate_id: optional filters, passed on to parse_sig_info.

    Also prints allele2WT and each allele to stdout as it is visited.
    """
    # default values
    num_iterations = int(i) if i is not None else 1000
    # the default is the column *name*; the original referenced a bare,
    # undefined identifier here
    ie_col = str(ie_col) if ie_col is not None else "x_ie_a549"
    ie_filter = float(ie_filter) if ie_filter is not None else 0.0
    num_reps = int(num_reps) if num_reps is not None else 3

    # Both files are opened up front so that a bad path fails early; the
    # context manager closes (and flushes) them on every exit path.
    with open(sig_info) as sig_info_file, \
            open(o + ".txt", "w") as output_file:

        controls = grp.read_grp(c)

        ref2test_allele = None
        if r:
            ref2test_allele = parseRefTestFile(r)

        this_gctx = gct.GCT(gctx)
        this_gctx.read()

        # Materialised as a list because it is reused for every allele.
        conn_nulls_from_input = None
        if conn_null:
            conn_nulls_from_input = [float(value)
                                     for value in grp.read_grp(conn_null)]

        (allele2distil_id,
         allele2WT,
         allele2gene,
         allele2cell_id,
         WT_alleles) = parse_sig_info(sig_info_file,
                                      ref2test_allele,
                                      allele_col,
                                      ie_col, ie_filter,
                                      cell_id,
                                      plate_id)

        # Only controls that are present in allele2distil_id can be used.
        clean_controls = [this_control for this_control in controls
                          if this_control in allele2distil_id]

        # Always computed: the replicate null is needed even when the
        # connectivity null is supplied.
        replicate_null_dist, connectivity_null_dist = getNullDist(this_gctx,
                                                                  allele2distil_id,
                                                                  clean_controls,
                                                                  num_iterations,
                                                                  num_reps)

        if conn_null:
            # a supplied connectivity null overrides the computed one
            connectivity_null_dist = conn_nulls_from_input
        else:
            # otherwise persist the freshly computed one
            with open(o + "_conn_null.txt", "w") as conn_null_dist_out:
                for value in connectivity_null_dist:
                    conn_null_dist_out.write("%f\n" % value)

        # The other two returned values are not used by this report.
        WT_dict, _wt_rep_pvals, _wt_ordered = buildWT_dict(this_gctx,
                                                           allele2distil_id,
                                                           WT_alleles,
                                                           replicate_null_dist,
                                                           num_reps)

        # Print header to output file
        output_file.write("gene\tmut\tmut_rep\twt_rep\tmut_wt_connectivity\t"
                          "wt\tcell_line\t"
                          "mut_wt_rep_pval\tmut_wt_conn_null_pval\twt_mut_rep_vs_wt_mut_conn_pval\t"
                          "mut_wt_rep_c_pval\tmut_wt_conn_null_c_pval\twt_mut_rep_vs_wt_mut_conn_c_pval\n")

        # NOTE(review): mut_rep_pvals is collected but not written
        # anywhere; kept so the call sequence is unchanged.
        mut_rep_pvals = []
        mut_wt_rep_pvals = []
        mut_wt_conn_pvals = []
        mut_wt_rep_vs_wt_mut_conn_pvals = []

        outlines = []

        print(allele2WT)

        # Build comparison
        for allele in allele2WT:

            print(allele)

            wt_allele = allele2WT[allele]

            # Don't calculate for the WT allele
            if allele == wt_allele:
                continue

            mut_rankpt, mut_rankpt_dist = getSelfConnectivity(this_gctx,
                                                              allele2distil_id[allele],
                                                              num_reps)

            self_pval = getPairwiseComparisons(mut_rankpt_dist,
                                               replicate_null_dist)
            mut_rep_pvals.append(self_pval)

            mut_wt_conn_rankpt, mut_wt_conn_dist = getConnectivity(this_gctx,
                                                                   allele2distil_id[allele],
                                                                   allele2distil_id[wt_allele],
                                                                   num_reps)

            conn_pval = getPairwiseComparisons(mut_wt_conn_dist,
                                               connectivity_null_dist)
            mut_wt_conn_pvals.append(conn_pval)

            wt_info = WT_dict[wt_allele]
            mut_wt_rep_pval = getPairwiseComparisons(mut_rankpt_dist,
                                                     wt_info["wt_rep_dist"])
            mut_wt_rep_pvals.append(mut_wt_rep_pval)

            wt_mut_rep_vs_wt_mut_conn_pval = getKruskal(wt_info["wt_rep_dist"],
                                                        mut_rankpt_dist,
                                                        mut_wt_conn_dist)
            mut_wt_rep_vs_wt_mut_conn_pvals.append(wt_mut_rep_vs_wt_mut_conn_pval)

            out_elems = [allele2gene[allele],
                         allele,
                         "%f" % mut_rankpt,
                         "%f" % wt_info["wt_rep"],
                         "%f" % mut_wt_conn_rankpt,
                         wt_allele,
                         allele2cell_id[allele],
                         "%f" % mut_wt_rep_pval,
                         "%f" % conn_pval,
                         "%f" % wt_mut_rep_vs_wt_mut_conn_pval]
            outlines.append("\t".join(out_elems))

        # Calculate corrected pvalues ("BH" method of R's p.adjust)
        p_adjust = robjects.r['p.adjust']
        mut_wt_rep_c_pvals = p_adjust(robjects.FloatVector(mut_wt_rep_pvals), "BH")
        mut_wt_conn_c_pvals = p_adjust(robjects.FloatVector(mut_wt_conn_pvals), "BH")
        mut_wt_rep_vs_wt_mut_conn_c_pvals = p_adjust(
            robjects.FloatVector(mut_wt_rep_vs_wt_mut_conn_pvals), "BH")

        # Write to file: each row is followed by its three corrected
        # p-values
        for idx, outline in enumerate(outlines):
            output_file.write("%s\t%f\t%f\t%f\n" % (outline,
                                                    mut_wt_rep_c_pvals[idx],
                                                    mut_wt_conn_c_pvals[idx],
                                                    mut_wt_rep_vs_wt_mut_conn_c_pvals[idx]))