Example #1
def main(argv=sys.argv):

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("--method",
                      dest="methods",
                      type="choice",
                      action="append",
                      choices=("reorder-samples", ),
                      help="method to apply [default=%default]")

    parser.add_option("--sort-order",
                      dest="sort_order",
                      help="sort order for sample names. Give column names as "
                      "comma-separated list or specify ``alphabetical`` "
                      "[default=%default]")

    parser.set_defaults(
        methods=[],
        sort_order="alphabetical",
    )

    (options, args) = E.start(parser, add_pipe_options=True)

    if not options.methods:
        raise ValueError("no method specified")

    infile = VCF.VCFFile(options.stdin)

    sort_order = False
    if "reorder-samples" in options.methods:
        if options.sort_order:
            sort_order = options.sort_order.split(",")
            if "alphabetical" in sort_order:
                sort_order = sorted(infile.samples)

    infile.writeHeader(options.stdout, order=sort_order)

    for vcf in infile:
        if sort_order:
            vcf.order = sort_order
        options.stdout.write(str(vcf) + "\n")

    E.stop()
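
The reordering in this example comes down to splitting the --sort-order value into column names and, when ``alphabetical`` is requested, replacing it with the sorted sample names taken from the VCF header. A minimal sketch of just that step, with hypothetical sample names standing in for infile.samples:

samples = ["tumour_B", "normal_A", "tumour_A"]   # hypothetical sample names

sort_order = "alphabetical"                      # value of --sort-order
order = sort_order.split(",")                    # explicit names or ['alphabetical']
if "alphabetical" in order:
    order = sorted(samples)

print(order)  # ['normal_A', 'tumour_A', 'tumour_B']
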
Example #2
def main(argv=sys.argv):

    parser = E.OptionParser(
        version="%prog version: $Id: bed2bed.py 2861 2010-02-23 17:36:32Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option(
        "--reorder",
        dest="reorder",
        type="string",
        help="reorder columns. Give column names as comma-separated list "
        "or specify ``alphabetical`` [default=%default]")

    parser.set_defaults(reorder=None)

    (options, args) = E.Start(parser, add_pipe_options=True)

    noutput = 0

    infile = VCF.VCFFile(options.stdin)

    if options.reorder:
        order = options.reorder.split(",")
        if "alphabetical" in order:
            order = sorted(infile.samples)
    else:
        order = False

    infile.writeHeader(options.stdout, order=order)

    for vcf in infile:
        if order:
            vcf.order = order
        options.stdout.write(str(vcf) + "\n")

    E.Stop()
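
Both examples delegate the actual shuffling to VCF.VCFFile and writeHeader(order=...). Independent of that class, reordering samples in a VCF record simply means permuting the per-sample fields that follow the nine fixed columns (CHROM..FORMAT). A minimal sketch on a hypothetical record:

# a VCF data line: 9 fixed columns, then one genotype column per sample
header_samples = ["tumour_B", "normal_A", "tumour_A"]
line = "chr1\t100\t.\tA\tG\t50\tPASS\t.\tGT\t0/1\t0/0\t1/1"

fields = line.split("\t")
fixed, genotypes = fields[:9], fields[9:]

order = sorted(header_samples)                    # ['normal_A', 'tumour_A', 'tumour_B']
index = {name: i for i, name in enumerate(header_samples)}
reordered = fixed + [genotypes[index[name]] for name in order]

print("\t".join(reordered))                       # genotypes now 0/0, 1/1, 0/1
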
Example #3
def extractEBioinfo(eBio_ids, vcfs, outfile):
    '''find the number of mutations identified in previous studies (eBio_ids)
    for the mutated genes in the vcfs'''

    genes = set()

    for vcf in vcfs:
        infile = VCF.VCFFile(IOTools.openFile(vcf))
        for vcf_entry in infile:
            # assumes all vcf entries without "REJECT" are "PASS"
            if vcf_entry.filter != "REJECT":
                info_entries = vcf_entry.info.split(";")
                for entry in info_entries:
                    if "SNPEFF_GENE_NAME" in entry:
                        genes.add(entry.split("=")[1])

    eBio_ids = IOTools.openFile(eBio_ids, "r")

    tissue_counts = collections.defaultdict(
        lambda: collections.defaultdict(lambda: collections.defaultdict(int)))
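    # i.e. a nested counter: tissue -> gene -> {"total", "mutations"} -> int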

    def chunks(l, n):
        ''' Yield successive n-sized chunks from l '''
        for i in range(0, len(l), n):
            yield l[i:i + n]
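    # Note: the loop below slices list(genes) into blocks of 250 by hand;
    # the chunks() helper above would give the same result, e.g.
    #   for genes_chunk in chunks(list(genes), 250): ...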

    # report how many genes were collected from the input vcfs
    E.info("number of genes: %i" % len(genes))

    for line in eBio_ids:
        tissue, study, table = line.strip().split("\t")

        n = 0

        for i in range(0, len(list(genes)), 250):

            genes_chunk = list(genes)[i:i + 250]

            # TS sporadic error when querying with a single gene at a time
            # "urllib2.URLError: <urlopen error [Errno 110] Connection timed out>"
            # max URL length appears to be 8200 characters,
            # try doing 250 genes at a time?

            gene_list = "+".join(list(genes_chunk))

            n += len(genes_chunk)

            E.info("number of genes processed: %i" % n)

            url = (
                "http://www.cbioportal.org/webservice.do?cmd=getProfileData&"
                "case_set_id=%(study)s_all&genetic_profile_id=%(table)s&"
                "gene_list=%(gene_list)s" % locals())

            df = pd.io.parsers.read_csv(url,
                                        comment="#",
                                        sep="\t",
                                        index_col=0)
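            # df is assumed to hold one row per queried gene, with the gene
            # symbol in a 'COMMON' column and one column per case; the counts
            # accumulated below ("total" cases, "mutations" non-missing
            # entries) rely on that layout of the getProfileData response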

            for gene in genes_chunk:

                tmp_df = df[df['COMMON'] == gene]

                # check dataframe contains data!
                if tmp_df.shape[0] != 0:
                    # the query can return duplicate rows for a gene; if so,
                    # slice to the first row and rebuild a one-row dataframe
                    # from the resulting series
                    if tmp_df.shape[0] > 1:
                        tmp_df = pd.DataFrame(tmp_df.iloc[0]).T

                    tissue_counts[tissue][gene]["total"] += tmp_df.shape[1] - 2
                    tissue_counts[tissue][gene]["mutations"] += int(
                        tmp_df.count(1)) - 1

    out = IOTools.openFile(outfile, "w")

    tissues = list(tissue_counts.keys())

    out.write(
        "gene\t%s\n" %
        "\t".join(["%s_frequency" % x.replace(" ", "_") for x in tissues]))

    for gene in genes:
        freq_values = []
        for tissue in tissues:
            total = tissue_counts[tissue][gene]["total"]
            mutations = tissue_counts[tissue][gene]["mutations"]
            freq_values.append(round(np.divide(float(mutations), total), 4))

        out.write("%s\t%s\n" % (gene, "\t".join(map(str, freq_values))))

    out.close()
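
The per-gene, per-tissue frequency written out above is simply mutations divided by total from the nested counter. A minimal sketch of that accumulation and the final division, using hypothetical counts in place of a cBioPortal query:

import collections

import numpy as np

# tissue -> gene -> {"total", "mutations"} -> int, mirroring tissue_counts
counts = collections.defaultdict(
    lambda: collections.defaultdict(lambda: collections.defaultdict(int)))

counts["breast"]["TP53"]["total"] += 100      # hypothetical cases profiled
counts["breast"]["TP53"]["mutations"] += 37   # hypothetical mutated cases

freq = round(np.divide(float(counts["breast"]["TP53"]["mutations"]),
                       counts["breast"]["TP53"]["total"]), 4)
print(freq)  # 0.37

Note that a gene never returned by any query keeps a total of 0, so np.divide produces nan (with a runtime warning) rather than raising an error.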