Python VCF примеры использования

Язык программирования: Python

Пространство имен/Пакет: CGAT

Класс/Тип: VCF

Примеров на hotexamples.com: 3

Python VCF — найдено 3 примера. Это лучшие примеры кода на Python для CGAT.VCF, полученные из проектов с открытым исходным кодом. Вы можете ставить оценку каждому примеру, чтобы помочь нам улучшить качество примеров.

Основные методы

Показать Скрыть

VCFFile (3)

Основные методы

VCFFile (3)

Пример #1

Показать файл

Файл: vcf2vcf.py Проект: AndreasHegerGenomics/cgat-apps

def main(argv=sys.argv):
    '''Script entry point: copy a VCF from stdin to stdout.

    The input is read from ``options.stdin`` and written to
    ``options.stdout``.  When the ``reorder-samples`` method is selected,
    the order given by ``--sort-order`` (a comma-separated list of sample
    names, or ``alphabetical``) is handed to the header writer and assigned
    to each record's ``order`` attribute before the record is written.

    Arguments
    ---------
    argv : list
        Command line arguments; defaults to ``sys.argv``.

    Raises
    ------
    ValueError
        If no ``--method`` was given on the command line.
    '''

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("--method",
                      dest="methods",
                      type="choice",
                      action="append",
                      choices=("reorder-samples", ),
                      help="method to apply [default=%default]")

    parser.add_option("--sort-order",
                      dest="sort_order",
                      help="sort order for sample names. Give column names as "
                      "comma-separated list or specify ``alphabetical`` "
                      "[default=%default]")

    parser.set_defaults(
        methods=[],
        sort_order="alphabetical",
    )

    # Pass argv through explicitly; otherwise a caller-supplied argument
    # list would be silently ignored in favour of the process arguments.
    (options, args) = E.start(parser, argv=argv, add_pipe_options=True)

    if not options.methods:
        raise ValueError("no method specified")

    infile = VCF.VCFFile(options.stdin)

    # False means "leave the sample order untouched".
    sort_order = False
    if "reorder-samples" in options.methods and options.sort_order:
        sort_order = options.sort_order.split(",")
        if "alphabetical" in sort_order:
            sort_order = sorted(infile.samples)

    infile.writeHeader(options.stdout, order=sort_order)

    for vcf in infile:
        if sort_order:
            vcf.order = sort_order
        options.stdout.write(str(vcf) + "\n")

    E.stop()

Пример #2

Показать файл

Файл: vcf2vcf.py Проект: santayana/cgat

def main(argv=sys.argv):
    '''Script entry point: copy a VCF from stdin to stdout.

    The input is read from ``options.stdin`` and written to
    ``options.stdout``.  When ``--reorder`` is given (a comma-separated
    list of column names, or ``alphabetical``), that order is handed to
    the header writer and assigned to each record's ``order`` attribute
    before the record is written.

    Arguments
    ---------
    argv : list
        Command line arguments; defaults to ``sys.argv``.
    '''

    parser = E.OptionParser(
        version=
        "%prog version: $Id: bed2bed.py 2861 2010-02-23 17:36:32Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option(
        "--reorder",
        dest="reorder",
        type="string",
        help=
        "reorder columns. Give column names as comma-separated list or specify ``alphabetical`` [default=%default]"
    )

    parser.set_defaults(reorder=None)

    # Pass argv through explicitly; otherwise a caller-supplied argument
    # list would be silently ignored in favour of the process arguments.
    (options, args) = E.Start(parser, argv=argv, add_pipe_options=True)

    infile = VCF.VCFFile(options.stdin)

    # False means "leave the column order untouched".
    order = False
    if options.reorder:
        order = options.reorder.split(",")
        if "alphabetical" in order:
            order = sorted(infile.samples)

    infile.writeHeader(options.stdout, order=order)

    for vcf in infile:
        if order:
            vcf.order = order
        options.stdout.write(str(vcf) + "\n")

    E.Stop()

Пример #3

Показать файл

def extractEBioinfo(eBio_ids, vcfs, outfile):
    '''Tabulate, per tissue, how often the genes mutated in `vcfs` were
    found mutated in previous studies listed in `eBio_ids`.

    Gene names are taken from the ``SNPEFF_GENE_NAME`` info entries of all
    VCF records whose filter is not ``REJECT``.  For every study listed in
    `eBio_ids` the cBioPortal web service is queried in batches of 250
    genes and the per-gene counts are accumulated by tissue.

    Arguments
    ---------
    eBio_ids : str
        Path to a tab-separated file with three columns per line:
        tissue, study and table (genetic profile id).  Blank lines are
        skipped.
    vcfs : list
        Paths of the VCF files to collect gene names from.
    outfile : str
        Path of the output table.  One row per gene and one
        ``<tissue>_frequency`` column per tissue.
    '''

    genes = set()

    for vcf in vcfs:
        with IOTools.openFile(vcf) as vcf_handle:
            for vcf_entry in VCF.VCFFile(vcf_handle):
                # assumes all vcf entries without "REJECT" are "PASS"
                if vcf_entry.filter == "REJECT":
                    continue
                for entry in vcf_entry.info.split(";"):
                    if "SNPEFF_GENE_NAME" in entry:
                        genes.add(entry.split("=")[1])

    # tissue -> gene -> {"total", "mutations"} -> count
    tissue_counts = collections.defaultdict(
        lambda: collections.defaultdict(lambda: collections.defaultdict(int)))

    def chunks(items, size):
        ''' Yield successive size-sized chunks from items '''
        for start in range(0, len(items), size):
            yield items[start:start + size]

    # Materialise the set once so that batching and the output rows share
    # a single, fixed gene order.
    gene_names = list(genes)

    E.info("number of genes: %i" % len(gene_names))

    with IOTools.openFile(eBio_ids, "r") as eBio_file:
        for line in eBio_file:
            fields = line.strip()
            if not fields:
                # tolerate blank lines, e.g. at the end of the file
                continue
            tissue, study, table = fields.split("\t")

            n = 0

            # TS sporadic error when querying with a single gene at a time
            # "urllib2.URLError: <urlopen error [Errno 110] Connection timed out>"
            # max URL length appears to be 8200 characters,
            # so query 250 genes at a time.
            for genes_chunk in chunks(gene_names, 250):

                gene_list = "+".join(genes_chunk)

                n += len(genes_chunk)

                E.info("number of genes processed: %i" % n)

                url = (
                    "http://www.cbioportal.org/webservice.do?cmd=getProfileData&"
                    "case_set_id=%s_all&genetic_profile_id=%s&"
                    "gene_list=%s" % (study, table, gene_list))

                df = pd.read_csv(url,
                                 comment="#",
                                 sep="\t",
                                 index_col=0)

                for gene in genes_chunk:

                    matches = df[df['COMMON'] == gene]

                    # check dataframe contains data!
                    if matches.shape[0] == 0:
                        continue

                    # The selection may contain repeated rows for the same
                    # gene; only the first row is ever counted.
                    first_row = matches.iloc[0]

                    gene_counts = tissue_counts[tissue][gene]
                    gene_counts["total"] += first_row.size - 2
                    gene_counts["mutations"] += int(first_row.count()) - 1

    tissues = list(tissue_counts.keys())

    with IOTools.openFile(outfile, "w") as out:
        out.write(
            "gene\t%s\n" %
            "\t".join(["%s_frequency" % x.replace(" ", "_") for x in tissues]))

        for gene in gene_names:
            freq_values = []
            for tissue in tissues:
                total = tissue_counts[tissue][gene]["total"]
                mutations = tissue_counts[tissue][gene]["mutations"]
                # total is 0 for genes without data in this tissue;
                # np.divide then yields nan instead of raising.
                freq_values.append(
                    round(np.divide(float(mutations), total), 4))

            out.write("%s\t%s\n" % (gene, "\t".join(map(str, freq_values))))