예제 #1
0
def test_genome_arg_to_load_vcf_cached_75():
    """A cached Ensembl release 75 genome equals the default load only when
    UCSC contig names are converted."""
    default_variants = load_vcf(HG19_VCF_FILENAME)
    converted_variants = load_vcf(
        HG19_VCF_FILENAME,
        genome=cached_release(75),
        convert_ucsc_contig_names=True)
    eq_(default_variants, converted_variants)

    # Without contig-name conversion the two collections must differ.
    default_variants = load_vcf(HG19_VCF_FILENAME)
    unconverted_variants = load_vcf(
        HG19_VCF_FILENAME,
        genome=cached_release(75),
        convert_ucsc_contig_names=False)
    assert default_variants != unconverted_variants
예제 #2
0
    def test_load_vcf_external():
        """Load the external VCF, plain and gzipped, and check both results."""
        for url in (VCF_EXTERNAL_URL, VCF_EXTERNAL_URL + ".gz"):
            variants = load_vcf(url)
            assert variants.reference_names() == {"GRCh37"}
            assert len(variants) == 14
예제 #3
0
    def test_load_vcf_external():
        """Load the external VCF, plain and gzipped, and check both results."""
        for url in (VCF_EXTERNAL_URL, VCF_EXTERNAL_URL + ".gz"):
            variants = load_vcf(url)
            assert variants.reference_names() == {"GRCh37"}
            assert len(variants) == 14
예제 #4
0
def test_vcf_number_entries_duplicates():
    """Duplicate VCF entries collapse to one variant only when distinct=True."""
    # There are 3 duplicated mutations listed in the VCF
    vcf_path = data_path("duplicates.vcf")
    for distinct, expected_count in ((True, 1), (False, 3)):
        variants = load_vcf(vcf_path, genome='hg38', distinct=distinct)
        assert len(variants) == expected_count
예제 #5
0
    def test_load_vcf_external():
        """Check reference names and variant count for the plain and gzipped
        hg19 VCF."""
        plain_variants = load_vcf(HG19_VCF_FILENAME)
        eq_(plain_variants.reference_names(), {"GRCh37"})
        eq_(plain_variants.original_reference_names(), {"hg19"})
        eq_(len(plain_variants), 14)

        gzipped_variants = load_vcf(HG19_VCF_FILENAME + ".gz")
        eq_(gzipped_variants.reference_names(), {"GRCh37"})
        eq_(len(gzipped_variants), 14)
예제 #6
0
def test_genome_arg_to_load_vcf_int_75():
    """Passing genome=75 equals the default load only when UCSC contig names
    are converted."""
    # if we use Ensembl 75 -- which is backed by GRCh37 -- then the two variant
    # collections will be the same as long as we also convert the contig names
    default_variants = load_vcf(HG19_VCF_FILENAME)
    converted_variants = load_vcf(
        HG19_VCF_FILENAME,
        genome=75,
        convert_ucsc_contig_names=True)
    eq_(default_variants, converted_variants)

    # Without contig-name conversion the two collections must differ.
    default_variants = load_vcf(HG19_VCF_FILENAME)
    unconverted_variants = load_vcf(
        HG19_VCF_FILENAME,
        genome=75,
        convert_ucsc_contig_names=False)
    assert default_variants != unconverted_variants
예제 #7
0
def test_vcf_number_entries_duplicates():
    """Duplicate VCF entries collapse to one variant only when distinct=True."""
    # There are 3 duplicated mutations listed in the VCF
    vcf_path = data_path("duplicates.vcf")
    for distinct, expected_count in ((True, 1), (False, 3)):
        variants = load_vcf(vcf_path, genome='hg38', distinct=distinct)
        assert len(variants) == expected_count
예제 #8
0
def test_sample_info_genotype():
    """Both variants from the multiallelic VCF carry GT '0/1' for the
    'metastasis' sample."""
    variants = load_vcf(data_path("multiallelic.vcf"))
    assert len(variants) == 2, "Expected 2 variants but got %s" % variants
    for index in (0, 1):
        sample_info = variants.metadata[variants[index]]['sample_info']
        eq_(sample_info['metastasis']['GT'], '0/1')
예제 #9
0
def _do_roundtrip_test(filenames):
    """Write the variants from `filenames` to a VCF and load them back.

    Each file is loaded with `load_vcf` or `load_maf` depending on its
    extension, the resulting collections are unioned, written to a temporary
    VCF with `variants_to_vcf`, and re-read with `load_vcf`.

    Parameters
    ----------
    filenames : list of str
        Data file names ending in "vcf" or "maf"; each is resolved through
        `data_path`. Must contain at least one entry.

    Returns
    -------
    tuple
        (variants, reparsed_variants): the original union and the collection
        parsed back from the temporary VCF.

    Raises
    ------
    KeyError
        If a file name's extension is neither "vcf" nor "maf".
    AssertionError
        If any pair of original/reparsed variants differs in contig, start,
        ref or alt.
    """
    loaders = {
        'vcf': load_vcf,
        'maf': load_maf
    }

    def load_fn(filename):
        # Dispatch on the text after the last dot of the file name.
        return loaders[filename.split('.')[-1]]

    def load_variants():
        variant_collections = [
            load_fn(filename)(data_path(filename))
            for filename in filenames
        ]
        return variant_collections[0].union(*variant_collections[1:])

    variants = load_variants()

    # delete=False so the file survives the `with` block and can be re-read
    # by path; NOTE: the temporary file is left on disk afterwards.
    with tempfile.NamedTemporaryFile(mode='w', delete=False) as f:
        metadata = _merge_metadata_naive(variants)
        variants_to_vcf(variants, metadata, out=f)
        tmp_name = f.name
    reparsed_variants = load_vcf(tmp_name)

    # `==` checks the reference genome, which won't necessarily match, so
    # compare the locus and both alleles field by field instead.
    assert all(
        v1.contig == v2.contig and
        v1.start == v2.start and
        v1.ref == v2.ref and
        v1.alt == v2.alt
        for (v1, v2) in zip(variants, reparsed_variants))

    return (variants, reparsed_variants)
예제 #10
0
def test_sample_info_genotype():
    """Both variants from the multiallelic VCF carry GT '0/1' for the
    'metastasis' sample."""
    variants = load_vcf(data_path("multiallelic.vcf"))
    assert len(variants) == 2, "Expected 2 variants but got %s" % variants
    for index in (0, 1):
        sample_info = variants.metadata[variants[index]]['sample_info']
        eq_(sample_info['metastasis']['GT'], '0/1')
예제 #11
0
def _do_roundtrip_test(filenames):
    """Write the variants from `filenames` to a VCF and load them back.

    Each file is loaded with `load_vcf` or `load_maf` depending on its
    extension, the resulting collections are unioned, written to a temporary
    VCF with `variants_to_vcf`, and re-read with `load_vcf`.

    Parameters
    ----------
    filenames : list of str
        Data file names ending in "vcf" or "maf"; each is resolved through
        `data_path`. Must contain at least one entry.

    Returns
    -------
    tuple
        (variants, reparsed_variants): the original union and the collection
        parsed back from the temporary VCF.

    Raises
    ------
    KeyError
        If a file name's extension is neither "vcf" nor "maf".
    AssertionError
        If any pair of original/reparsed variants differs in contig, start,
        ref or alt.
    """
    loaders = {'vcf': load_vcf, 'maf': load_maf}

    def load_fn(filename):
        # Dispatch on the text after the last dot of the file name.
        return loaders[filename.split('.')[-1]]

    def load_variants():
        variant_collections = [
            load_fn(filename)(data_path(filename))
            for filename in filenames
        ]
        return variant_collections[0].union(*variant_collections[1:])

    variants = load_variants()

    # delete=False so the file survives the `with` block and can be re-read
    # by path; NOTE: the temporary file is left on disk afterwards.
    with tempfile.NamedTemporaryFile(mode='w', delete=False) as f:
        metadata = _merge_metadata_naive(variants)
        variants_to_vcf(variants, metadata, out=f)
        tmp_name = f.name
    reparsed_variants = load_vcf(tmp_name)

    # `==` checks the reference genome, which won't necessarily match, so
    # compare the locus and both alleles field by field instead.
    assert all(
        v1.contig == v2.contig and
        v1.start == v2.start and
        v1.ref == v2.ref and
        v1.alt == v2.alt
        for (v1, v2) in zip(variants, reparsed_variants))

    return (variants, reparsed_variants)
예제 #12
0
def guess_ensembl_release(filepath):
    """Guess the Ensembl release for a VCF, falling back to the default.

    Loads `filepath` with `varcode.load_vcf` and returns the
    `ensembl.release` of its first variant. If that fails with any
    `Exception` (including an empty collection), `config.ENSEMBL_RELEASE`
    is returned instead.

    Parameters
    ----------
    filepath : str
        Path passed to `varcode.load_vcf`.

    Returns
    -------
    The release taken from the first variant, or `config.ENSEMBL_RELEASE`.
    """
    try:
        return varcode.load_vcf(filepath)[0].ensembl.release
    except Exception:
        # Best-effort guess: varcode could not infer a release (or could not
        # handle the file at all), so go with the configured default.
        # Returning here rather than from a `finally` clause lets
        # KeyboardInterrupt / SystemExit propagate instead of being masked.
        return config.ENSEMBL_RELEASE
예제 #13
0
def test_multiple_alleles_per_line():
    """A multiallelic VCF line is split into one Variant per alt allele."""
    variants = load_vcf(data_path("multiallelic.vcf"))
    assert len(variants) == 2, "Expected 2 variants but got %s" % variants
    expected_variants = set(
        Variant(1, 1431105, "A", alt, genome="GRCh37")
        for alt in ("C", "G"))
    eq_(set(variants), expected_variants)
예제 #14
0
 def do_test(kwargs):
     """Check that load_vcf_fast and load_vcf agree for the given kwargs."""
     fast_result = load_vcf_fast(**kwargs)
     pyvcf_result = load_vcf(**kwargs)
     eq_(fast_result, pyvcf_result)
     eq_(len(fast_result), len(pyvcf_result))
     for attribute in ("elements", "metadata"):
         eq_(getattr(fast_result, attribute), getattr(pyvcf_result, attribute))
     # Guard against both loaders trivially agreeing on a near-empty result.
     for collection in (fast_result, pyvcf_result):
         assert len(collection) > 1
예제 #15
0
 def do_test(kwargs):
     """Check that load_vcf_fast and load_vcf agree for the given kwargs."""
     fast_result = load_vcf_fast(**kwargs)
     pyvcf_result = load_vcf(**kwargs)
     eq_(fast_result, pyvcf_result)
     eq_(len(fast_result), len(pyvcf_result))
     for attribute in ("elements", "metadata"):
         eq_(getattr(fast_result, attribute), getattr(pyvcf_result, attribute))
     # Guard against both loaders trivially agreeing on a near-empty result.
     for collection in (fast_result, pyvcf_result):
         assert len(collection) > 1
예제 #16
0
def test_multiple_alleles_per_line():
    """A multiallelic VCF line is split into one Variant per alt allele."""
    variants = load_vcf(data_path("multiallelic.vcf"))
    assert len(variants) == 2, "Expected 2 variants but got %s" % variants
    loaded = list(variants)
    # Reuse the genome object attached to the loaded variants so the expected
    # variants are built against the same reference.
    ensembl = loaded[0].ensembl
    expected = set(
        Variant(1, 1431105, "A", alt, ensembl=ensembl)
        for alt in ("C", "G"))
    eq_(set(loaded), expected)
예제 #17
0
def variant_collection_from_args(args):
    """Build one VariantCollection from parsed command-line arguments.

    Variants are gathered, in order, from `args.vcf` (loaded with
    `load_vcf`), `args.maf` (loaded with `load_maf`), `args.variant`
    (tuples of chromosome, position, ref, alt) and
    `args.json_variant_files` (read and parsed with
    `VariantCollection.from_json`).

    Parameters
    ----------
    args : namespace
        Must provide `reference_name`, `vcf`, `maf`, `variant` and
        `json_variant_files` attributes.

    Returns
    -------
    VariantCollection
        The single loaded collection, or a new collection holding the
        variants of every loaded collection when there are several.

    Raises
    ------
    ValueError
        If `args.variant` is given without `args.reference_name`, or if no
        source yields a collection.
    """
    if args.reference_name:
        genome = genome_for_reference_name(args.reference_name)
    else:
        # no genome specified, assume it can be inferred from the file(s)
        # we're loading
        genome = None

    variant_collections = []
    for vcf_path in args.vcf:
        variant_collections.append(load_vcf(vcf_path, genome=genome))
    for maf_path in args.maf:
        variant_collections.append(load_maf(maf_path))

    if args.variant:
        if not genome:
            raise ValueError(
                "--reference-name must be specified when using --variant")

        variants = [
            Variant(
                chromosome,
                start=position,
                ref=ref,
                alt=alt,
                ensembl=genome)
            for (chromosome, position, ref, alt)
            in args.variant
        ]
        variant_collections.append(VariantCollection(variants))

    for json_path in args.json_variant_files:
        with open(json_path, 'r') as f:
            json_string = f.read()
        variant_collections.append(VariantCollection.from_json(json_string))

    # Check for "nothing loaded" only after every source, including the JSON
    # files, has been read; checking earlier rejected JSON-only input.
    if not variant_collections:
        raise ValueError(
            "No variants loaded (use --maf, --vcf, --variant, or "
            "--json-variants options)")

    if len(variant_collections) == 1:
        return variant_collections[0]

    combined_variants = []
    for variant_collection in variant_collections:
        combined_variants.extend(variant_collection)
    return VariantCollection(combined_variants)
예제 #18
0
def load_variants(vcf_file=None, maf_file=None, max_variants=None):
    """Load variants from a VCF file or, if none is given, from a MAF file.

    Parameters
    ----------
    vcf_file : str or None
        VCF to load with `varcode.load_vcf`; takes precedence over
        `maf_file` when both are given.
    maf_file : str or None
        MAF to load with `varcode.load_maf`; used only when `vcf_file` is
        None.
    max_variants : int or None
        Forwarded to `varcode.load_vcf`; not used for MAF files.

    Returns
    -------
    The collection returned by varcode. The number of variants read is also
    printed.

    Raises
    ------
    ValueError
        If neither `vcf_file` nor `maf_file` is provided.
    """
    # Fail early with a clear message instead of an UnboundLocalError at the
    # print statement below.
    if vcf_file is None and maf_file is None:
        raise ValueError("Either vcf_file or maf_file must be provided")

    import varcode
    if vcf_file is not None:
        variants = varcode.load_vcf(
            vcf_file,
            allow_extended_nucleotides=True,
            max_variants=max_variants)
        source_path = vcf_file
    else:
        variants = varcode.load_maf(maf_file)
        source_path = maf_file
    print('%s variants read from %s' % (len(variants), source_path))
    return variants
예제 #19
0
def test_genome_arg_to_load_vcf():
    """Loading with an explicit genome (release number, cached release
    object or reference name) equals the default load."""
    variants = load_vcf(VCF_FILENAME)
    for genome in (75, cached_release(75), "grch37", "GRCh37", "b37"):
        eq_(variants, load_vcf(VCF_FILENAME, genome=genome))
    # TODO: actually make hg19 different from b37! They should use
    # different MT sequences
    eq_(variants, load_vcf(VCF_FILENAME, genome="hg19"))
예제 #20
0
def test_genome_arg_to_load_vcf():
    """Loading with an explicit genome (release number, cached release
    object or reference name) equals the default load."""
    variants = load_vcf(VCF_FILENAME)
    for genome in (75, cached_release(75), "grch37", "GRCh37", "b37"):
        eq_(variants, load_vcf(VCF_FILENAME, genome=genome))
    # TODO: actually make hg19 different from b37! They should use
    # different MT sequences
    eq_(variants, load_vcf(VCF_FILENAME, genome="hg19"))
예제 #21
0
def variant_collection_from_args(args):
    """Build one VariantCollection from parsed command-line arguments.

    Variants are gathered, in order, from `args.vcf` (loaded with
    `load_vcf`), `args.maf` (loaded with `load_maf`), `args.variant`
    (tuples of chromosome, position, ref, alt) and
    `args.json_variant_files` (read and parsed with
    `VariantCollection.from_json`).

    Parameters
    ----------
    args : namespace
        Must provide `reference_name`, `vcf`, `maf`, `variant` and
        `json_variant_files` attributes.

    Returns
    -------
    VariantCollection
        The single loaded collection, or a new collection holding the
        variants of every loaded collection when there are several.

    Raises
    ------
    ValueError
        If `args.variant` is given without `args.reference_name`, or if no
        source yields a collection.
    """
    if args.reference_name:
        genome = genome_for_reference_name(args.reference_name)
    else:
        # no genome specified, assume it can be inferred from the file(s)
        # we're loading
        genome = None

    variant_collections = []
    for vcf_path in args.vcf:
        variant_collections.append(load_vcf(vcf_path, genome=genome))
    for maf_path in args.maf:
        variant_collections.append(load_maf(maf_path))

    if args.variant:
        if not genome:
            raise ValueError(
                "--reference-name must be specified when using --variant")

        variants = [
            Variant(chromosome,
                    start=position,
                    ref=ref,
                    alt=alt,
                    ensembl=genome)
            for (chromosome, position, ref, alt) in args.variant
        ]
        variant_collections.append(VariantCollection(variants))

    for json_path in args.json_variant_files:
        with open(json_path, 'r') as f:
            json_string = f.read()
        variant_collections.append(VariantCollection.from_json(json_string))

    # Check for "nothing loaded" only after every source, including the JSON
    # files, has been read; checking earlier rejected JSON-only input.
    if not variant_collections:
        raise ValueError(
            "No variants loaded (use --maf, --vcf, --variant, or "
            "--json-variants options)")

    if len(variant_collections) == 1:
        return variant_collections[0]

    combined_variants = []
    for variant_collection in variant_collections:
        combined_variants.extend(variant_collection)
    return VariantCollection(combined_variants)
예제 #22
0
def test_load_vcf_local():
    """The local VCF loads identically from plain, gzipped, file:// and
    extra-slash paths."""
    paths = (
        VCF_FILENAME,
        VCF_FILENAME + ".gz",
        "file://%s" % VCF_FILENAME,
        "file://%s.gz" % VCF_FILENAME,
        # An extra slash before an absolute path can confuse URL parsing.
        # Test that it can still be opened:
        "/%s" % VCF_FILENAME,
    )
    for path in paths:
        variants = load_vcf(path)
        assert variants.reference_names() == {"GRCh37"}
        assert len(variants) == 14
예제 #23
0
def test_load_vcf_local():
    """The local VCF loads identically from plain, gzipped, file:// and
    extra-slash paths."""
    paths = (
        VCF_FILENAME,
        VCF_FILENAME + ".gz",
        "file://%s" % VCF_FILENAME,
        "file://%s.gz" % VCF_FILENAME,
        # An extra slash before an absolute path can confuse URL parsing.
        # Test that it can still be opened:
        "/%s" % VCF_FILENAME,
    )
    for path in paths:
        variants = load_vcf(path)
        assert variants.reference_names() == {"GRCh37"}
        assert len(variants) == 14
예제 #24
0
def _build_variant_table(in_file, out_file, genome_version='GRCh37'):
    """Write a gzipped TSV of protein-modifying variant effects for a VCF.

    The VCF is loaded with varcode, its effects are computed, silent and
    noncoding effects are dropped, and one row is emitted per remaining
    effect whose `modifies_protein_sequence` is true. The `effect_type`
    column is lower-cased and rows are de-duplicated on
    (`protein_id`, `aa_variant`) before writing.

    Parameters
    ----------
    in_file : str
        VCF path passed to `varcode.load_vcf`.
    out_file : str
        Destination passed to `DataFrame.to_csv` (gzip-compressed,
        tab-separated, no index).
    genome_version : str
        Passed as `genome` to `varcode.load_vcf`.

    When no effect qualifies, a header-only table is written rather than
    failing on the missing `effect_type` column.
    """
    import varcode

    # (column name, value extractor) pairs; the list order is the column
    # order of the output table.
    column_getters = [
        ('gene_id', lambda eff: eff.gene.gene_id),
        ('gene_name', lambda eff: eff.gene.gene_name),
        ('transcript_id', lambda eff: eff.transcript_id),
        ('transcript_name', lambda eff: eff.transcript_name),
        ('protein_id', lambda eff: eff.transcript.protein_id),
        ('chrom', lambda eff: 'chr{}'.format(eff.gene.contig)),
        ('nuc_variant', lambda eff: eff.variant.short_description),
        ('aa_variant', lambda eff: eff.short_description),
        ('beg', lambda eff: eff.variant.start),
        ('end', lambda eff: eff.variant.end),
        ('gene_beg', lambda eff: eff.gene.start),
        ('gene_end', lambda eff: eff.gene.end),
        ('nuc_ref', lambda eff: eff.variant.ref),
        ('nuc_alt', lambda eff: eff.variant.alt),
        # Not every effect object defines aa_ref / aa_alt; default to ''.
        ('aa_ref', lambda eff: getattr(eff, 'aa_ref', '')),
        ('aa_alt', lambda eff: getattr(eff, 'aa_alt', '')),
        ('aa_mutation_beg_offset', lambda eff: eff.aa_mutation_start_offset),
        ('aa_mutation_end_offset', lambda eff: eff.aa_mutation_end_offset),
        ('prot_ref', lambda eff: eff.original_protein_sequence),
        ('prot_alt', lambda eff: eff.mutant_protein_sequence),
        # The effect's string form up to the first "(" is used as its type.
        ('effect_type', lambda eff: str(eff).split("(")[0]),
    ]
    columns = [name for (name, _) in column_getters]

    variants = varcode.load_vcf(in_file, genome=genome_version)
    effects = variants.effects().drop_silent_and_noncoding()

    rows = []
    for eff in effects:
        if not eff.modifies_protein_sequence:
            continue
        rows.append(OrderedDict(
            (name, getter(eff)) for (name, getter) in column_getters))

    # Passing `columns` explicitly keeps the schema even when `rows` is
    # empty, so the column accesses below cannot raise KeyError.
    df = pd.DataFrame(rows, columns=columns)

    df['effect_type'] = df['effect_type'].str.lower()

    df = df.drop_duplicates(['protein_id', 'aa_variant'])

    df.to_csv(out_file, compression='gzip', index=False, sep='\t')
예제 #25
0
def run():
    """Load the VCF named on the command line, then print how long loading
    took and a short preview of the result."""
    args = parser.parse_args()

    # Only the fast loader receives extra options; info parsing is switched
    # off unless args.info_field is set.
    extra_args = {} if args.info_field else {"include_info": False}

    start = time.time()

    if args.pyvcf:
        loader = varcode.load_vcf
        loader_kwargs = {}
    else:
        loader = varcode.load_vcf_fast
        loader_kwargs = extra_args
    result = loader(args.path, allow_extended_nucleotides=True,
                    **loader_kwargs)

    num_variants = len(result)
    elapsed = time.time() - start
    print("Loaded %d variants in %0.3f sec. " % (num_variants, elapsed))
    print(result.to_string(limit=5))
예제 #26
0
def test_genome_arg_to_load_vcf_grch37():
    """Naming GRCh37 explicitly (in either letter case) equals the default
    load only when UCSC contig names are converted."""
    for genome_name in ("grch37", "GRCh37"):
        default_variants = load_vcf(HG19_VCF_FILENAME)
        converted_variants = load_vcf(
            HG19_VCF_FILENAME,
            genome=genome_name,
            convert_ucsc_contig_names=True)
        eq_(default_variants, converted_variants)

    # Without contig-name conversion the two collections must differ.
    default_variants = load_vcf(HG19_VCF_FILENAME)
    unconverted_variants = load_vcf(
        HG19_VCF_FILENAME,
        genome="grch37",
        convert_ucsc_contig_names=False)
    assert default_variants != unconverted_variants
예제 #27
0
def run():
    """Load the VCF named on the command line, then print how long loading
    took and a short preview of the result."""
    args = parser.parse_args()

    # Only the fast loader receives extra options; info parsing is switched
    # off unless args.info_field is set.
    extra_args = {} if args.info_field else {"include_info": False}

    start = time.time()

    if args.pyvcf:
        loader = varcode.load_vcf
        loader_kwargs = {}
    else:
        loader = varcode.load_vcf_fast
        loader_kwargs = extra_args
    result = loader(args.path, allow_extended_nucleotides=True,
                    **loader_kwargs)

    num_variants = len(result)
    elapsed = time.time() - start
    print("Loaded %d variants in %0.3f sec. " % (num_variants, elapsed))
    print(result.to_string(limit=5))
예제 #28
0
def test_vcf_gene_names():
    """Yield one gene-name check per variant in the VCF (generator test)."""
    collection = load_vcf(VCF_FILENAME)
    for single_variant in collection:
        check_args = (_check_variant_gene_name, collection, single_variant)
        yield check_args
예제 #29
0
def load_vcf(vcf_path, genome=None):
    """Resolve `vcf_path` with `data_path` and load it with
    `varcode.load_vcf`, forwarding `genome`."""
    resolved_path = data_path(vcf_path)
    return varcode.load_vcf(resolved_path, genome=genome)
예제 #30
0
def test_vcf_number_entries():
    """All entries of the VCF are parsed."""
    # there are 14 mutations listed in the VCF, make sure they are all parsed
    num_variants = len(load_vcf(VCF_FILENAME))
    assert num_variants == 14, \
        "Expected 14 mutations, got %d" % (num_variants,)
예제 #31
0
def test_vcf_number_entries():
    """All entries of the VCF are parsed."""
    # there are 14 mutations listed in the VCF, make sure they are all parsed
    num_variants = len(load_vcf(VCF_FILENAME))
    assert num_variants == 14, \
        "Expected 14 mutations, got %d" % (num_variants,)
예제 #32
0
def test_vcf_reference_name():
    """The loaded collection reports GRCh37 as its only reference name."""
    reference_names = load_vcf(VCF_FILENAME).reference_names()
    # after normalization, hg19 should be remapped to GRCh37
    assert reference_names == {"GRCh37"}
예제 #33
0
def test_load_vcf_mouse_with_explicit_urls():
    """The mouse VCF yields 217 variants when loaded with
    `explicit_url_genome`."""
    variant_count = len(load_vcf(MOUSE_VCF, genome=explicit_url_genome))
    eq_(variant_count, 217)
예제 #34
0
def test_load_vcf_mouse_with_inferred_genome():
    """The mouse VCF yields 217 variants when no genome is passed."""
    variant_count = len(load_vcf(MOUSE_VCF))
    eq_(variant_count, 217)
예제 #35
0
def test_load_vcf_mouse_with_ensembl_release():
    """The mouse VCF yields 217 variants when loaded with
    `ensembl_mouse_genome`."""
    variant_count = len(load_vcf(MOUSE_VCF, genome=ensembl_mouse_genome))
    eq_(variant_count, 217)
예제 #36
0
def test_load_vcf_mouse_with_explicit_urls():
    """The mouse VCF yields 217 variants when loaded with
    `explicit_url_genome`."""
    variant_count = len(load_vcf(MOUSE_VCF, genome=explicit_url_genome))
    eq_(variant_count, 217)
예제 #37
0
파일: main.py 프로젝트: Saintyven/isovar
def run_isovar(
        variants,
        alignment_file,
        transcript_id_whitelist=None,
        read_collector=None,
        protein_sequence_creator=None,
        filter_thresholds=DEFAULT_FILTER_THRESHOLDS,
        filter_flags=DEFAULT_FILTER_FLAGS,
        min_shared_fragments_for_phasing=MIN_SHARED_FRAGMENTS_FOR_PHASING,
        decompression_threads=1):
    """
    Main entry point of the Isovar library: gather RNA read evidence for
    each variant, translate the supporting reads into protein sequences,
    apply filters and annotate phasing.

    Parameters
    ----------
    variants : varcode.VariantCollection or str
        Somatic variants. A string is loaded with `load_vcf`.

    alignment_file : pysam.AlignmentFile or str
        Aligned tumor RNA reads. A string is opened as an `AlignmentFile`
        using `decompression_threads` threads.

    transcript_id_whitelist : set of str or None
        Passed both to the protein sequence creator and to
        `top_varcode_effect` to restrict which transcripts are considered.

    read_collector : ReadCollector or None
        Object used to collect ReadEvidence for each variant; a
        default-constructed ReadCollector is used when None.

    protein_sequence_creator : ProteinSequenceCreator or None
        Object used to turn (Variant, ReadEvidence) into sorted protein
        sequences; a default-constructed ProteinSequenceCreator is used
        when None.

    filter_thresholds : dict or OrderedDict
        Dictionary whose entries have names like "min_num_alt_reads"
        mapping to a numerical threshold value. In general, the keys
        must start with either "min_" or "max_" followed by a property
        of the IsovarResult class.

    filter_flags : list of str
        List of boolean fields of IsovarResult used for filtering,
        they can also be negated by prepending "not_",
        such as "not_has_protein_sequence".

    min_shared_fragments_for_phasing : int
        Passed through to `annotate_phased_variants`.

    decompression_threads : int
        Number of threads used by htslib to decompress BAM/CRAM
        files; only used when `alignment_file` is given as a string.

    Returns
    -------
    The value returned by `annotate_phased_variants` for the list of
    filtered IsovarResult objects, one per (variant, read evidence) pair
    (presumably a list of IsovarResult -- confirm against that helper).
    """
    # Accept file paths as well as already-loaded objects.
    if isinstance(variants, string_types):
        variants = load_vcf(variants)

    if isinstance(alignment_file, string_types):
        alignment_file = AlignmentFile(
            alignment_file,
            threads=decompression_threads)

    # Fall back to default-configured collaborators.
    if read_collector is None:
        read_collector = ReadCollector()

    if protein_sequence_creator is None:
        protein_sequence_creator = ProteinSequenceCreator()

    # generator of (Variant, ReadEvidence) pairs
    variant_evidence_pairs = read_collector.read_evidence_generator(
        variants=variants,
        alignment_file=alignment_file)

    filtered_results = []
    for variant, read_evidence in variant_evidence_pairs:
        # protein sequences assembled from the variant-supporting reads
        sorted_sequences = (
            protein_sequence_creator.sorted_protein_sequences_for_variant(
                variant=variant,
                read_evidence=read_evidence,
                transcript_id_whitelist=transcript_id_whitelist))
        # effect predicted from the variant alone
        predicted_effect = top_varcode_effect(
            variant=variant,
            transcript_id_whitelist=transcript_id_whitelist)
        unfiltered_result = IsovarResult(
            variant=variant,
            predicted_effect=predicted_effect,
            read_evidence=read_evidence,
            sorted_protein_sequences=sorted_sequences)
        filtered_results.append(
            apply_filters(
                unfiltered_result,
                filter_thresholds=filter_thresholds,
                filter_flags=filter_flags))

    return annotate_phased_variants(
        filtered_results,
        min_shared_fragments_for_phasing)
예제 #38
0
def test_vcf_reference_name():
    """The loaded collection reports GRCh37 as its only reference name."""
    reference_names = load_vcf(VCF_FILENAME).reference_names()
    # after normalization, hg19 should be remapped to GRCh37
    assert reference_names == {"GRCh37"}
예제 #39
0
def test_load_vcf_mouse_with_ensembl_release():
    """The mouse VCF yields 217 variants when loaded with
    `ensembl_mouse_genome`."""
    variant_count = len(load_vcf(MOUSE_VCF, genome=ensembl_mouse_genome))
    eq_(variant_count, 217)
예제 #40
0
def test_load_vcf_mouse_with_inferred_genome():
    """The mouse VCF yields 217 variants when no genome is passed."""
    variant_count = len(load_vcf(MOUSE_VCF))
    eq_(variant_count, 217)
예제 #41
0
def test_genome_arg_to_load_vcf_hg19():
    """Passing genome="hg19" gives the same variants as the default load."""
    default_variants = load_vcf(HG19_VCF_FILENAME)
    hg19_variants = load_vcf(HG19_VCF_FILENAME, genome="hg19")
    eq_(default_variants, hg19_variants)
예제 #42
0
def load_vcf(vcf_path, genome=None):
    """Resolve `vcf_path` with `data_path` and load it with
    `varcode.load_vcf`, forwarding `genome`."""
    resolved_path = data_path(vcf_path)
    return varcode.load_vcf(resolved_path, genome=genome)
예제 #43
0
def test_genome_arg_to_load_vcf_b37():
    """Passing genome="b37" with contig-name conversion gives the same
    variants as the default load."""
    default_variants = load_vcf(HG19_VCF_FILENAME)
    b37_variants = load_vcf(
        HG19_VCF_FILENAME,
        genome="b37",
        convert_ucsc_contig_names=True)
    eq_(default_variants, b37_variants)
예제 #44
0
def test_vcf_gene_names():
    """Yield one gene-name check per variant in the VCF (generator test)."""
    collection = load_vcf(VCF_FILENAME)
    for single_variant in collection:
        check_args = (_check_variant_gene_name, collection, single_variant)
        yield check_args