def test_genome_arg_to_load_vcf_cached_75():
    """A cached Ensembl 75 genome matches the default load only when UCSC
    contig names are converted."""
    baseline = load_vcf(HG19_VCF_FILENAME)
    converted = load_vcf(
        HG19_VCF_FILENAME,
        genome=cached_release(75),
        convert_ucsc_contig_names=True)
    eq_(baseline, converted)
    unconverted = load_vcf(
        HG19_VCF_FILENAME,
        genome=cached_release(75),
        convert_ucsc_contig_names=False)
    assert baseline != unconverted
def test_load_vcf_external():
    """The external VCF (plain and gzipped) parses to 14 GRCh37 variants."""
    # NOTE(review): a same-named test exists elsewhere in this codebase; if
    # both ended up in one module, only the later definition would run.
    for url in (VCF_EXTERNAL_URL, VCF_EXTERNAL_URL + ".gz"):
        variants = load_vcf(url)
        assert variants.reference_names() == {"GRCh37"}
        assert len(variants) == 14
def test_vcf_number_entries_duplicates():
    """`distinct=True` collapses the 3 duplicated mutations down to 1."""
    # There are 3 duplicated mutations listed in the VCF
    vcf_path = data_path("duplicates.vcf")
    for distinct_flag, expected_count in ((True, 1), (False, 3)):
        variants = load_vcf(vcf_path, genome='hg38', distinct=distinct_flag)
        assert len(variants) == expected_count
def test_load_vcf_external():
    """The hg19 VCF normalizes its reference name to GRCh37 on load."""
    # NOTE(review): despite the name, this exercises a local file
    # (HG19_VCF_FILENAME); a same-named test that actually hits an external
    # URL exists elsewhere — in one module the duplicate would shadow this.
    variants = load_vcf(HG19_VCF_FILENAME)
    eq_(variants.reference_names(), {"GRCh37"})
    eq_(variants.original_reference_names(), {"hg19"})
    eq_(len(variants), 14)
    gz_variants = load_vcf(HG19_VCF_FILENAME + ".gz")
    eq_(gz_variants.reference_names(), {"GRCh37"})
    eq_(len(gz_variants), 14)
def test_genome_arg_to_load_vcf_int_75():
    # if we use Ensembl 75 -- which is backed by GRCh37 -- then the two variant
    # collections will be the same as long as we also convert the contig names
    baseline = load_vcf(HG19_VCF_FILENAME)
    with_conversion = load_vcf(
        HG19_VCF_FILENAME,
        genome=75,
        convert_ucsc_contig_names=True)
    eq_(baseline, with_conversion)
    without_conversion = load_vcf(
        HG19_VCF_FILENAME,
        genome=75,
        convert_ucsc_contig_names=False)
    assert baseline != without_conversion
def test_sample_info_genotype():
    """Both alt alleles of the multiallelic site carry the 0/1 genotype
    for the 'metastasis' sample."""
    variants = load_vcf(data_path("multiallelic.vcf"))
    assert len(variants) == 2, "Expected 2 variants but got %s" % variants
    for variant in (variants[0], variants[1]):
        genotype = variants.metadata[variant]['sample_info']['metastasis']['GT']
        eq_(genotype, '0/1')
def _do_roundtrip_test(filenames):
    """Load the given VCF/MAF files, union them, serialize the union to a
    temporary VCF, reparse it, and check each reparsed variant against the
    original.

    Returns (original_variants, reparsed_variants).
    """
    def load_fn(filename):
        # pick the loader based on the file extension
        return {
            'vcf': load_vcf,
            'maf': load_maf
        }[filename.split('.')[-1]]

    def load_variants():
        variant_collections = []
        for filename in filenames:
            variant_collections.append(load_fn(filename)(data_path(filename)))
        return variant_collections[0].union(*variant_collections[1:])

    variants = load_variants()
    with tempfile.NamedTemporaryFile(mode='w', delete=False) as f:
        metadata = _merge_metadata_naive(variants)
        variants_to_vcf(variants, metadata, out=f)
        tmp_name = f.name
    reparsed_variants = load_vcf(tmp_name)
    # `==` checks the reference genome, which won't necessarily match.
    # BUG FIX: the original compared `start` twice and never compared `alt`,
    # so an alt-allele mismatch would have gone undetected.
    assert all(
        v1.contig == v2.contig and
        v1.start == v2.start and
        v1.ref == v2.ref and
        v1.alt == v2.alt
        for (v1, v2) in zip(variants, reparsed_variants))
    return (variants, reparsed_variants)
def _do_roundtrip_test(filenames):
    """Union the variants from the given VCF/MAF files, write them to a
    temporary VCF, reparse, and verify the roundtrip preserved each variant.

    Returns (original_variants, reparsed_variants).
    """
    def load_fn(filename):
        # choose a loader from the file extension
        return {'vcf': load_vcf, 'maf': load_maf}[filename.split('.')[-1]]

    def load_variants():
        variant_collections = []
        for filename in filenames:
            variant_collections.append(load_fn(filename)(data_path(filename)))
        return variant_collections[0].union(*variant_collections[1:])

    variants = load_variants()
    with tempfile.NamedTemporaryFile(mode='w', delete=False) as f:
        metadata = _merge_metadata_naive(variants)
        variants_to_vcf(variants, metadata, out=f)
        tmp_name = f.name
    reparsed_variants = load_vcf(tmp_name)
    # `==` checks the reference genome, which won't necessarily match.
    # BUG FIX: the original checked `v1.start == v2.start` twice and never
    # compared `alt`; a mismatched alt allele would have slipped through.
    assert all(
        v1.contig == v2.contig and
        v1.start == v2.start and
        v1.ref == v2.ref and
        v1.alt == v2.alt
        for (v1, v2) in zip(variants, reparsed_variants))
    return (variants, reparsed_variants)
def guess_ensembl_release(filepath):
    """Guess the Ensembl release for a VCF file via varcode, falling back to
    the configured default release when varcode can't tell or can't parse it.

    BUG FIX: the original used `finally: return release`, which silently
    swallows *any* exception raised in the try block — including
    KeyboardInterrupt/SystemExit, which the except clauses don't catch.
    Returning directly from each branch preserves the intended fallback
    behavior without suppressing uncatchable exceptions.
    """
    try:
        return varcode.load_vcf(filepath)[0].ensembl.release
    except ValueError:
        # no guesses from varcode, return default
        return config.ENSEMBL_RELEASE
    except Exception:
        # varcode cannot handle this one, so go w/ default
        return config.ENSEMBL_RELEASE
def test_multiple_alleles_per_line():
    """A multiallelic VCF line expands into one Variant per alt allele."""
    variants = load_vcf(data_path("multiallelic.vcf"))
    assert len(variants) == 2, "Expected 2 variants but got %s" % variants
    expected = {
        Variant(1, 1431105, "A", "C", genome="GRCh37"),
        Variant(1, 1431105, "A", "G", genome="GRCh37"),
    }
    eq_(set(list(variants)), expected)
def do_test(kwargs):
    """Both VCF loaders must agree on the collection, its elements and its
    metadata, and the collection must be non-trivial."""
    pandas_result = load_vcf_fast(**kwargs)
    pyvcf_result = load_vcf(**kwargs)
    eq_(pandas_result, pyvcf_result)
    eq_(len(pandas_result), len(pyvcf_result))
    eq_(pandas_result.elements, pyvcf_result.elements)
    eq_(pandas_result.metadata, pyvcf_result.metadata)
    assert len(pandas_result) > 1
    assert len(pyvcf_result) > 1
def test_multiple_alleles_per_line():
    """Each alt allele of a multiallelic line becomes its own Variant."""
    # NOTE(review): a same-named test (constructing Variants with
    # genome="GRCh37" instead of an ensembl object) exists elsewhere; in a
    # single module only one of the two definitions would run.
    variants = load_vcf(data_path("multiallelic.vcf"))
    assert len(variants) == 2, "Expected 2 variants but got %s" % variants
    variant_list = list(variants)
    genome = variant_list[0].ensembl
    expected = [
        Variant(1, 1431105, "A", "C", ensembl=genome),
        Variant(1, 1431105, "A", "G", ensembl=genome),
    ]
    eq_(set(variant_list), set(expected))
def variant_collection_from_args(args):
    """Build a VariantCollection from CLI args: --vcf, --maf, --variant and
    --json-variants, optionally forcing a reference genome via
    --reference-name.

    Raises ValueError when --variant is given without --reference-name, or
    when no variants are loaded from any source.

    BUG FIX: the original raised "No variants loaded" *before* reading the
    --json-variants files, so an invocation that supplied only JSON variant
    files always failed. That premature check is removed; the check after
    the JSON loop covers all sources.
    """
    variant_collections = []
    if args.reference_name:
        genome = genome_for_reference_name(args.reference_name)
    else:
        # no genome specified, assume it can be inferred from the file(s)
        # we're loading
        genome = None
    for vcf_path in args.vcf:
        vcf_variants = load_vcf(vcf_path, genome=genome)
        variant_collections.append(vcf_variants)
    for maf_path in args.maf:
        maf_variants = load_maf(maf_path)
        variant_collections.append(maf_variants)
    if args.variant:
        if not genome:
            raise ValueError(
                "--reference-name must be specified when using --variant")
        variants = [
            Variant(
                chromosome,
                start=position,
                ref=ref,
                alt=alt,
                ensembl=genome)
            for (chromosome, position, ref, alt) in args.variant
        ]
        variant_collection = VariantCollection(variants)
        variant_collections.append(variant_collection)
    for json_path in args.json_variant_files:
        with open(json_path, 'r') as f:
            json_string = f.read()
        variant_collections.append(
            VariantCollection.from_json(json_string))
    if len(variant_collections) == 0:
        raise ValueError(
            "No variants loaded (use --maf, --vcf, --json-variants options)")
    elif len(variant_collections) == 1:
        return variant_collections[0]
    else:
        combined_variants = []
        for variant_collection in variant_collections:
            combined_variants.extend(list(variant_collection))
        return VariantCollection(combined_variants)
def load_variants(vcf_file=None, maf_file=None, max_variants=None):
    """Load variants from a VCF or MAF file.

    Parameters
    ----------
    vcf_file : str or None
        Path to a VCF file; takes precedence over maf_file.
    maf_file : str or None
        Path to a MAF file, used when vcf_file is None.
    max_variants : int or None
        Cap on the number of variants read from a VCF.

    Raises
    ------
    ValueError
        When neither vcf_file nor maf_file is provided.
        (BUG FIX: the original fell through and crashed with a NameError
        on the unbound `variants`/`f` locals in that case.)
    """
    import varcode
    if vcf_file is not None:
        variants = varcode.load_vcf(
            vcf_file,
            allow_extended_nucleotides=True,
            max_variants=max_variants)
        source = vcf_file
    elif maf_file is not None:
        variants = varcode.load_maf(maf_file)
        source = maf_file
    else:
        raise ValueError("Either vcf_file or maf_file must be provided")
    print('%s variants read from %s' % (len(variants), source))
    return variants
def test_genome_arg_to_load_vcf():
    """Every accepted spelling of the GRCh37 genome yields the same variants."""
    baseline = load_vcf(VCF_FILENAME)
    genome_args = [
        75,
        cached_release(75),
        "grch37",
        "GRCh37",
        "b37",
        # TODO: actually make hg19 different from b37! They should use
        # different MT sequences
        "hg19",
    ]
    for genome_arg in genome_args:
        eq_(baseline, load_vcf(VCF_FILENAME, genome=genome_arg))
def variant_collection_from_args(args):
    """Assemble a VariantCollection from the CLI sources (--vcf, --maf,
    --variant, --json-variants), with an optional forced reference genome.

    Raises ValueError when --variant is used without --reference-name, or
    when no source yields any variants.

    BUG FIX: the original checked `len(variant_collections) == 0` and raised
    *before* the --json-variants files were read, making a JSON-only
    invocation impossible. The premature check is removed; the post-JSON
    check handles the empty case.
    """
    variant_collections = []
    if args.reference_name:
        genome = genome_for_reference_name(args.reference_name)
    else:
        # no genome specified, assume it can be inferred from the file(s)
        # we're loading
        genome = None
    for vcf_path in args.vcf:
        vcf_variants = load_vcf(vcf_path, genome=genome)
        variant_collections.append(vcf_variants)
    for maf_path in args.maf:
        maf_variants = load_maf(maf_path)
        variant_collections.append(maf_variants)
    if args.variant:
        if not genome:
            raise ValueError(
                "--reference-name must be specified when using --variant")
        variants = [
            Variant(chromosome, start=position, ref=ref, alt=alt, ensembl=genome)
            for (chromosome, position, ref, alt) in args.variant
        ]
        variant_collection = VariantCollection(variants)
        variant_collections.append(variant_collection)
    for json_path in args.json_variant_files:
        with open(json_path, 'r') as f:
            json_string = f.read()
        variant_collections.append(
            VariantCollection.from_json(json_string))
    if len(variant_collections) == 0:
        raise ValueError(
            "No variants loaded (use --maf, --vcf, --json-variants options)")
    elif len(variant_collections) == 1:
        return variant_collections[0]
    else:
        combined_variants = []
        for variant_collection in variant_collections:
            combined_variants.extend(list(variant_collection))
        return VariantCollection(combined_variants)
def test_load_vcf_local():
    """The local VCF loads identically from a plain path, a gzipped path,
    file:// URLs, and a path with an extra leading slash (which can confuse
    URL parsing)."""
    paths = [
        VCF_FILENAME,
        VCF_FILENAME + ".gz",
        "file://%s" % VCF_FILENAME,
        "file://%s.gz" % VCF_FILENAME,
        "/%s" % VCF_FILENAME,
    ]
    for path in paths:
        variants = load_vcf(path)
        assert variants.reference_names() == {"GRCh37"}
        assert len(variants) == 14
def _build_variant_table(in_file, out_file, genome_version='GRCh37'):
    """Build a per-effect variant table from a VCF and write it as a
    gzip-compressed TSV.

    Loads variants with varcode, keeps only protein-modifying coding
    effects, flattens each effect into one row (gene/transcript/protein
    identifiers, nucleotide and amino-acid level change descriptions),
    lower-cases the effect type, de-duplicates on (protein_id, aa_variant),
    and writes the result to `out_file`.

    Parameters
    ----------
    in_file : path to the input VCF
    out_file : path for the gzip'd tab-separated output
    genome_version : genome name passed to varcode.load_vcf (default GRCh37)
    """
    import varcode
    variants = varcode.load_vcf(in_file, genome=genome_version)
    effects = variants.effects()
    # silent / non-coding effects can't change the protein sequence
    effects = effects.drop_silent_and_noncoding()
    df = []
    for eff in effects:
        # keep only effects that actually alter the protein
        if not eff.modifies_protein_sequence:
            continue
        # OrderedDict fixes the output column order
        row = OrderedDict(
            (('gene_id', eff.gene.gene_id),
             ('gene_name', eff.gene.gene_name),
             ('transcript_id', eff.transcript_id),
             ('transcript_name', eff.transcript_name),
             ('protein_id', eff.transcript.protein_id),
             ('chrom', 'chr{}'.format(eff.gene.contig)),
             ('nuc_variant', eff.variant.short_description),
             ('aa_variant', eff.short_description),
             ('beg', eff.variant.start),
             ('end', eff.variant.end),
             ('gene_beg', eff.gene.start),
             ('gene_end', eff.gene.end),
             ('nuc_ref', eff.variant.ref),
             ('nuc_alt', eff.variant.alt),
             # some effect classes lack aa_ref/aa_alt; default to ''
             ('aa_ref', getattr(
                 eff, 'aa_ref', '')),
             ('aa_alt', getattr(eff, 'aa_alt', '')),
             ('aa_mutation_beg_offset', eff.aa_mutation_start_offset),
             ('aa_mutation_end_offset', eff.aa_mutation_end_offset),
             ('prot_ref', eff.original_protein_sequence),
             ('prot_alt', eff.mutant_protein_sequence),
             # effect class name, e.g. "Substitution(...)" -> "Substitution"
             ('effect_type', str(eff).split("(")[0])))
        df.append(row)
    df = pd.DataFrame(df)
    # NOTE(review): if no protein-modifying effects survive the filters, this
    # column access raises KeyError on the empty DataFrame — confirm inputs
    # always contain at least one coding variant.
    df['effect_type'] = df['effect_type'].str.lower()
    df = df.drop_duplicates(['protein_id', 'aa_variant'])
    df.to_csv(out_file, compression='gzip', index=False, sep='\t')
def run():
    """CLI entry point: load the VCF at args.path with the selected parser,
    report the elapsed time, and print a small preview."""
    args = parser.parse_args()
    extra_args = {}
    if not args.info_field:
        # skip parsing the INFO column unless the user asked for it
        extra_args["include_info"] = False
    start = time.time()
    if args.pyvcf:
        result = varcode.load_vcf(args.path, allow_extended_nucleotides=True)
    else:
        result = varcode.load_vcf_fast(
            args.path, allow_extended_nucleotides=True, **extra_args)
    elapsed = time.time() - start
    print("Loaded %d variants in %0.3f sec. " % (len(result), elapsed))
    print(result.to_string(limit=5))
def test_genome_arg_to_load_vcf_grch37():
    """Both capitalizations of grch37 match the default load when contig
    names are converted; without conversion the collections differ."""
    baseline = load_vcf(HG19_VCF_FILENAME)
    for genome_name in ("grch37", "GRCh37"):
        eq_(baseline, load_vcf(
            HG19_VCF_FILENAME,
            genome=genome_name,
            convert_ucsc_contig_names=True))
    assert baseline != load_vcf(
        HG19_VCF_FILENAME,
        genome="grch37",
        convert_ucsc_contig_names=False)
def run():
    """Benchmark entry point: time VCF loading (pyvcf vs fast parser) and
    show the first few variants."""
    args = parser.parse_args()
    # only the fast parser accepts include_info; suppress INFO parsing
    # unless requested
    extra_args = {} if args.info_field else {"include_info": False}
    start = time.time()
    if args.pyvcf:
        result = varcode.load_vcf(
            args.path,
            allow_extended_nucleotides=True)
    else:
        result = varcode.load_vcf_fast(
            args.path,
            allow_extended_nucleotides=True,
            **extra_args)
    elapsed = time.time() - start
    print("Loaded %d variants in %0.3f sec. " % (len(result), elapsed))
    print(result.to_string(limit=5))
def test_vcf_gene_names():
    """Yield one gene-name check per variant in the local VCF.

    NOTE(review): this is a nose-style test generator; pytest removed
    support for yield-based tests in 5.0, so this only runs under a
    runner that still supports them — confirm the project's test runner.
    """
    variants = load_vcf(VCF_FILENAME)
    for variant in variants:
        yield (_check_variant_gene_name, variants, variant)
def load_vcf(vcf_path, genome=None):
    """Wrapper around varcode.load_vcf that resolves relative paths through
    data_path first."""
    full_path = data_path(vcf_path)
    return varcode.load_vcf(full_path, genome=genome)
def test_vcf_number_entries():
    # there are 14 mutations listed in the VCF, make sure they are all parsed
    variants = load_vcf(VCF_FILENAME)
    n_variants = len(variants)
    assert n_variants == 14, "Expected 14 mutations, got %d" % (n_variants,)
def test_vcf_reference_name():
    # after normalization, hg19 should be remapped to GRCh37
    reference_names = load_vcf(VCF_FILENAME).reference_names()
    assert reference_names == {"GRCh37"}
def test_load_vcf_mouse_with_explicit_urls():
    """Mouse VCF yields all 217 variants with an explicit-URL genome."""
    mouse_variants = load_vcf(MOUSE_VCF, genome=explicit_url_genome)
    eq_(len(mouse_variants), 217)
def test_load_vcf_mouse_with_inferred_genome():
    """Mouse VCF yields all 217 variants when the genome is inferred."""
    mouse_variants = load_vcf(MOUSE_VCF)
    eq_(len(mouse_variants), 217)
def test_load_vcf_mouse_with_ensembl_release():
    """Mouse VCF yields all 217 variants with an explicit Ensembl genome."""
    mouse_variants = load_vcf(MOUSE_VCF, genome=ensembl_mouse_genome)
    eq_(len(mouse_variants), 217)
def run_isovar(
        variants,
        alignment_file,
        transcript_id_whitelist=None,
        read_collector=None,
        protein_sequence_creator=None,
        filter_thresholds=DEFAULT_FILTER_THRESHOLDS,
        filter_flags=DEFAULT_FILTER_FLAGS,
        min_shared_fragments_for_phasing=MIN_SHARED_FRAGMENTS_FOR_PHASING,
        decompression_threads=1):
    """
    This is the main entrypoint into the Isovar library, which collects
    RNA reads supporting variants and translates their coding sequence
    into amino acid sequences.

    Collects both the read evidence and protein sequences into IsovarResult
    objects. The values of any filters which are supplied in the
    filter_thresholds argument are attached to each IsovarResult's
    filter_values_dict field.

    Parameters
    ----------
    variants : varcode.VariantCollection or str
        Somatic variants; a string is treated as a VCF path and loaded
        with load_vcf.

    alignment_file : pysam.AlignmentFile or str
        Aligned tumor RNA reads; a string is treated as a path and opened
        as an AlignmentFile.

    transcript_id_whitelist : set of str or None
        Which transcripts should be considered when predicting DNA-only
        coding effects of mutations and also when trying to establish a
        reading frame for identified cDNA sequences.

    read_collector : ReadCollector or None
        Object used to collect ReadEvidence for each variant, created with
        default settings if not supplied.

    protein_sequence_creator : ProteinSequenceCreator or None
        Object used to turn (Variant, ReadEvidence) into one or more
        ProteinSequence objects. Created with default settings if not
        supplied.

    filter_thresholds : dict or OrderedDict
        Dictionary whose entries have names like "min_num_alt_reads"
        mapping to a numerical threshold value. In general, the keys must
        start with either "min_" or "max_" followed by a property of the
        IsovarResult class.

    filter_flags : list of str
        List of boolean fields of IsovarResult used for filtering, they can
        also be negated by prepending "not_", such as
        "not_has_protein_sequence".

    min_shared_fragments_for_phasing : int
        Passed to annotate_phased_variants after all results are collected.

    decompression_threads : int
        Number of threads used by htslib to decompress BAM/CRAM files.
        (Original docstring called this "decompress_threads", which does
        not match the parameter name.)

    Returns
    -------
    list of IsovarResult, one per variant (the original docstring said
    "generator", but the function builds and returns a list). The
    `protein_sequences` field of an IsovarResult will be empty if no
    sequences could be determined.
    """
    # accept file paths as a convenience
    if isinstance(variants, string_types):
        variants = load_vcf(variants)
    if isinstance(alignment_file, string_types):
        alignment_file = AlignmentFile(alignment_file, threads=decompression_threads)
    if read_collector is None:
        read_collector = ReadCollector()
    if protein_sequence_creator is None:
        protein_sequence_creator = ProteinSequenceCreator()
    # create generator which returns (Variant, ReadEvidence) pairs
    read_evidence_gen = \
        read_collector.read_evidence_generator(
            variants=variants,
            alignment_file=alignment_file)
    results = []
    for variant, read_evidence in read_evidence_gen:
        # generate protein sequences by assembling variant reads
        protein_sequences = \
            protein_sequence_creator.sorted_protein_sequences_for_variant(
                variant=variant,
                read_evidence=read_evidence,
                transcript_id_whitelist=transcript_id_whitelist)
        predicted_effect = top_varcode_effect(
            variant=variant,
            transcript_id_whitelist=transcript_id_whitelist)
        isovar_result = IsovarResult(
            variant=variant,
            predicted_effect=predicted_effect,
            read_evidence=read_evidence,
            sorted_protein_sequences=protein_sequences)
        # attach filter values (filter_values_dict) per the thresholds/flags
        isovar_result = apply_filters(
            isovar_result,
            filter_thresholds=filter_thresholds,
            filter_flags=filter_flags)
        results.append(isovar_result)
    # phasing annotation runs over the completed result list
    results = annotate_phased_variants(
        results, min_shared_fragments_for_phasing)
    return results
def test_genome_arg_to_load_vcf_hg19():
    """An explicit genome="hg19" matches the default (inferred) load."""
    inferred = load_vcf(HG19_VCF_FILENAME)
    explicit = load_vcf(HG19_VCF_FILENAME, genome="hg19")
    eq_(inferred, explicit)
def test_genome_arg_to_load_vcf_b37():
    """genome="b37" with UCSC contig conversion matches the default load."""
    baseline = load_vcf(HG19_VCF_FILENAME)
    b37_variants = load_vcf(
        HG19_VCF_FILENAME,
        genome="b37",
        convert_ucsc_contig_names=True)
    eq_(baseline, b37_variants)