def create_record(self, values):
    """Create new entry in dbVar SV table."""
    # Outermost coordinates are 1-based; binning expects a 0-based start.
    outer_start = int(values["outermost_start"])
    outer_stop = int(values["outermost_stop"])
    row = [
        self.genome_release,
        values["chr"],
        values["outermost_start"],
        values["outermost_stop"],
        str(binning.assign_bin(outer_start - 1, outer_stop)),
        values["variant_count"],
        values["variant_type"],
        values["method"],
        values["analysis"],
        values["platform"],
        values["study"],
        # Optional semicolon-separated fields become PostgreSQL-style lists.
        list_to_str(values.get("clinical_assertion", "").split(";")),
        list_to_str(values.get("clinvar_accessions", "").split(";")),
        values["bin_size"],
        values.get("min_insertion_length", ""),
        values.get("max_insertion_length", ""),
    ]
    self.fh_tsv.write("\t".join(row) + "\n")
def convert(self):
    """Convert the track-based input file at ``self.path`` to TSV.

    The input consists of UCSC-style ``track`` lines that switch the current
    SV type (deletion vs. duplication controls) followed by whitespace
    separated data rows.  Output rows are written to ``self.fh_tsv``.
    """
    # Emit the TSV header first.
    self.fh_tsv.write(self.header + "\n")
    with open(self.path, "rt") as inputf:
        chrom = None  # last contig seen, only used for progress printing
        var_type = None  # SV type set by the most recent track line
        for line in inputf:
            # Strip exactly one trailing newline, keep other whitespace.
            if line and line[-1] == "\n":
                line = line[:-1]
            if line.startswith("track"):
                # Track lines switch the SV type for the records that follow.
                if line.split()[1].startswith("name=delControls"):
                    var_type = "deletion"
                else:
                    if not line.split()[1].startswith("name=dupControls"):
                        raise Exception("Unexpected track line: {}".format(line))
                    var_type = "duplication"
            else:
                arr = line.split()
                self.fh_tsv.write(
                    "\t".join([
                        self.genome_release,
                        # Strip the "chr" prefix from the contig name.
                        arr[0][len("chr") :],
                        arr[1],
                        arr[2],
                        # 0-based start for binning.
                        str(binning.assign_bin(int(arr[1]) - 1, int(arr[2]))),
                        var_type,
                        arr[3].split("-")[1],
                        arr[4],
                    ]) + "\n"
                )
                # read next line
                if chrom != arr[0]:
                    # Print progress once per (type, contig) run.
                    print(
                        "Starting sv type {} on contig {}".format(var_type, arr[0])
                    )
                    chrom = arr[0]
def _import_elements(self, element_types, reg_map, path_bed):
    """Import regulatory elements from the BED-like file at ``path_bed``.

    :param element_types: mapping from element-type slug to its model object
    :param reg_map: the ``RegMap`` the created records belong to
    :param path_bed: ``pathlib.Path`` to a tab-separated file with one header line
    """
    header = None
    with path_bed.open("rt") as inputf:
        for line in inputf:
            line = line.strip()
            arr = line.split("\t")
            if not header:
                # First line is the header; remember it and skip.
                header = arr
                continue
            chrom, begin, end, et_slug, score = arr[:5]
            begin = int(begin)
            end = int(end)
            elem_type = element_types[et_slug]
            # Missing scores may be encoded as "", "." or "-" -> NaN.
            score = float("NaN") if score in ("", ".", "-") else float(score)
            if len(arr) > 5:
                # Optional sixth column carries JSON-encoded extra data.
                extra_data = json.loads(arr[5])
            else:
                extra_data = None
            RegElement.objects.create(
                reg_map=reg_map,
                elem_type=elem_type,
                release=reg_map.collection.release,
                chromosome=chrom,
                # BED is 0-based half-open; store 1-based inclusive start.
                start=begin + 1,
                end=end,
                bin=binning.assign_bin(begin, end),
                score=score,
                extra_data=extra_data,
            )
class RegInteractionFactory(factory.django.DjangoModelFactory):
    """Factory for ``RegInteraction`` records."""

    class Meta:
        model = RegInteraction

    reg_map = factory.SubFactory(RegMapFactory)
    release = "GRCh37"
    chromosome = factory.Iterator(list(map(str, range(1, 23))) + ["X", "Y"])
    start = factory.Sequence(lambda n: (n + 1) * 1000)
    end = factory.Sequence(lambda n: (n + 1) * 1500 + 100)
    # Initial bin; recomputed with the 0-based start in ``fix_bins`` below.
    bin = factory.LazyAttribute(
        lambda obj: binning.assign_bin(obj.start, obj.end))
    score = 1.0
    # Coordinates of the two interacting regions.
    chromosome1 = factory.Iterator(list(map(str, range(1, 23))) + ["X", "Y"])
    start1 = factory.Sequence(lambda n: (n + 1) * 1000)
    end1 = factory.Sequence(lambda n: (n + 1) * 1000 + 100)
    chromosome2 = factory.Iterator(list(map(str, range(1, 23))) + ["X", "Y"])
    start2 = factory.Sequence(lambda n: (n + 1) * 1500)
    end2 = factory.Sequence(lambda n: (n + 1) * 1500 + 100)
    extra_data = None

    @factory.post_generation
    def fix_bins(obj, *args, **kwargs):
        # Recompute the bin from the 0-based start once the object exists.
        obj.bin = binning.assign_bin(obj.start - 1, obj.end)
        obj.save()
def __init__(self, chromosome, reference_type, accession, gene, orientation,
             start, stop, exon_starts, exon_stops, source, transcript=1,
             cds=None, select_transcript=False, version=None):
    """Initialize a transcript mapping record.

    Coordinates are 1-based inclusive; the UCSC-style ``bin`` is derived
    from them for fast interval queries.
    """
    # Pre-compute the bin from the zero-based half-open interval.
    self.bin = binning.assign_bin(start - 1, stop)
    # Location of the transcript on the genome.
    self.chromosome = chromosome
    self.orientation = orientation
    self.start = start
    self.stop = stop
    self.exon_starts = exon_starts
    self.exon_stops = exon_stops
    # Identification and provenance of the transcript.
    self.reference_type = reference_type
    self.accession = accession
    self.gene = gene
    self.source = source
    self.transcript = transcript
    self.cds = cds
    self.select_transcript = select_transcript
    self.version = version
class StructuralVariantFactory(factory.django.DjangoModelFactory):
    """Factory for ``StructuralVariant`` records.

    BUG FIX: the ``info`` lazy attribute previously incremented
    ``num_affected`` in *both* branches, so unaffected carriers were counted
    as affected and ``unaffectedCarriers`` was always zero.
    """

    class Meta:
        model = StructuralVariant
        exclude = ["case", "variant_set"]

    class Params:
        #: The genotypes to create, by default only first is het. the rest is wild-type.
        genotypes = default_genotypes

    release = "GRCh37"
    chromosome = factory.Iterator(list(map(str, range(1, 23))) + ["X", "Y"])
    chromosome_no = factory.Iterator(list(range(1, 25)))
    start = factory.Sequence(lambda n: (n + 1) * 100)
    end = factory.Sequence(lambda n: (n + 1) * 100 + 100)
    # NOTE(review): computed from the 1-based start without the "- 1" used
    # elsewhere in this file -- confirm whether that is intended.
    bin = factory.LazyAttribute(lambda obj: binning.assign_bin(obj.start, obj.end))
    start_ci_left = -100
    start_ci_right = 100
    end_ci_left = -100
    end_ci_right = 100

    #: Model pseudo-attribute, not stored in database.  Instead, ``set_id`` is stored.
    variant_set = factory.SubFactory(StructuralVariantSetFactory)
    #: The actual reference to the ``StructuralVariantSet``.
    set_id = factory.LazyAttribute(lambda o: o.variant_set.id)
    #: Model pseudo-attribute, not stored in database.  Instead ``case_id`` is stored.
    case = factory.LazyAttribute(lambda obj: Case.objects.get(id=obj.case_id))
    #: The actual foreign key to the ``Case``.
    case_id = factory.SelfAttribute("variant_set.case.id")

    caller = "DELLYv4001"
    sv_type = "DEL"
    sv_sub_type = "DEL"
    # One genotype entry per pedigree member, paired with the configured GTs.
    genotype = factory.LazyAttribute(
        lambda obj: {
            line["patient"]: {"gt": gt, "gq": 10, "src": 10, "srv": 5, "pec": 10, "pev": 5}
            for line, gt in zip(obj.case.pedigree, obj.genotypes())
        }
    )

    @factory.lazy_attribute
    def info(self):
        """Build the ``info`` JSON with affected/unaffected carrier counts."""
        num_affected = 0
        num_unaffected = 0
        for line, gt in zip(self.case.pedigree, self.genotypes()):
            if "1" in gt:  # individual carries at least one variant allele
                if line.get("affected") == 2:
                    num_affected += 1
                else:
                    # Fixed: previously incremented ``num_affected`` here too.
                    num_unaffected += 1
        return {
            "affectedCarriers": num_affected,
            "unaffectedCarriers": num_unaffected,
            "backgroundCarriers": 0,
        }
def test_containing(intervals, interval):
    """Binning pre-selection must keep every interval containing the query."""
    start, stop = interval
    # Ground truth: intervals that fully contain [start, stop].
    expected = set((x, y) for x, y in intervals if x <= start and stop <= y)
    # Candidates selected through the binning scheme.
    bins = binning.containing_bins(start, stop)
    candidates = set((x, y) for x, y in intervals if binning.assign_bin(x, y) in bins)
    assert candidates >= expected
def test_contained(intervals, interval):
    """Binning pre-selection must keep every interval inside the query."""
    start, stop = interval
    # Ground truth: intervals lying completely within [start, stop].
    expected = set((x, y) for x, y in intervals if start <= x and y <= stop)
    # Candidates selected through the binning scheme.
    bins = binning.contained_bins(start, stop)
    candidates = set((x, y) for x, y in intervals if binning.assign_bin(x, y) in bins)
    assert candidates >= expected
def test_overlapping(intervals, interval):
    """Binning pre-selection must keep every interval overlapping the query."""
    start, stop = interval
    # Ground truth: intervals with non-empty intersection with [start, stop).
    expected = set((x, y) for x, y in intervals if x < stop and start < y)
    # Candidates selected through the binning scheme.
    bins = binning.overlapping_bins(start, stop)
    candidates = set((x, y) for x, y in intervals if binning.assign_bin(x, y) in bins)
    assert candidates >= expected
def upgrade():
    """Add and populate the NOT NULL ``bin`` column on ``transcript_mappings``."""
    # We want to add a NOT NULL column without default value. So we first add
    # the column without the constraint, then populate it, then add the
    # constraint.
    # Unfortunately, SQLite doesn't support adding the constraint on an
    # existing column. We use batch_alter_table to workaround this. Of course
    # this makes the entire migration horribly awkward on SQLite, but I can't
    # really be bothered to improve it. This works.
    # Also, the downgrade will fail on SQLite, but we don't support downgrades
    # anyway, so I'm not fixing it.
    connection = op.get_bind()
    op.add_column('transcript_mappings',
                  sa.Column('bin', sa.Integer(), nullable=True))
    # Lightweight table construct for core-level data manipulation.
    transcript_mappings = sql.table('transcript_mappings',
                                    sql.column('id', sa.Integer()),
                                    sql.column('start', sa.Integer()),
                                    sql.column('stop', sa.Integer()),
                                    sql.column('bin', sa.Integer()))
    result = connection.execute(transcript_mappings.select().with_only_columns(
        [
            transcript_mappings.c.id,
            transcript_mappings.c.start,
            transcript_mappings.c.stop
        ]))
    # Populate ``bin`` in chunks of 1000 rows to bound memory usage.
    while True:
        chunk = result.fetchmany(1000)
        if not chunk:
            break
        statement = transcript_mappings.update().where(
            transcript_mappings.c.id == sql.bindparam('m_id')).values(
            {'bin': sql.bindparam('m_bin')})
        connection.execute(
            statement,
            [{
                'm_id': m.id,
                # ``start`` is 1-based inclusive; binning expects 0-based.
                'm_bin': binning.assign_bin(m.start - 1, m.stop)
            } for m in chunk])
    # See note above.
    with op.batch_alter_table('transcript_mappings') as batch_op:
        batch_op.alter_column('bin', nullable=False,
                              existing_type=sa.Integer())
    op.create_index(op.f('ix_transcript_mappings_bin'),
                    'transcript_mappings', ['bin'], unique=False)
def test_clinvar_query_fail(self):
    """Querying with coordinates shifted by one must not find the record."""
    created, query = self.create(ClinvarFactory)
    # Build query arguments with start/end shifted past the created record.
    query_args = dict(query)
    query_args["start"] = created.start + 1
    query_args["end"] = created.end + 1
    query_args["bin"] = binning.assign_bin(created.start, created.end + 1)
    self.run_get_query(Clinvar, query_args, Clinvar.DoesNotExist)
def __init__(self, variation, chromosome, position, reference, observed,
             zygosity=None, support=1):
    """Initialize an observation record and assign its UCSC bin."""
    self.variation = variation
    self.chromosome = chromosome
    self.position = position
    self.reference = reference
    self.observed = observed
    # We choose the 'region' of the reference covered by an insertion to
    # be the base next to it, i.e. at least one base is always covered.
    covered = max(1, len(reference))
    self.bin = binning.assign_bin(position - 1, position + covered - 1)
    self.zygosity = zygosity
    self.support = support
def run(args):
    """Fill in missing ``end`` values and recompute ``bin`` for a TSV stream.

    Reads TSV lines from ``args.input`` (first line is the header), derives
    the ``end`` column from ``start`` plus the reference allele length when
    empty, recomputes the UCSC ``bin`` for every record, and writes the
    result to ``args.output``.
    """
    header = next(args.input)
    args.output.write(header)
    for record in tsv_reader(args.input, header):
        if record["end"] == "":
            try:
                # end = start + len(ref) - 1 (1-based inclusive coordinates).
                record["end"] = str(
                    int(record["start"]) + len(record["reference"]) - 1)
            except KeyError:
                raise KeyError(
                    "Please make sure `end` column is filled when not providing `alternative` column."
                )
        record["bin"] = str(
            binning.assign_bin(int(record["start"]) - 1, int(record["end"])))
        # Relies on ``record`` preserving the header's column order.
        args.output.write("%s\n" % "\t".join(v for v in record.values()))
class ExacCnvFactory(factory.django.DjangoModelFactory):
    """Factory for ``ExacCnv`` records."""

    class Meta:
        model = ExacCnv

    release = "GRCh37"
    chromosome = factory.Iterator(list(map(str, range(1, 23))) + ["X", "Y"])
    start = factory.Sequence(lambda n: (n + 1) * 100)
    end = factory.Sequence(lambda n: (n + 1) * 100 + 100)
    # NOTE(review): bin is computed from the 1-based start without the usual
    # "- 1" adjustment seen elsewhere in this file -- confirm it is intended.
    bin = factory.Sequence(lambda n: binning.assign_bin((n + 1) * 100, (n + 1) * 100 + 100))
    sv_type = "DEL"
    population = factory.Iterator([x[0] for x in EXAC_POP_CHOICES])
    phred_score = factory.Iterator(list(range(30)))
class _UserAnnotationFactory(factory.django.DjangoModelFactory):
    """Abstract base factory for user SV annotation records."""

    class Meta:
        abstract = True

    # NOTE(review): bin is computed from the 1-based start without the usual
    # "- 1" adjustment seen elsewhere in this file -- confirm it is intended.
    bin = factory.Sequence(lambda n: binning.assign_bin((n + 1) * 100, (n + 1) * 100 + 100))
    release = "GRCh37"
    chromosome = factory.Iterator(list(map(str, range(1, 23))) + ["X", "Y"])
    start = factory.Sequence(lambda n: (n + 1) * 100)
    end = factory.Sequence(lambda n: (n + 1) * 100 + 100)
    sv_type = "DEL"
    sv_sub_type = "DEL"
    # user = factory.SubFactory(UserFactory)  # TODO
    case = factory.SubFactory(CaseFactory)
def upgrade():
    """Add and populate the NOT NULL ``bin`` column on ``transcript_mappings``."""
    # We want to add a NOT NULL column without default value. So we first add
    # the column without the constraint, then populate it, then add the
    # constraint.
    # Unfortunately, SQLite doesn't support adding the constraint on an
    # existing column. We use batch_alter_table to workaround this. Of course
    # this makes the entire migration horribly awkward on SQLite, but I can't
    # really be bothered to improve it. This works.
    # Also, the downgrade will fail on SQLite, but we don't support downgrades
    # anyway, so I'm not fixing it.
    connection = op.get_bind()
    op.add_column('transcript_mappings',
                  sa.Column('bin', sa.Integer(), nullable=True))
    # Lightweight table construct for core-level data manipulation.
    transcript_mappings = sql.table('transcript_mappings',
                                    sql.column('id', sa.Integer()),
                                    sql.column('start', sa.Integer()),
                                    sql.column('stop', sa.Integer()),
                                    sql.column('bin', sa.Integer()))
    result = connection.execute(
        transcript_mappings.select().with_only_columns([
            transcript_mappings.c.id,
            transcript_mappings.c.start,
            transcript_mappings.c.stop]))
    # Populate ``bin`` in chunks of 1000 rows to bound memory usage.
    while True:
        chunk = result.fetchmany(1000)
        if not chunk:
            break
        statement = transcript_mappings.update().where(
            transcript_mappings.c.id == sql.bindparam('m_id')
        ).values({'bin': sql.bindparam('m_bin')})
        connection.execute(statement, [
            # ``start`` is 1-based inclusive; binning expects 0-based.
            {'m_id': m.id, 'm_bin': binning.assign_bin(m.start - 1, m.stop)}
            for m in chunk])
    # See note above.
    with op.batch_alter_table('transcript_mappings') as batch_op:
        batch_op.alter_column('bin', nullable=False,
                              existing_type=sa.Integer())
    op.create_index(op.f('ix_transcript_mappings_bin'),
                    'transcript_mappings', ['bin'], unique=False)
def write_to_tsv(self, values):
    """Write one DGV SV record to the TSV output file."""
    # Coordinates are 1-based; binning expects a 0-based start.
    start = int(values["start"])
    end = int(values["end"])
    row = [
        self.genome_release,
        values["chr"],
        values["start"],
        values["end"],
        str(binning.assign_bin(start - 1, end)),
        values["variantaccession"],
        values["varianttype"],
        values["variantsubtype"],
        values["reference"],
        list_to_str(values["platform"].split(",")),
        # Empty counts default to "0".
        values["samplesize"] or "0",
        values["observedgains"] or "0",
        values["observedlosses"] or "0",
    ]
    self.fh_tsv.write("\t".join(row) + "\n")
def write_to_tsv(self, values):
    """Write one DGV gold-standard SV record to the TSV output file."""
    attributes = values["attributes"]
    # "PopulationSummary" looks like "African 1:Asian 2:..."; turn it into a
    # population-name -> count mapping.
    pop_sum = {
        key: value
        for key, value in [x.split(" ") for x in attributes["PopulationSummary"].split(":")]
    }
    self.fh_tsv.write(
        "\t".join([
            self.genome_release,
            values["seqid"],
            attributes["outer_start"],
            attributes["inner_start"],
            attributes["inner_end"],
            attributes["outer_end"],
            # Bin over the outermost interval (0-based start).
            str(binning.assign_bin(
                int(attributes["outer_start"]) - 1, int(attributes["outer_end"])
            )),
            attributes["ID"],
            attributes["variant_type"],
            attributes["variant_sub_type"],
            attributes["num_studies"],
            list_to_str(attributes["Studies"].split(",")),
            attributes["num_platforms"],
            list_to_str(attributes["Platforms"].split(",")),
            attributes["number_of_algorithms"],
            list_to_str(attributes["algorithms"].split(",")),
            attributes["num_variants"],
            attributes["num_samples"],
            attributes["num_unique_samples_tested"],
            # Per-population carrier counts from the summary parsed above.
            pop_sum["African"],
            pop_sum["Asian"],
            pop_sum["European"],
            pop_sum["Mexican"],
            pop_sum["MiddleEast"],
            pop_sum["NativeAmerican"],
            pop_sum["NorthAmerican"],
            pop_sum["Oceania"],
            pop_sum["SouthAmerican"],
            pop_sum["Admixed"],
            pop_sum["Unknown"],
        ]) + "\n"
    )
def _import_interactions(self, reg_map, path_bed):
    """Import regulatory interactions from the BED-like file at ``path_bed``.

    :param reg_map: the ``RegMap`` the created records belong to
    :param path_bed: ``pathlib.Path`` to a tab-separated file with one header line

    BUG FIX: the optional JSON extra-data column lives at index 10, so the
    guard must be ``len(arr) > 10``; the previous ``len(arr) > 5`` raised
    ``IndexError`` on every row with exactly ten columns.
    """
    header = None
    with path_bed.open("rt") as inputf:
        for line in inputf:
            line = line.strip()
            arr = line.split("\t")
            if not header:
                # First line is the header; remember it and skip.
                header = arr
                continue
            chrom, begin, end, score, chrom1, begin1, end1, chrom2, begin2, end2 = arr[:10]
            begin = int(begin)
            end = int(end)
            begin1 = int(begin1)
            end1 = int(end1)
            begin2 = int(begin2)
            end2 = int(end2)
            # Missing scores may be encoded as "", "." or "-" -> NaN.
            score = float("NaN") if score in ("", ".", "-") else float(score)
            if len(arr) > 10:  # fixed: was ``len(arr) > 5``
                # Optional eleventh column carries JSON-encoded extra data.
                extra_data = json.loads(arr[10])
            else:
                extra_data = None
            RegInteraction.objects.create(
                reg_map=reg_map,
                release=reg_map.collection.release,
                chromosome=chrom,
                # BED is 0-based half-open; store 1-based inclusive starts.
                start=begin + 1,
                end=end,
                bin=binning.assign_bin(begin, end),
                chromosome1=chrom1,
                start1=begin1 + 1,
                end1=end1,
                chromosome2=chrom2,
                start2=begin2 + 1,
                end2=end2,
                score=score,
                extra_data=extra_data,
            )
class DgvSvsFactory(factory.django.DjangoModelFactory):
    """Factory for ``DgvSvs`` records."""

    class Meta:
        model = DgvSvs

    release = "GRCh37"
    chromosome = factory.Iterator(list(map(str, range(1, 23))) + ["X", "Y"])
    start = factory.Sequence(lambda n: (n + 1) * 100)
    end = factory.Sequence(lambda n: (n + 1) * 100 + 100)
    # NOTE(review): bin is computed from the 1-based start without the usual
    # "- 1" adjustment seen elsewhere in this file -- confirm it is intended.
    bin = factory.Sequence(lambda n: binning.assign_bin((n + 1) * 100, (n + 1) * 100 + 100))
    accession = factory.Sequence(lambda n: "DGV-%d" % n)
    sv_type = "DEL"
    sv_sub_type = "DEL"
    study = factory.Sequence(lambda n: "DGV-STUDY-%d" % n)
    platform = factory.Sequence(lambda n: ["DGV-PLATFORM-%d" % n])
    num_samples = 1
    observed_gains = 0
    observed_losses = 1
class RegElementFactory(factory.django.DjangoModelFactory):
    """Factory for ``RegElement`` records."""

    class Meta:
        model = RegElement

    reg_map = factory.SubFactory(RegMapFactory)
    # Element type is created within the same collection as the reg map.
    elem_type = factory.LazyAttribute(
        lambda o: RegElementTypeFactory(collection=o.reg_map.collection))
    release = "GRCh37"
    chromosome = factory.Iterator(list(map(str, range(1, 23))) + ["X", "Y"])
    start = factory.Sequence(lambda n: (n + 1) * 100)
    end = factory.Sequence(lambda n: (n + 1) * 100 + 100)
    # Initial bin; recomputed with the 0-based start in ``fix_bins`` below.
    bin = factory.LazyAttribute(
        lambda obj: binning.assign_bin(obj.start, obj.end))
    score = 1.0
    extra_data = None

    @factory.post_generation
    def fix_bins(obj, *args, **kwargs):
        # Recompute the bin from the 0-based start once the object exists.
        obj.bin = binning.assign_bin(obj.start - 1, obj.end)
        obj.save()
class DbVarSvFactory(factory.django.DjangoModelFactory):
    """Factory for ``DbVarSv`` records."""

    class Meta:
        model = DbVarSv

    release = "GRCh37"
    chromosome = factory.Iterator(list(map(str, range(1, 23))) + ["X", "Y"])
    start = factory.Sequence(lambda n: (n + 1) * 100)
    end = factory.Sequence(lambda n: (n + 1) * 100 + 100)
    # NOTE(review): bin is computed from the 1-based start without the usual
    # "- 1" adjustment seen elsewhere in this file -- confirm it is intended.
    bin = factory.Sequence(lambda n: binning.assign_bin((n + 1) * 100, (n + 1) * 100 + 100))
    num_carriers = 1
    sv_type = "DEL"
    method = "Sequencing"
    analysis = "Read_depth"
    platform = factory.Sequence(lambda n: "DBVAR-PLATFORM-%d" % n)
    study = factory.Sequence(lambda n: "DBVAR-STUDY-%d" % n)
    clinical_assertions = []
    clinvar_accessions = []
    bin_size = "large"
    min_ins_length = None
    max_ins_length = None
class DgvGoldStandardSvsFactory(factory.django.DjangoModelFactory):
    """Factory for ``DgvGoldStandardSvs`` records."""

    class Meta:
        model = DgvGoldStandardSvs

    release = "GRCh37"
    chromosome = factory.Iterator(list(map(str, range(1, 23))) + ["X", "Y"])
    # Outer/inner interval bounds around the (n + 1) * 100 anchor.
    start_outer = factory.Sequence(lambda n: (n + 1) * 100 - 10)
    start_inner = factory.Sequence(lambda n: (n + 1) * 100 + 10)
    end_inner = factory.Sequence(lambda n: (n + 1) * 100 + 90)
    end_outer = factory.Sequence(lambda n: (n + 1) * 100 + 110)
    # Bin over [start_outer - 1, end_outer], i.e. 0-based start of the
    # outermost interval.
    bin = factory.Sequence(lambda n: binning.assign_bin((n + 1) * 100 - 11, (n + 1) * 100 + 110))
    accession = factory.Sequence(lambda n: "DGV-GS-%d" % n)
    sv_type = "DEL"
    sv_sub_type = "DEL"
    num_studies = 1
    studies = factory.Sequence(lambda n: ["DGV-GS-STUDY-%d" % n])
    num_platforms = 1
    platforms = factory.Sequence(lambda n: ["DGV-GS-PLATFORM-%d" % n])
    num_algorithms = 1
    algorithms = factory.Sequence(lambda n: ["DGV-GS-ALGO-%d" % n])
    num_variants = 1
    num_carriers = 1
    num_unique_samples = 1
    # Per-population carrier counts.
    num_carriers_african = 0
    num_carriers_asian = 0
    num_carriers_european = 0
    num_carriers_mexican = 0
    num_carriers_middle_east = 1
    num_carriers_native_american = 0
    num_carriers_north_american = 0
    num_carriers_oceania = 0
    num_carriers_south_american = 0
    num_carriers_admixed = 0
    num_carriers_unknown = 0
class ThousandGenomesSvFactory(factory.django.DjangoModelFactory):
    """Factory for ``ThousandGenomesSv`` records."""

    class Meta:
        model = ThousandGenomesSv

    release = "GRCh37"
    chromosome = factory.Iterator(list(map(str, range(1, 23))) + ["X", "Y"])
    start = factory.Sequence(lambda n: (n + 1) * 100)
    end = factory.Sequence(lambda n: (n + 1) * 100 + 100)
    # NOTE(review): bin is computed from the 1-based start without the usual
    # "- 1" adjustment seen elsewhere in this file -- confirm it is intended.
    bin = factory.Sequence(lambda n: binning.assign_bin((n + 1) * 100, (n + 1) * 100 + 100))
    # Confidence intervals around start/end.
    start_ci_left = -100
    start_ci_right = 100
    end_ci_left = -100
    end_ci_right = 100
    sv_type = "DEL"
    source_call_set = "DEL_delly"
    mobile_element_info = []
    # Allele counts, overall and per super-population.
    num_samples = 1
    num_alleles = 2
    num_var_alleles = 1
    num_alleles_afr = 2
    num_var_alleles_afr = 1
    num_alleles_amr = 0
    num_var_alleles_amr = 0
    num_alleles_eas = 0
    num_var_alleles_eas = 0
    num_alleles_eur = 0
    num_var_alleles_eur = 0
    num_alleles_sas = 0
    num_var_alleles_sas = 0
def __init__(self, coverage, chromosome, begin, end):
    """Initialize a coverage record and assign its UCSC bin."""
    self.coverage = coverage
    self.chromosome = chromosome
    self.begin = begin
    self.end = end
    # ``begin`` is 1-based inclusive; binning expects a 0-based start.
    self.bin = binning.assign_bin(begin - 1, end)
def test_assign_bin_covered_interval(start, stop):
    """The covered interval of an assigned bin must enclose the query."""
    assigned = binning.assign_bin(start, stop)
    covered_start, covered_stop = binning.covered_interval(assigned)
    assert covered_start <= start
    assert stop <= covered_stop
def test_assign_bin(start, stop, expected):
    """``assign_bin`` must return the expected bin for known intervals."""
    result = binning.assign_bin(start, stop)
    assert result == expected
class GnomAdSvFactory(factory.django.DjangoModelFactory):
    """Factory for ``GnomAdSv`` records."""

    class Meta:
        model = GnomAdSv

    release = "GRCh37"
    chromosome = factory.Iterator(list(map(str, range(1, 23))) + ["X", "Y"])
    start = factory.Sequence(lambda n: (n + 1) * 100)
    end = factory.Sequence(lambda n: (n + 1) * 100 + 100)
    # NOTE(review): bin is computed from the 1-based start without the usual
    # "- 1" adjustment seen elsewhere in this file -- confirm it is intended.
    bin = factory.Sequence(lambda n: binning.assign_bin((n + 1) * 100, (n + 1) * 100 + 100))
    ref = "N"
    alt = ["<DUP>"]
    # NOTE(review): a ``factory.Sequence`` inside a plain list is not
    # evaluated by factory_boy -- the declaration object itself is stored.
    # Verify whether this should be a Sequence producing a list instead.
    name = [factory.Sequence(lambda n: "DBVAR-SV-%d" % n)]
    svtype = "DEL"
    svlen = 100
    filter = ["PASS"]
    evidence = ["BAF", "RD"]
    algorithms = ["depth"]
    chr2 = factory.Iterator(list(map(str, range(1, 23))) + ["X", "Y"])
    cpx_type = None
    cpx_intervals = []
    source = None
    strands = None
    unresolved_type = None
    pcrplus_depleted = False
    pesr_gt_overdispersion = False
    # Protein-coding consequence annotations.
    protein_coding_lof = []
    protein_coding_dup_lof = []
    protein_coding_copy_gain = []
    protein_coding_dup_partial = []
    protein_coding_msv_exon_ovr = []
    protein_coding_intronic = []
    protein_coding_inv_span = []
    protein_coding_utr = []
    protein_coding_nearest_tss = []
    protein_coding_intergenic = False
    protein_coding_promoter = []
    # Overall allele/genotype counts and frequencies.
    an = 2
    ac = [1]
    af = [0.5]
    n_bi_genos = 1
    n_homref = 0
    n_het = 1
    n_homalt = 0
    freq_homref = 0.5
    freq_het = 0.5
    freq_homalt = 0.0
    popmax_af = 0.5
    # Per-population counts and frequencies (AFR, AMR, EAS, EUR, OTH).
    afr_an = 1
    afr_ac = [1]
    afr_af = [0.5]
    afr_n_bi_genos = 0
    afr_n_homref = 0
    afr_n_het = 0
    afr_n_homalt = 0
    afr_freq_homref = 0.0
    afr_freq_het = 0.0
    afr_freq_homalt = 0.0
    amr_an = 0
    amr_ac = [0]
    amr_af = [0.0]
    amr_n_bi_genos = 0
    amr_n_homref = 0
    amr_n_het = 0
    amr_n_homalt = 0
    amr_freq_homref = 0.0
    amr_freq_het = 0.0
    amr_freq_homalt = 0.0
    eas_an = 0
    eas_ac = [0]
    eas_af = [0.0]
    eas_n_bi_genos = 0
    eas_n_homref = 0
    eas_n_het = 0
    eas_n_homalt = 0
    eas_freq_homref = 0.0
    eas_freq_het = 0.0
    eas_freq_homalt = 0.0
    eur_an = 0
    eur_ac = [0]
    eur_af = [0.0]
    eur_n_bi_genos = 0
    eur_n_homref = 0
    eur_n_het = 0
    eur_n_homalt = 0
    eur_freq_homref = 0.0
    eur_freq_het = 0.0
    eur_freq_homalt = 0.0
    oth_an = 0
    oth_ac = [0]
    oth_af = [0.0]
    oth_n_bi_genos = 0
    oth_n_homref = 0
    oth_n_het = 0
    oth_n_homalt = 0
    oth_freq_homref = 0.0
    oth_freq_het = 0.0
    oth_freq_homalt = 0.0
def test_assign__bin_range(start, stop):
    """Out-of-range intervals must raise ``binning.OutOfRangeError``."""
    with pytest.raises(binning.OutOfRangeError):
        binning.assign_bin(start, stop)
def test_covered_interval_assign_bin(bin):
    """Assigning a bin to its own covered interval is the identity."""
    interval = binning.covered_interval(bin)
    assert binning.assign_bin(*interval) == bin
def _create_record(self, record):
    """Create new entry in gnomAD SV table.

    Writes one TSV line for the given VCF ``record`` to ``self.fh_tsv``.

    BUG FIX: the arguments to ``binning.assign_bin`` were swapped
    (``END - 1, POS``), which yields start > stop for any SV longer than one
    base; the bin is now computed from the zero-based start ``POS - 1`` to
    ``END``, consistent with the other importers in this file.
    """
    self.fh_tsv.write(
        "\t".join(
            [
                self.genome_release,
                record.CHROM,
                str(record.POS),
                str(record.INFO.get("END")),
                # Fixed: bin over [POS - 1, END) -- arguments were swapped.
                str(binning.assign_bin(record.POS - 1, record.INFO.get("END"))),
                record.REF,
                list_to_str([alt.serialize() for alt in record.ALT]),
                list_to_str(record.ID),
                record.INFO.get("SVTYPE"),
                str(record.INFO.get("SVLEN")),
                list_to_str(record.FILTER),
                list_to_str(record.INFO.get("EVIDENCE")),
                list_to_str(record.INFO.get("ALGORITHMS")),
                record.INFO.get("CHR2"),
                record.INFO.get("CPX_TYPE", ""),
                list_to_str(record.INFO.get("CPX_INTERVALS", [])),
                record.INFO.get("SOURCE", ""),
                record.INFO.get("STRANDS", ""),
                record.INFO.get("UNRESOLVED_TYPE", ""),
                str(record.INFO.get("PCRPLUS_DEPLETED", False)),
                str(record.INFO.get("PESR_GT_OVERDISPERSION", False)),
                # Protein-coding consequence annotations.
                list_to_str(record.INFO.get("PROTEIN_CODING_LOF", [])),
                list_to_str(record.INFO.get("PROTEIN_CODING__DUP_LOF", [])),
                list_to_str(record.INFO.get("PROTEIN_CODING__COPY_GAIN", [])),
                list_to_str(record.INFO.get("PROTEIN_CODING__DUP_PARTIAL", [])),
                list_to_str(record.INFO.get("PROTEIN_CODING__MSV_EXON_OVR", [])),
                list_to_str(record.INFO.get("PROTEIN_CODING__INTRONIC", [])),
                list_to_str(record.INFO.get("PROTEIN_CODING__INV_SPAN", [])),
                list_to_str(record.INFO.get("PROTEIN_CODING__UTR", [])),
                list_to_str(record.INFO.get("PROTEIN_CODING__NEAREST_TSS", [])),
                str(record.INFO.get("PROTEIN_CODING__INTERGENIC", False)),
                list_to_str(record.INFO.get("PROTEIN_CODING__PROMOTER", [])),
                # Overall allele/genotype counts and frequencies.
                str(record.INFO.get("AN")),
                list_to_str(record.INFO.get("AC", [])),
                list_to_str(record.INFO.get("AF", [])),
                str(record.INFO.get("N_BI_GENOS", 0)),
                str(record.INFO.get("N_HOMREF", 0)),
                str(record.INFO.get("N_HET", 0)),
                str(record.INFO.get("N_HOMALT", 0)),
                str(record.INFO.get("FREQ_HOMREF", 0.0)),
                str(record.INFO.get("FREQ_HET", 0.0)),
                str(record.INFO.get("FREQ_HOMALT", 0.0)),
                str(record.INFO.get("POPMAX_AF", 0.0)),
                # Per-population counts and frequencies (AFR, AMR, EAS, EUR, OTH).
                str(record.INFO.get("AFR_AN")),
                list_to_str(record.INFO.get("AFR_AC", [])),
                list_to_str(record.INFO.get("AFR_AF", [])),
                str(record.INFO.get("AFR_N_BI_GENOS", 0)),
                str(record.INFO.get("AFR_N_HOMREF", 0)),
                str(record.INFO.get("AFR_N_HET", 0)),
                str(record.INFO.get("AFR_N_HOMALT", 0)),
                str(record.INFO.get("AFR_FREQ_HOMREF", 0.0)),
                str(record.INFO.get("AFR_FREQ_HET", 0.0)),
                str(record.INFO.get("AFR_FREQ_HOMALT", 0.0)),
                str(record.INFO.get("AMR_AN")),
                list_to_str(record.INFO.get("AMR_AC", [])),
                list_to_str(record.INFO.get("AMR_AF", [])),
                str(record.INFO.get("AMR_N_BI_GENOS", 0)),
                str(record.INFO.get("AMR_N_HOMREF", 0)),
                str(record.INFO.get("AMR_N_HET", 0)),
                str(record.INFO.get("AMR_N_HOMALT", 0)),
                str(record.INFO.get("AMR_FREQ_HOMREF", 0.0)),
                str(record.INFO.get("AMR_FREQ_HET", 0.0)),
                str(record.INFO.get("AMR_FREQ_HOMALT", 0.0)),
                str(record.INFO.get("EAS_AN")),
                list_to_str(record.INFO.get("EAS_AC", [])),
                list_to_str(record.INFO.get("EAS_AF", [])),
                str(record.INFO.get("EAS_N_BI_GENOS", 0)),
                str(record.INFO.get("EAS_N_HOMREF", 0)),
                str(record.INFO.get("EAS_N_HET", 0)),
                str(record.INFO.get("EAS_N_HOMALT", 0)),
                str(record.INFO.get("EAS_FREQ_HOMREF", 0.0)),
                str(record.INFO.get("EAS_FREQ_HET", 0.0)),
                str(record.INFO.get("EAS_FREQ_HOMALT", 0.0)),
                str(record.INFO.get("EUR_AN")),
                list_to_str(record.INFO.get("EUR_AC", [])),
                list_to_str(record.INFO.get("EUR_AF", [])),
                str(record.INFO.get("EUR_N_BI_GENOS", 0)),
                str(record.INFO.get("EUR_N_HOMREF", 0)),
                str(record.INFO.get("EUR_N_HET", 0)),
                str(record.INFO.get("EUR_N_HOMALT", 0)),
                str(record.INFO.get("EUR_FREQ_HOMREF", 0.0)),
                str(record.INFO.get("EUR_FREQ_HET", 0.0)),
                str(record.INFO.get("EUR_FREQ_HOMALT", 0.0)),
                str(record.INFO.get("OTH_AN")),
                list_to_str(record.INFO.get("OTH_AC", [])),
                list_to_str(record.INFO.get("OTH_AF", [])),
                str(record.INFO.get("OTH_N_BI_GENOS", 0)),
                str(record.INFO.get("OTH_N_HOMREF", 0)),
                str(record.INFO.get("OTH_N_HET", 0)),
                str(record.INFO.get("OTH_N_HOMALT", 0)),
                str(record.INFO.get("OTH_FREQ_HOMREF", 0.0)),
                str(record.INFO.get("OTH_FREQ_HET", 0.0)),
                str(record.INFO.get("OTH_FREQ_HOMALT", 0.0)),
            ]
        )
        + "\n"
    )
def fix_bins(obj, *args, **kwargs):
    """Recompute ``obj.bin`` from the 0-based start and persist the object."""
    obj.bin = binning.assign_bin(obj.start - 1, obj.end)
    obj.save()
def import_sv_vcf_record(self, panel_map, record):
    """Import the SV VCF file into the database.

    Counts total and variant allele numbers overall and per super-population
    from the calls of ``record`` and writes one TSV line to ``self.fh_tsv``.

    BUG FIX: the variant-allele branches previously incremented
    ``num_alleles[super_pop]`` (double-counting the total alleles) instead of
    ``num_var_alleles[super_pop]``, so all per-population variant allele
    counts came out as zero.
    """
    # Counters
    super_pops = ("All", "AFR", "AMR", "EAS", "EUR", "SAS")
    num_samples = 0
    num_alleles = {key: 0 for key in super_pops}
    num_var_alleles = {key: 0 for key in super_pops}
    # Count statistics
    for call in record.calls:
        sample = call.sample
        gt = call.data.get("GT", ".")
        super_pop = panel_map[sample]["super_pop"]
        sex = panel_map[sample]["sex"]
        # Skip if genotype is no-call
        if gt == ".":
            continue
        # Count alleles contributed by this individual (sex chromosomes
        # contribute fewer alleles depending on the individual's sex).
        if record.CHROM == "X":
            this_alleles = 1 if sex == "male" else 2
        elif record.CHROM == "Y":
            this_alleles = 1 if sex == "male" else 0
        else:
            this_alleles = 2
        if this_alleles == 0:
            continue  # no alleles contributed by this individual
        # Increment total allele counters
        num_alleles["All"] += this_alleles
        num_alleles[super_pop] += this_alleles
        num_samples += 1
        if gt in ("0|0", "0/0"):
            continue  # non-variant allele
        elif this_alleles == 1:
            # Hemizygous variant carrier.
            num_var_alleles["All"] += 1
            num_var_alleles[super_pop] += 1  # fixed: was num_alleles
        elif "0" in gt:  # heterozygous, even if multiallelic (-> CNV)
            num_var_alleles["All"] += 1
            num_var_alleles[super_pop] += 1  # fixed: was num_alleles
        else:  # homozygous non-ref, even if multiallelic (-> CNV)
            num_var_alleles["All"] += 2
            num_var_alleles[super_pop] += 2  # fixed: was num_alleles
    # Perform the record creation
    self.fh_tsv.write(
        "\t".join(
            [
                self.genome_release,
                record.CHROM,
                str(record.POS),
                str(record.INFO.get("END", record.POS)),
                # Bin over [POS - 1, END).
                str(binning.assign_bin(record.POS - 1, record.INFO.get("END", record.POS))),
                str(record.INFO.get("CIPOS", (0, 0))[0]),
                str(record.INFO.get("CIPOS", (0, 0))[1]),
                str(record.INFO.get("CIEND", (0, 0))[0]),
                str(record.INFO.get("CIEND", (0, 0))[1]),
                record.INFO.get("SVTYPE"),
                record.INFO.get("CS"),
                list_to_str(record.INFO.get("MEINFO", [])),
                str(num_samples),
                str(num_alleles["All"]),
                str(num_var_alleles["All"]),
            ]
            + [str(num_alleles[key]) for key in super_pops if key != "All"]
            + [str(num_var_alleles[key]) for key in super_pops if key != "All"]
        )
        + "\n"
    )
def fixture_setup_case1_simple():
    """Setup test case 1 -- a singleton with one variant only."""
    project = Project.objects.create(**PROJECT_DICT)
    # Singleton case with a one-member pedigree.
    case = project.case_set.create(
        sodar_uuid="9b90556b-041e-47f1-bdc7-4d5a4f8357e3",
        name="A",
        index="A",
        pedigree=[{
            "sex": 1,
            "father": "0",
            "mother": "0",
            "patient": "A",
            "affected": 1,
            "has_gt_entries": True,
        }],
    )
    # The case's single heterozygous SNV at 1:100 (bin over [99, 100)).
    SmallVariant.objects.create(
        case_id=case.pk,
        release="GRCh37",
        chromosome="1",
        start=100,
        end=100,
        bin=binning.assign_bin(99, 100),
        reference="A",
        alternative="G",
        var_type="snv",
        genotype={"A": {
            "ad": 15,
            "dp": 30,
            "gq": 99,
            "gt": "0/1"
        }},
        in_clinvar=True,
        # frequencies
        exac_frequency=0.01,
        exac_homozygous=0,
        exac_heterozygous=0,
        exac_hemizygous=0,
        thousand_genomes_frequency=0.01,
        thousand_genomes_homozygous=0,
        thousand_genomes_heterozygous=0,
        thousand_genomes_hemizygous=0,
        gnomad_exomes_frequency=0.01,
        gnomad_exomes_homozygous=0,
        gnomad_exomes_heterozygous=0,
        gnomad_exomes_hemizygous=0,
        gnomad_genomes_frequency=0.01,
        gnomad_genomes_homozygous=0,
        gnomad_genomes_heterozygous=0,
        gnomad_genomes_hemizygous=0,
        # RefSeq
        refseq_gene_id="1234",
        refseq_transcript_id="NR_00001.1",
        refseq_transcript_coding=False,
        refseq_hgvs_c="n.111+2T>C",
        refseq_hgvs_p="p.=",
        refseq_effect=["synonymous_variant"],
        # ENSEMBL
        ensembl_gene_id="ENGS00001",
        ensembl_transcript_id="ENST00001",
        ensembl_transcript_coding=False,
        ensembl_hgvs_c="n.111+2T>C",
        ensembl_hgvs_p="p.=",
        ensembl_effect=["synonymous_variant"],
    )
    # In-house summary counts matching the variant above.
    SmallVariantSummary.objects.create(
        release="GRCh37",
        chromosome="1",
        start=100,
        end=100,
        bin=binning.assign_bin(99, 100),
        reference="A",
        alternative="G",
        count_hom_ref=0,
        count_het=1,
        count_hom_alt=1,
        count_hemi_ref=0,
        count_hemi_alt=0,
    )
    # Gene annotation used by queries against this variant.
    Hgnc.objects.create(hgnc_id="HGNC:1", symbol="AAA")
    RefseqToHgnc.objects.create(entrez_id="1234", hgnc_id="HGNC:1")
    rebuild_case_variant_stats(SQLALCHEMY_ENGINE, case)
def run(self):
    """Convert the ClinVar XML at ``self.input`` to per-build TSV files.

    Iterates ``ClinVarSet`` elements with ``ET.iterparse``, flattens each
    measure's sequence locations into TSV rows, and writes them to
    ``self.out_b37`` or ``self.out_b38`` depending on the genome build.
    Stops early after ``self.max_rcvs`` records when configured.
    """
    logger.info("Parsing elements...")
    out_files = {"GRCh37": self.out_b37, "GRCh38": self.out_b38}
    for out_file in out_files.values():
        print(TSV_HEADER, file=out_file)
    with tqdm.tqdm(unit="rcvs") as progress:
        for event, elem in ET.iterparse(self.input):
            # Only process completely-read ClinVarSet elements.
            if elem.tag == "ClinVarSet" and event == "end":
                self.rcvs += 1
                clinvar_set = ClinVarSet.from_element(elem)
                if clinvar_set.ref_cv_assertion.observed_in:
                    origin = clinvar_set.ref_cv_assertion.observed_in.origin
                else:
                    origin = "."
                for genotype_set in clinvar_set.ref_cv_assertion.genotype_sets:
                    for measure_set in genotype_set.measure_sets:
                        for measure in measure_set.measures:
                            for build, location in measure.sequence_locations.items():
                                if build not in out_files:
                                    continue
                                elif location.ref is not None and location.alt is not None:
                                    # Classify the variant by allele lengths.
                                    if len(location.ref) == 1 and len(location.alt) == 1:
                                        variation_type = "snv"
                                    elif len(location.ref) == len(location.alt):
                                        variation_type = "mnv"
                                    else:
                                        variation_type = "indel"
                                    row = [
                                        build,
                                        location.chrom,
                                        location.start,
                                        location.stop,
                                        # 0-based start for binning.
                                        binning.assign_bin(
                                            location.start - 1, location.stop),
                                        location.ref,
                                        location.alt,
                                        variation_type,
                                        as_pg_list(measure.symbols),
                                        as_pg_list(measure.hgnc_ids),
                                        clinvar_set.ref_cv_assertion.id_no,
                                        clinvar_set.ref_cv_assertion.clinvar_accession,
                                        clinvar_set.ref_cv_assertion.gold_stars,
                                        clinvar_set.ref_cv_assertion.review_status,
                                        clinvar_set.ref_cv_assertion.pathogenicity,
                                        origin,
                                        measure_set.accession,
                                        # NOTE(review): quote re-escaping for TSV
                                        # import; replacing '"' with '"""' looks
                                        # suspicious -- confirm intended escaping.
                                        json.dumps(
                                            cattr.unstructure(clinvar_set),
                                            cls=DateTimeEncoder).replace(
                                            r"\"", "'").replace('"', '"""'),
                                    ]
                                    print("\t".join(map(str, row)), file=out_files[build])
                progress.update()
                # Release memory held by the processed XML element.
                elem.clear()
                if self.max_rcvs and self.rcvs >= self.max_rcvs:
                    logger.info(
                        "Breaking out after processing %d RCVs (as configured)", self.rcvs)
                    break
    logger.info("Done parsing elements")