def test_add_second_vcf_variant_set(self):
    """Load ``test2.vcf`` and check the resulting document counts.

    After the VCF is added the assertions expect three variant sets, two
    call sets, 42 calls and 22 variants, and that both the first stored
    variant and the variant named ``UNION_BC_k31_var_147`` are linked to
    three variant sets.
    """
    # This VCF only has one Variant which is not in the first VCF
    second_vcf = VCF(
        f="tests/vcf_tests/test2.vcf",
        reference_set_id=self.reference_set.id,
        method="CORTEX",
    )
    second_vcf.add_to_database()

    # Collection-level counts.
    assert VariantSet.objects().count() == 3
    assert VariantCallSet.objects().count() == 2
    assert VariantCall.objects().count() == 42
    assert Variant.objects().count() == 22

    # Variant-set membership of individual variants.
    first_variant = Variant.objects()[0]
    assert len(first_variant.variant_sets) == 3

    named_variant = Variant.objects.get(names="UNION_BC_k31_var_147")
    assert len(named_variant.variant_sets) == 3
def _remove_variant_set(self, variant_set_name):
    """Delete the named variant set together with documents that depend on it.

    The variant set is looked up by ``variant_set_name`` within
    ``self.reference_set``. It is detached from every call set that lists
    it; call sets left with fewer than two variant sets are deleted along
    with their calls. Variants that list this set and have exactly two
    variant sets are deleted, then the set's metadata and the set itself.

    Args:
        variant_set_name: ``name`` of the ``VariantSet`` to remove.
    """
    target_set = VariantSet.objects.get(
        name=variant_set_name,
        reference_set=self.reference_set,
    )

    for linked_call_set in VariantCallSet.objects(variant_sets=target_set):
        linked_call_set.variant_sets.remove(target_set)
        linked_call_set.save()
        # Remove calls from callsets that only have this variantset
        if len(linked_call_set.variant_sets) >= 2:
            continue
        VariantCall.objects(call_set=linked_call_set).delete()
        linked_call_set.delete()

    # Remove variants that are ONLY from this variant set
    # (size 2 presumably means this set plus the global one - TODO confirm).
    Variant.objects(variant_sets=target_set, variant_sets__size=2).delete()
    VariantSetMetadata.objects(variant_set=target_set).delete()
    target_set.delete()
def test_add_add_variants_and_calls(self):
    """Adding ``test.vcf`` should store 21 variant calls and 21 variants."""
    loader = VCF(
        f="tests/vcf_tests/test.vcf",
        reference_set_id=self.reference_set.id,
        method="CORTEX",
    )
    loader.add_to_database()

    expected_total = 21
    assert VariantCall.objects().count() == expected_total
    assert Variant.objects().count() == expected_total
def get_context(pos, kmer):
    """Return the split forms of variants that start near ``pos``.

    Queries ``Variant`` for documents whose ``start`` lies strictly between
    ``pos - kmer`` and ``pos + kmer`` and is not ``pos`` itself, then
    gathers everything each match's ``split()`` yields into one flat list.

    Args:
        pos: Position around which to search.
        kmer: Half-width of the search window (exclusive on both sides).

    Returns:
        A list of the items produced by ``split()`` for each matching
        variant, in query order.
    """
    nearby_variants = Variant.objects(
        start__ne=pos,
        start__gt=pos - kmer,
        start__lt=pos + kmer,
    )
    neighbours = []
    for nearby in nearby_variants:
        neighbours.extend(nearby.split())
    return neighbours
def test_add_new_vcf_variant_set(self):
    """Adding ``test.vcf`` should leave two variant sets in the database.

    Also checks that the first stored variant is linked to two variant
    sets and that the first stored variant set is named ``test.vcf``.
    """
    loaded_vcf = VCF(
        f="tests/vcf_tests/test.vcf",
        reference_set_id=self.reference_set.id,
        method="CORTEX",
    )
    loaded_vcf.add_to_database()

    # We create a global variant set as well as one for the individual VCF
    assert VariantSet.objects().count() == 2

    first_set = VariantSet.objects()[0]
    first_variant = Variant.objects()[0]
    assert len(first_variant.variant_sets) == 2
    assert first_set.name == "test.vcf"
def test_add_second_vcf_variant_set(self):
    """Load ``test3.vcf`` and check overall counts and per-type variant counts.

    Expects two variant sets, one call set, 106 calls and 106 variants,
    broken down as 89 SNPs and 17 indels (8 insertions, 8 deletions), plus
    a single entry reported by ``Variant.ph_snps``.
    """
    # NOTE(review): the comment originally here read "This VCF only has one
    # Variant which is not in the first VCF"; given the 106-variant
    # assertions below it may have been copied from another test - confirm.
    third_vcf = VCF(
        f="tests/vcf_tests/test3.vcf",
        reference_set_id=self.reference_set.id,
        method="CORTEX",
    )
    third_vcf.add_to_database()

    # Collection-level counts.
    assert VariantSet.objects().count() == 2
    assert VariantCallSet.objects().count() == 1
    assert VariantCall.objects().count() == 106
    assert Variant.objects().count() == 106

    # Breakdown by variant type.
    assert Variant.snps().count() == 89
    assert Variant.indels().count() == 17
    assert Variant.insertions().count() == 8
    assert Variant.deletions().count() == 8
    assert Variant.ph_snps.count() == 1
def run(parser, args):
    """Build probes for the requested variants and write them to stdout.

    Steps:
      1. Connect to the ``mykrobe-<db_name>`` database; if the server cannot
         be reached ``DB`` is reset to ``None`` and probes are made without
         genetic backgrounds.
      2. Collect ``Mutation`` objects from one of the supported inputs.
         When ``args.vcf`` is set, ``run_make_probes_from_vcf_file`` is
         called instead and no mutations are collected here.
      3. For each mutation, build a variant panel and write one
         ``>ref-...`` / ``>alt-...`` header line followed by a sequence line
         per reference and alternate allele.

    Args:
        parser: Not used in this function.
        args: Parsed command-line namespace. Reads ``db_name``,
            ``reference_filepath``, ``vcf``, ``genbank``, ``text_file``,
            ``variants``, ``kmer`` and ``no_backgrounds``.
    """
    DB = connect('mykrobe-%s' % (args.db_name))
    if DB is not None:
        try:
            # Probe the connection; a timeout is handled just below.
            Variant.objects()
            logger.info("Connected to mykrobe-%s", args.db_name)
        except (ServerSelectionTimeoutError):
            DB = None
            logger.warning(
                "Could not connect to database. Continuing without using genetic backgrounds")

    mutations = []
    reference = os.path.basename(args.reference_filepath).split('.fa')[0]
    if args.vcf:
        run_make_probes_from_vcf_file(args)
    elif args.genbank:
        aa2dna = GeneAminoAcidChangeToDNAVariants(
            args.reference_filepath, args.genbank)
        if args.text_file:
            with open(args.text_file, 'r') as infile:
                reader = csv.reader(infile, delimiter="\t")
                for row in reader:
                    gene, mutation_string, alphabet = row
                    # Anything not flagged as DNA is treated as protein coding.
                    protein_coding_var = alphabet != "DNA"
                    for var_name in aa2dna.get_variant_names(
                            gene, mutation_string, protein_coding_var):
                        mutations.append(
                            Mutation(
                                reference=reference,
                                var_name=var_name,
                                gene=aa2dna.get_gene(gene),
                                mut=mutation_string))
        else:
            for variant in args.variants:
                gene, mutation = variant.split("_")
                for var_name in aa2dna.get_variant_names(gene, mutation):
                    mutations.append(
                        Mutation(
                            reference=reference,
                            var_name=var_name,
                            gene=gene,
                            mut=mutation))
    else:
        if args.text_file:
            with open(args.text_file, 'r') as infile:
                reader = csv.reader(infile, delimiter="\t")
                for row in reader:
                    # Unpacking all five columns keeps the row-shape check.
                    row_gene, pos, ref, alt, _alphabet = row
                    if row_gene == "ref":
                        var_name = "".join([ref, pos, alt])
                    else:
                        var_name = row_gene
                    mutations.append(
                        Mutation(reference=reference, var_name=var_name))
        else:
            mutations.extend(
                Mutation(reference=reference, var_name=v)
                for v in args.variants)

    al = AlleleGenerator(
        reference_filepath=args.reference_filepath, kmer=args.kmer)
    total = len(mutations)
    for enum, mut in enumerate(mutations):
        if enum % 100 == 0:
            logger.info(
                "%i of %i - %f%%", enum, total, round(100*enum/total, 2))
        variant_panel = make_variant_probe(
            al, mut.variant, args.kmer, DB=DB,
            no_backgrounds=args.no_backgrounds)
        if variant_panel is None:
            logger.warning(
                "All variants failed for %s_%s - %s",
                mut.gene, mut.mut, mut.variant)
            continue

        # Resolve the gene label once per mutation, before either loop, so
        # the alt headers never see an unbound name or a label left over
        # from a previous mutation when the panel has no refs.
        try:
            gene_name = mut.gene.name
        except AttributeError:
            gene_name = "NA"

        for i, ref in enumerate(variant_panel.refs):
            sys.stdout.write(
                ">ref-%s?var_name=%s&num_alts=%i&ref=%s&enum=%i&gene=%s&mut=%s\n" %
                (mut.mut, mut.variant.var_name, len(variant_panel.alts),
                 mut.reference, i, gene_name, mut.mut))
            sys.stdout.write("%s\n" % ref)
        for i, a in enumerate(variant_panel.alts):
            sys.stdout.write(
                ">alt-%s?var_name=%s&enum=%i&gene=%s&mut=%s\n" %
                (mut.mut, mut.variant.var_name, i, gene_name, mut.mut))
            sys.stdout.write("%s\n" % a)
def run(parser, args):
    """Build probes for the requested variants and write them to stdout.

    Steps:
      1. Unless ``args.no_backgrounds`` is set, connect to the
         ``<DB_PREFIX>-<db_name>`` database; if the server cannot be reached
         ``DB`` is reset to ``None`` and probes are made without genetic
         backgrounds.
      2. Collect ``Mutation`` objects from one of the supported inputs.
         When ``args.vcf`` is set, ``run_make_probes_from_vcf_file`` is
         called instead and no mutations are collected here. For a plain
         text file, the lineages returned by ``load_dna_vars_txt_file`` are
         dumped as JSON to ``args.lineage`` when that option is given.
      3. For each mutation, build a variant panel and write one
         ``>ref-...`` / ``>alt-...`` header line followed by a sequence line
         per reference and alternate allele.

    Args:
        parser: Not used in this function.
        args: Parsed command-line namespace. Reads ``no_backgrounds``,
            ``db_name``, ``reference_filepath``, ``vcf``, ``genbank``,
            ``text_file``, ``lineage``, ``variants`` and ``kmer``.
    """
    # There's no need to try to connect to database if we're not doing backgrounds
    if args.no_backgrounds:
        logger.info(
            "Not connecting to database, because --no-backgrounds option used")
        DB = None
    else:
        DB = connect("%s-%s" % (DB_PREFIX, args.db_name))

    if DB is not None:
        try:
            # Probe the connection; a timeout is handled just below.
            Variant.objects()
            logger.info("Connected to %s-%s", DB_PREFIX, args.db_name)
        except (ServerSelectionTimeoutError):
            DB = None
            logger.warning(
                "Could not connect to database. Continuing without using genetic backgrounds"
            )

    mutations = []
    lineages = set()
    reference = os.path.basename(args.reference_filepath).split(".fa")[0]
    if args.vcf:
        run_make_probes_from_vcf_file(args)
    elif args.genbank:
        aa2dna = GeneAminoAcidChangeToDNAVariants(args.reference_filepath,
                                                  args.genbank)
        if args.text_file:
            with open(args.text_file, "r") as infile:
                reader = csv.reader(infile, delimiter="\t")
                for row in reader:
                    gene, mutation_string, alphabet = row
                    # Anything not flagged as DNA is treated as protein coding.
                    protein_coding_var = alphabet != "DNA"
                    for var_name in aa2dna.get_variant_names(
                            gene, mutation_string, protein_coding_var):
                        mutations.append(
                            Mutation(
                                reference=reference,
                                var_name=var_name,
                                gene=aa2dna.get_gene(gene),
                                mut=mutation_string,
                                protein_coding_var=protein_coding_var,
                            ))
        else:
            for variant in args.variants:
                gene, mutation = variant.split("_")
                for var_name in aa2dna.get_variant_names(gene, mutation):
                    mutations.append(
                        Mutation(
                            reference=reference,
                            var_name=var_name,
                            gene=gene,
                            mut=mutation,
                        ))
    else:
        if args.text_file:
            mutations, lineages = load_dna_vars_txt_file(
                args.text_file, reference)
            if args.lineage:
                with open(args.lineage, "w") as f:
                    json.dump(lineages, f, sort_keys=True, indent=2)
        else:
            mutations.extend(
                Mutation(reference=reference, var_name=v)
                for v in args.variants)

    al = AlleleGenerator(reference_filepath=args.reference_filepath,
                         kmer=args.kmer)
    total = len(mutations)
    for enum, mut in enumerate(mutations):
        if enum % 100 == 0:
            logger.info(
                "%i of %i - %f%%", enum, total, round(100 * enum / total, 2))
        variant_panel = make_variant_probe(al,
                                           mut.variant,
                                           args.kmer,
                                           DB=DB,
                                           no_backgrounds=args.no_backgrounds)
        if variant_panel is None:
            logger.warning(
                "All variants failed for %s_%s - %s",
                mut.gene, mut.mutation_output_name, mut.variant)
            continue

        # Resolve the gene label once per mutation, before either loop, so
        # the alt headers never see an unbound name or a label left over
        # from a previous mutation when the panel has no refs.
        try:
            gene_name = mut.gene.name
        except AttributeError:
            gene_name = "NA"

        for i, ref in enumerate(variant_panel.refs):
            sys.stdout.write(
                ">ref-%s?var_name=%s&num_alts=%i&ref=%s&enum=%i&gene=%s&mut=%s\n"
                % (
                    mut.mutation_output_name,
                    mut.variant.var_name,
                    len(variant_panel.alts),
                    mut.reference,
                    i,
                    gene_name,
                    mut.mutation_output_name,
                ))
            sys.stdout.write("%s\n" % ref)
        for i, a in enumerate(variant_panel.alts):
            sys.stdout.write(
                ">alt-%s?var_name=%s&enum=%i&gene=%s&mut=%s\n" % (
                    mut.mutation_output_name,
                    mut.variant.var_name,
                    i,
                    gene_name,
                    mut.mutation_output_name,
                ))
            sys.stdout.write("%s\n" % a)