def execute(args):
    """Annotate the VCF in ``args.vcf`` with values from a tabular annotation file.

    The YAML config at ``args.config`` describes the annotation source file and
    the values to add; the annotated VCF/BCF is written to ``args.output`` in
    the format selected by ``args.output_fmt``. Exits with status 1 on a
    malformed config file.
    """
    with open(args.config, "r") as stream:
        try:
            config = yaml.safe_load(stream)
        except yaml.YAMLError as e:
            print(e, file=stderr)
            exit(1)

    # load annotation data as a structured array (one named column per header field)
    ann_data = np.genfromtxt(
        config["annotation"]["file"],
        delimiter=config["annotation"].get("delimiter", "\t"),
        names=True,
        dtype=None,
        encoding=None,
    )

    # Build a single tuple expression "(expr1,expr2,...)" so that all configured
    # values can be evaluated in one pass per record.
    expression = ",".join(
        str(value["value"]["expression"]) for value in config["annotation"]["values"]
    )
    expression = f"({expression})"

    with VariantFile(args.vcf) as vcf:
        # declare every new INFO field in the output header
        for value in config["annotation"]["values"]:
            value = value["value"]
            vcf.header.add_meta(
                "INFO",
                items=[
                    ("ID", value["vcf_name"]),
                    ("Number", value["number"]),
                    ("Type", value["type"]),
                    ("Description", value["description"]),
                ],
            )

        # map the CLI output-format choice onto pysam's write-mode suffix
        fmt = {"vcf": "", "bcf": "b", "uncompressed-bcf": "u"}[args.output_fmt]
        with VariantFile(
            args.output,
            f"w{fmt}",
            header=vcf.header,
        ) as out:
            variants = annotate_vcf(
                vcf,
                expression,
                args.annotation_key,
                ann_data=ann_data,
                config=config,
            )
            for v in variants:
                out.write(v)
def filter_vcf(
    vcf: VariantFile,
    expression: str,
    ann_key: str,
    keep_unmatched: bool = False,
    preserve_order: bool = False,
    auxiliary: Dict[str, Set[str]] = None,  # None → {}; avoids shared mutable default
    overwrite_number: Dict[str, str] = None,  # None → {}; avoids shared mutable default
) -> Iterator[VariantRecord]:
    """Yield the records of ``vcf`` that pass ``expression``.

    Breakend (BND) records are compound events and are handled jointly: a first
    pass collects the EVENT ids of passing BNDs (and yields regular passing
    records unless ``preserve_order``), and a second pass over the file yields
    the BND records belonging to those events (plus, with ``preserve_order``,
    everything else in file order).
    """
    # fix: the defaults were mutable dicts ({}), which are shared across calls
    if auxiliary is None:
        auxiliary = {}
    if overwrite_number is None:
        overwrite_number = {}

    env = Environment(expression, ann_key, vcf.header, auxiliary, overwrite_number)

    events = set()
    info_keys = set(vcf.header.info.keys())
    record: VariantRecord
    for idx, record in enumerate(vcf):
        record, record_has_passed = test_and_update_record(
            env, idx, record, ann_key, keep_unmatched
        )
        if record_has_passed:
            is_bnd = "SVTYPE" in info_keys and record.info.get("SVTYPE", None) == "BND"
            if is_bnd:
                # remember the event so all of its mates are emitted in pass two
                event = record.info.get("EVENT", None)
                events.add(event)
            elif not preserve_order:
                # if preserve_order is True, we will output everything
                # in the second pass instead
                yield record

    if len(events) > 0:
        # perform a second pass if the first pass filtered breakend (BND) events
        # since these are compound events which have to be filtered jointly
        vcf.reset()
        for idx, record in enumerate(vcf):
            is_bnd = "SVTYPE" in info_keys and record.info.get("SVTYPE", None) == "BND"
            event = record.info.get("EVENT", None)
            if is_bnd:
                if event not in events:
                    # only bnds with a valid associated event need to be
                    # considered, so skip the rest
                    continue
            else:
                if not preserve_order:
                    continue
            # NOTE(review): the pass/fail flag is discarded here, so records
            # reaching this point are yielded regardless of the filter result —
            # intended for BND mates of passing events; confirm for the
            # preserve_order non-BND case.
            record, _ = test_and_update_record(env, idx, record, ann_key, keep_unmatched)
            yield record
def execute(args):
    """Filter ``args.vcf`` with ``args.expression`` and write the surviving records.

    Records the vembrane version and the (re-quoted) command line in the output
    header, optionally collects statistics, and exits with status 1 on the
    first ``VembraneError`` raised during filtering or writing.
    """
    aux = read_auxiliary(args.aux)
    with VariantFile(args.vcf) as vcf:
        header: VariantHeader = vcf.header
        header.add_meta("vembraneVersion", __version__)
        # NOTE: If .modules.filter.execute might be used as a library function
        # in the future, we should not record sys.argv directly below.
        header.add_meta(
            "vembraneCmd",
            "vembrane "
            + " ".join(
                # fix: the closing quote was a double quote, producing
                # mismatched quoting like 'arg" — wrap args containing
                # spaces in matching single quotes
                "'" + arg.replace("'", '"') + "'" if " " in arg else arg
                for arg in sys.argv[1:]
            ),
        )

        records = filter_vcf(
            vcf,
            args.expression,
            args.annotation_key,
            keep_unmatched=args.keep_unmatched,
            preserve_order=args.preserve_order,
            auxiliary=aux,
            overwrite_number=args.overwrite_number,
        )

        # Pull the first record eagerly so expression errors surface before the
        # output file is created.
        try:
            first_record = list(islice(records, 1))
        except VembraneError as ve:
            print(ve, file=stderr)
            exit(1)
        records = chain(first_record, records)

        if args.statistics is not None:
            records = statistics(records, vcf, args.statistics, args.annotation_key)

        fmt = {"vcf": "", "bcf": "b", "uncompressed-bcf": "u"}[args.output_fmt]
        with VariantFile(
            args.output,
            f"w{fmt}",
            header=header,
        ) as out:
            try:
                for record in records:
                    out.write(record)
            except VembraneError as ve:
                print(ve, file=stderr)
                exit(1)
def remove_invalid_snv_ids(vcf_file):
    """Rewrite ``vcf_file`` in place, dropping records whose ID contains "_".

    Records with an underscore in their ID are logged and skipped; everything
    else is copied unchanged via a temporary file that atomically replaces the
    original.
    """
    with TemporaryDirectory(dir=".") as wdir:
        temp_file = join(wdir, basename(vcf_file))
        # fix: the input VariantFile was never closed; also close both files
        # before renaming over the original (rename onto an open file fails
        # on some platforms)
        with VariantFile(vcf_file) as vcf:
            with VariantFile(temp_file, 'w', header=vcf.header) as out:
                for line in vcf:
                    snv_id = line.id
                    if snv_id is not None and "_" in snv_id:
                        warning("Invalid SNV id found will be filtered out: %s" % snv_id)
                        continue
                    out.write(line)
        safe_rename(temp_file, vcf_file)
def save_samples_to_db(self):
    """Mark this file as saved, then persist each of its VCF records to the DB.

    The first record determines the sample set (via ``parse_samples``); if the
    file has no samples, or no new samples are detected, nothing is stored.
    Both the flag update and the record import run in their own transaction.
    """
    from vcf_uploading.types import SamplesDict
    from vcf_uploading.utils import are_samples_empty, parse_samples, save_record_to_db

    with transaction.atomic():
        self.saved = True
        self.save()

    logger.info("Trying to read VCF file with pysam")
    vcf: VariantFile = VariantFile(self.file.path)

    with transaction.atomic():
        samples: Optional[SamplesDict] = None
        for i, record in enumerate(vcf.fetch()):
            if i % 100 == 1:
                logger.debug("{} records processed", i)

            # The sample set is derived once, from the very first record.
            if i == 0:
                if are_samples_empty(record):
                    break
                samples = parse_samples(record, self)
                if not samples:
                    logger.info("No new samples detected. Breaking")
                    break

            save_record_to_db(record=record, samples=samples)

    logger.info("File is saved to the database")
    logger.debug("File.saved: {}", self.saved)
def get_samples(self) -> List[str]:
    """Return the sample names found in the first record of this VCF file.

    Raises ``StopIteration`` if the file contains no records — TODO confirm
    callers only invoke this on non-empty files.
    """
    vcf_file_path = Path(self.file.path)
    # fix: close the pysam handle instead of leaking it
    with VariantFile(vcf_file_path) as pysam_vcf:
        record = next(pysam_vcf.fetch())
        samples: List[str] = list(record.samples.keys())
    return samples
def calculate_statistics(self):
    """Calculate statistics of VCF file

    The following statistics are calculated
      1. Number of samples in file
      2. Number of REF matches
      3. Number of ALTs
      4. Number of missing genotypes

    :return samples_statistics: Dict[str, SampleStatistics].
      Keys of the dictionary are samples' names.
      Values are dictionaries with keys:
        * n_refs: int — number of alleles that are identical to a reference
        * n_alts: int — number of alleles that are not identical to a reference
        * n_missing: int — number of alleles with unknown genotype
    """
    from vcf_uploading.types import SampleStatistics

    logger.info("Trying to read VCF file with pysam")
    vcf: VariantFile = VariantFile(self.file.path)

    self.n_refs = 0
    self.n_missing_genotypes = 0
    self.n_alts = 0
    self.n_samples = 0
    self.n_records = 0

    samples_statistics: Dict[str, SampleStatistics] = {}

    for i, record in enumerate(vcf.fetch()):
        for sample in record.samples:
            indices: Tuple[int, ...] = record.samples[sample].allele_indices  # e.g. (0, 1)
            n_refs = indices.count(0)
            n_missing = indices.count(None)
            # assumes diploid genotypes (two alleles per sample) — TODO confirm
            n_alts = 2 - n_refs - n_missing

            self.n_refs += n_refs
            self.n_missing_genotypes += n_missing
            self.n_alts += n_alts

            if sample in samples_statistics:
                samples_statistics[sample]["n_refs"] += n_refs
                samples_statistics[sample]["n_alts"] += n_alts
                samples_statistics[sample]["n_missing"] += n_missing
            else:
                # fix: these were initialized to 0, which silently dropped the
                # first record's counts for every sample
                samples_statistics[sample] = {
                    "n_refs": n_refs,
                    "n_alts": n_alts,
                    "n_missing": n_missing,
                }

    vcf.close()  # fix: the pysam handle was never closed

    if "record" in locals():  # Cycle executed at least once
        self.n_samples = len(record.samples)
        self.n_records = i + 1

    self.save()

    return samples_statistics
def find_similar_samples_in_db(self):
    """Compare every sample in this file against the samples stored in the DB.

    For each record, per-sample similarity scores are accumulated per DB
    sample; the averaged scores are returned as
    ``{file_sample: {db_sample: avg_similarity}}``.
    """
    from .utils import get_average_similarities

    logger.info("Trying to find similar samples in the DB for file {}", self.file.name)

    # one score list per (file sample, db sample) pair
    similar_samples: Dict[str, Dict[str, List[float]]] = {
        name: defaultdict(list) for name in self.get_samples()
    }

    vcf: VariantFile = VariantFile(self.file.path)

    # TODO: this can be done MUCH faster
    # We can check for a few tens of SNPs from file. If a database
    # sample has different genotypes in most of them, we can exclude it
    # from the further checks
    for record_no, record in enumerate(vcf):
        if record_no % 100 == 1:
            logger.info("{} records processed", record_no)

        snp = SNP.from_record(record)

        for sample_name, sample in record.samples.items():
            if snp is None:
                # unknown SNP: count a zero similarity against every DB sample
                for db_sample in Sample.objects.all():
                    similar_samples[sample_name][db_sample.cypher].append(0)
                continue

            # skip fully missing genotypes
            if all(allele is None for allele in sample.alleles):
                continue

            db_samples_similarity = SNP.calculate_similarity_to_each_sample(
                snp, sample.alleles
            )
            for db_sample, similarity in db_samples_similarity.items():
                similar_samples[sample_name][db_sample].append(similarity)

    similarities: Dict[str, Dict[str, float]] = get_average_similarities(similar_samples)
    return similarities
def predict_nationality(self) -> Dict[str, Dict[str, float]]:
    """Predict nationalities for each sample in `self.file`

    :return samples_nationalities: Dict[str, Dict[str, float]] - a dictionary,
      where the keys are the samples, and the values are the prediction
      of nationalities. In the predictions, keys are nationalities,
      and values are their probabilities
    """
    logger.info("Predicting nationality for RawVCF")
    predictions = {}

    for sample in self.get_samples():
        logger.info("Predicting nationality for sample {}", sample)
        sample_vcf: VariantFile = VariantFile(self.file.path)
        try:
            sample_vcf.subset_samples([sample])
            predictor = FastNGSAdmixPredictor(sample_vcf)
            predictions[sample] = predictor.predict()
        finally:
            # fix: one VariantFile handle was leaked per sample; assumes the
            # predictor does not read from the file after predict() returns —
            # TODO confirm
            sample_vcf.close()

    logger.info("Returning nationality predictions")
    logger.debug("Predictions: {}", predictions)
    return predictions