def populate_from_vcf(self):
    """
    Stream the records of ``self.vcf_reader`` into the database in batches.

    Each record is converted by ``self._prepare_variation`` into a variant
    row, a sequence of impact rows and a mapping of extra fields.  Rows are
    buffered and handed to ``database.insert_variation`` and
    ``database.insert_variation_impacts`` whenever ``self.buffer_size``
    variants have accumulated, and once more after the last record.

    When ``self.args.passonly`` is set, records whose FILTER is neither
    ``None`` nor ``"."`` are counted in ``self.skipped`` and not loaded.

    Non-empty extra fields are written, one JSON document per line, to the
    extra file named by ``gemini_annotate.get_extra_files``.  The headers
    accumulated through ``self._update_extra_headers`` are dumped as JSON to
    the companion header file; if none were collected the extra file is
    removed instead.
    """
    import gemini_annotate  # avoid circular dependencies

    def report(count, suffix):
        # every status line starts with the id of the loading process
        sys.stderr.write("".join(("pid ", str(os.getpid()), ": ",
                                  str(count), suffix)))

    self.v_id = self._get_vid()
    self.counter = 0
    self.var_buffer = []
    self.var_impacts_buffer = []
    pending = 0  # variants buffered since the last flush
    self.skipped = 0

    extra_path, header_path = gemini_annotate.get_extra_files(self.args.db)
    seen_headers = {}
    with open(extra_path, "w") as extra_out:
        for record in self.vcf_reader:
            if self.args.passonly:
                flt = record.FILTER
                if flt is not None and flt != ".":
                    self.skipped += 1
                    continue

            variant, impacts, extras = self._prepare_variation(record)
            if extras:
                extra_out.write("%s\n" % json.dumps(extras))
                seen_headers = self._update_extra_headers(seen_headers,
                                                          extras)

            # one core row per variant, one impact row per gene/transcript
            self.var_buffer.append(variant)
            self.var_impacts_buffer.extend(impacts)
            pending += 1

            if pending >= self.buffer_size:
                # the count shown does not yet include the current record
                report(self.counter, " variants processed.\n")
                database.insert_variation(self.c, self.var_buffer)
                database.insert_variation_impacts(self.c,
                                                  self.var_impacts_buffer)
                # start a fresh batch
                self.var_buffer, self.var_impacts_buffer = [], []
                pending = 0

            self.v_id += 1
            self.counter += 1

    if not seen_headers:
        os.remove(extra_path)
    else:
        with open(header_path, "w") as header_out:
            header_out.write(json.dumps(seen_headers))

    # the loop leaves v_id one past the last loaded variant
    self.v_id -= 1
    # write whatever is left in the buffers
    database.insert_variation(self.c, self.var_buffer)
    database.insert_variation_impacts(self.c, self.var_impacts_buffer)
    report(self.counter, " variants processed.\n")
    if self.args.passonly:
        report(self.skipped,
               " skipped due to having the " "FILTER field set.\n")
def populate_from_vcf(self):
    """
    Read ``self.vcf_reader`` record by record and batch-insert the results.

    Records without an ALT allele are ignored.  The first time a record
    with more than one ALT is met while ``self.seen_multi`` is false,
    ``self._multiple_alts_message()`` is called.  With
    ``self.args.passonly`` set, records whose FILTER is neither ``None``
    nor ``"."`` are tallied in ``self.skipped`` and not loaded.

    For every remaining record ``self._prepare_variation`` supplies the
    variant row, its impact rows and a mapping of extra fields; the values
    of the keys listed in ``self._extra_effect_fields`` are appended to the
    variant row.  Buffers are flushed through ``database.insert_variation``
    and ``database.insert_variation_impacts`` every ``self.buffer_size``
    variants and once more at the end.
    """
    def report(count, suffix):
        # status lines carry the id of the loading process
        sys.stderr.write("".join(("pid ", str(os.getpid()), ": ",
                                  str(count), suffix)))

    self.v_id = self._get_vid()
    self.counter = 0
    self.var_buffer = []
    self.var_impacts_buffer = []
    self.skipped = 0
    # The record objects are only borrowed on the Python side, so keep a
    # reference to each one behind a buffered row until the batch is
    # written.
    held_records = []

    for record in self.vcf_reader:
        if not record.ALT or len(record.ALT) == 0:
            continue
        if len(record.ALT) > 1 and not self.seen_multi:
            self._multiple_alts_message()
        if self.args.passonly:
            flt = record.FILTER
            if flt is not None and flt != ".":
                self.skipped += 1
                continue

        variant, impacts, extras = self._prepare_variation(record)
        variant.extend([extras.get(name)
                        for name in self._extra_effect_fields])
        held_records.append(record)

        # one core row per variant, one impact row per gene/transcript
        self.var_buffer.append(variant)
        self.var_impacts_buffer.extend(impacts)

        if len(self.var_buffer) >= self.buffer_size:
            # the count shown does not yet include the current record
            report(self.counter, " variants processed.\n")
            database.insert_variation(self.c, self.var_buffer)
            database.insert_variation_impacts(self.c,
                                              self.var_impacts_buffer)
            # start a fresh batch
            held_records = []
            self.var_buffer = []
            self.var_impacts_buffer = []

        self.v_id += 1
        self.counter += 1

    # the loop leaves v_id one past the last loaded variant
    self.v_id -= 1
    # write whatever is left in the buffers
    database.insert_variation(self.c, self.var_buffer)
    database.insert_variation_impacts(self.c, self.var_impacts_buffer)
    report(self.counter, " variants processed.\n")
    if self.args.passonly:
        report(self.skipped,
               " skipped due to having the " "FILTER field set.\n")
def populate_from_vcf(self):
    """
    Load the usable records of ``self.vcf_reader`` into the database.

    Skipping rules, applied in this order:
      * a record with no ALT allele is dropped without being counted;
      * if ``self.args.passonly`` is set, a record whose FILTER is neither
        ``None`` nor ``"."`` is counted in ``self.skipped`` and dropped.
    A record with several ALT alleles triggers
    ``self._multiple_alts_message()`` as long as ``self.seen_multi`` is
    false.

    ``self._prepare_variation`` turns each kept record into a variant row,
    impact rows and extra fields; the extra-field values named by
    ``self._extra_effect_fields`` are appended to the variant row.  Rows are
    inserted with ``database.insert_variation`` /
    ``database.insert_variation_impacts`` in batches of
    ``self.buffer_size`` variants, plus one final insert after the loop.
    """
    status = sys.stderr.write

    self.v_id = self._get_vid()
    self.counter = 0
    self.var_buffer = []
    self.var_impacts_buffer = []
    self.skipped = 0
    # Records are only borrowed from the reader; referencing them here
    # keeps them alive while their rows sit in the buffers.
    borrowed = []

    for rec in self.vcf_reader:
        if not rec.ALT or len(rec.ALT) == 0:
            continue
        if len(rec.ALT) > 1 and not self.seen_multi:
            self._multiple_alts_message()
        if self.args.passonly and not (rec.FILTER is None
                                       or rec.FILTER == "."):
            self.skipped += 1
            continue

        row, impact_rows, extras = self._prepare_variation(rec)
        extra_values = [extras.get(key)
                        for key in self._extra_effect_fields]
        row.extend(extra_values)
        borrowed.append(rec)

        # core variant row, then its impact rows (1 per gene/transcript)
        self.var_buffer.append(row)
        self.var_impacts_buffer.extend(impact_rows)

        batch_full = len(self.var_buffer) >= self.buffer_size
        if batch_full:
            # the count shown does not yet include the current record
            status("pid " + str(os.getpid()) + ": " + str(self.counter)
                   + " variants processed.\n")
            database.insert_variation(self.c, self.var_buffer)
            database.insert_variation_impacts(self.c,
                                              self.var_impacts_buffer)
            # start a fresh batch
            borrowed = []
            self.var_buffer, self.var_impacts_buffer = [], []

        self.v_id += 1
        self.counter += 1

    # the loop leaves v_id one past the last loaded variant
    self.v_id -= 1
    # flush the remainder
    database.insert_variation(self.c, self.var_buffer)
    database.insert_variation_impacts(self.c, self.var_impacts_buffer)
    status("pid " + str(os.getpid()) + ": " + str(self.counter)
           + " variants processed.\n")
    if self.args.passonly:
        status("pid " + str(os.getpid()) + ": " + str(self.skipped)
               + " skipped due to having the " "FILTER field set.\n")
def populate_from_vcf(self):
    """
    Batch-load the records of ``self.vcf_reader`` into the database.

    ``self._prepare_variation`` converts each record into a variant row and
    its impact rows.  Rows are buffered and written with
    ``database.insert_variation`` / ``database.insert_variation_impacts``
    each time ``self.buffer_size`` variants have accumulated, and once more
    after the last record.

    When ``self.args.passonly`` is set, records whose FILTER is neither
    ``None`` nor ``"."`` are counted in ``self.skipped`` and not loaded.
    Progress and the skip total are reported on stderr.
    """
    def report(count, suffix):
        # status lines carry the id of the loading process
        sys.stderr.write("".join(("pid ", str(os.getpid()), ": ",
                                  str(count), suffix)))

    self.v_id = self._get_vid()
    self.counter = 0
    self.var_buffer = []
    self.var_impacts_buffer = []
    pending = 0  # variants buffered since the last flush
    self.skipped = 0

    for record in self.vcf_reader:
        if self.args.passonly:
            flt = record.FILTER
            if flt is not None and flt != ".":
                self.skipped += 1
                continue

        variant, impacts = self._prepare_variation(record)

        # one core row per variant, one impact row per gene/transcript
        self.var_buffer.append(variant)
        self.var_impacts_buffer.extend(impacts)
        pending += 1

        if pending >= self.buffer_size:
            # the count shown does not yet include the current record
            report(self.counter, " variants processed.\n")
            database.insert_variation(self.c, self.var_buffer)
            database.insert_variation_impacts(self.c,
                                              self.var_impacts_buffer)
            # start a fresh batch
            self.var_buffer, self.var_impacts_buffer = [], []
            pending = 0

        self.v_id += 1
        self.counter += 1

    # the loop leaves v_id one past the last loaded variant
    self.v_id -= 1
    # write whatever is left in the buffers
    database.insert_variation(self.c, self.var_buffer)
    database.insert_variation_impacts(self.c, self.var_impacts_buffer)
    report(self.counter, " variants processed.\n")
    if self.args.passonly:
        report(self.skipped,
               " skipped due to having the " "FILTER field set.\n")
def populate_from_vcf(self):
    """
    Load every record of ``self.vcf_reader`` into the database in batches.

    ``self.v_id`` starts at 1 and is advanced once per record, so when the
    method returns it is one greater than the number of records read.
    ``self._prepare_variation`` converts each record into a variant row and
    its impact rows; they are buffered and written with
    ``database.insert_variation`` / ``database.insert_variation_impacts``
    every ``self.buffer_size`` variants and once more after the loop.
    A progress line is written to stderr at every flush and at the end.
    """
    self.v_id = 1
    self.var_buffer = []
    self.var_impacts_buffer = []

    # process and load each variant in the VCF file
    for var in self.vcf_reader:
        (variant, variant_impacts) = self._prepare_variation(var)
        # add the core variant info to the variant buffer
        self.var_buffer.append(variant)
        # add each of the impacts for this variant (1 per gene/transcript)
        self.var_impacts_buffer.extend(variant_impacts)

        # buffer full - time to insert into DB
        if len(self.var_buffer) >= self.buffer_size:
            # v_id has not been advanced for this record yet, so here it
            # equals the number of records read so far
            sys.stderr.write(str(self.v_id) + " variants processed.\n")
            database.insert_variation(self.c, self.var_buffer)
            database.insert_variation_impacts(self.c,
                                              self.var_impacts_buffer)
            # reset for the next batch
            self.var_buffer = []
            self.var_impacts_buffer = []
        self.v_id += 1

    # final load to the database
    database.insert_variation(self.c, self.var_buffer)
    database.insert_variation_impacts(self.c, self.var_impacts_buffer)
    # After the loop v_id is one past the last record; report the number
    # actually processed rather than the next free id.
    sys.stderr.write(str(self.v_id - 1) + " variants processed.\n")
def populate_from_vcf(self):
    """
    Batch-load every record of ``self.vcf_reader`` into the database.

    Starting from the id returned by ``self._get_vid()``, each record is
    converted by ``self._prepare_variation`` into a variant row and its
    impact rows.  Rows are buffered and written with
    ``database.insert_variation`` / ``database.insert_variation_impacts``
    each time ``self.buffer_size`` variants have accumulated, and once more
    after the last record.  ``self.counter`` holds the number of records
    read; it is reported on stderr at every flush and at the end.
    """
    def report():
        sys.stderr.write("".join((str(self.counter),
                                  " variants processed.\n")))

    self.v_id = self._get_vid()
    self.counter = 0
    self.var_buffer = []
    self.var_impacts_buffer = []
    pending = 0  # variants buffered since the last flush

    for record in self.vcf_reader:
        variant, impacts = self._prepare_variation(record)

        # one core row per variant, one impact row per gene/transcript
        self.var_buffer.append(variant)
        self.var_impacts_buffer.extend(impacts)
        pending += 1

        if pending >= self.buffer_size:
            # the count shown does not yet include the current record
            report()
            database.insert_variation(self.c, self.var_buffer)
            database.insert_variation_impacts(self.c,
                                              self.var_impacts_buffer)
            # start a fresh batch
            self.var_buffer, self.var_impacts_buffer = [], []
            pending = 0

        self.v_id += 1
        self.counter += 1

    # the loop leaves v_id one past the last loaded variant
    self.v_id -= 1
    # write whatever is left in the buffers
    database.insert_variation(self.c, self.var_buffer)
    database.insert_variation_impacts(self.c, self.var_impacts_buffer)
    report()
def populate_from_vcf(self):
    """
    Batch-load the records of ``self.vcf_reader`` into the database and
    save the records carrying extra annotation fields to a side VCF.

    A record with several ALT alleles triggers
    ``self._multiple_alts_message()`` as long as ``self.seen_multi`` is
    false.  With ``self.args.passonly`` set, records whose FILTER is
    neither ``None`` nor ``"."`` are counted in ``self.skipped`` and not
    loaded.

    ``self._prepare_variation`` yields the variant row, its impact rows and
    a mapping of extra fields.  When that mapping is non-empty it is merged
    into the record's INFO and the record is written through
    ``self.extra_vcf_writer`` (obtained from
    ``gemini_annotate.get_extra_vcf``).  Rows are inserted with
    ``database.insert_variation`` / ``database.insert_variation_impacts``
    every ``self.buffer_size`` variants and once more at the end.

    On success the side VCF is deleted if no extra field was seen;
    otherwise the field names are written, one per line in sorted order, to
    ``<side vcf>.fields``.  The writer's stream is closed even when loading
    raises, so the handle is not leaked; in that case the side VCF is left
    on disk.
    """
    import gemini_annotate as ga

    extra_vcf_fields = set()
    self.v_id = self._get_vid()
    self.counter = 0
    self.var_buffer = []
    self.var_impacts_buffer = []
    self.skipped = 0
    # we save the vcf in this chunk for extra annotations.
    self.extra_vcf_writer = ga.get_extra_vcf(self.args.db, self.vcf_reader,
                                             tempdir=self.args.tempdir)
    try:
        # process and load each variant in the VCF file
        for var in self.vcf_reader:
            if len(var.ALT) > 1 and not self.seen_multi:
                self._multiple_alts_message()
            if self.args.passonly and (var.FILTER is not None
                                       and var.FILTER != "."):
                self.skipped += 1
                continue

            (variant, variant_impacts,
             extra_fields) = self._prepare_variation(var)
            if extra_fields:
                var.INFO.update(extra_fields)
                self.extra_vcf_writer.write_record(var)
                extra_vcf_fields.update(extra_fields.keys())

            # add the core variant info to the variant buffer
            self.var_buffer.append(variant)
            # add each of the impacts for this variant
            # (1 per gene/transcript)
            self.var_impacts_buffer.extend(variant_impacts)

            # buffer full - time to insert into DB
            if len(self.var_buffer) >= self.buffer_size:
                sys.stderr.write("pid " + str(os.getpid()) + ": "
                                 + str(self.counter)
                                 + " variants processed.\n")
                database.insert_variation(self.c, self.var_buffer)
                database.insert_variation_impacts(self.c,
                                                  self.var_impacts_buffer)
                # reset for the next batch
                self.var_buffer = []
                self.var_impacts_buffer = []
            self.v_id += 1
            self.counter += 1

        # final load to the database
        self.v_id -= 1
        database.insert_variation(self.c, self.var_buffer)
        database.insert_variation_impacts(self.c, self.var_impacts_buffer)
        sys.stderr.write("pid " + str(os.getpid()) + ": "
                         + str(self.counter) + " variants processed.\n")
        if self.args.passonly:
            sys.stderr.write("pid " + str(os.getpid()) + ": "
                             + str(self.skipped)
                             + " skipped due to having the "
                             "FILTER field set.\n")
    finally:
        # release the file handle whether or not loading succeeded
        self.extra_vcf_writer.stream.close()

    extra_vcf_path = self.extra_vcf_writer.stream.name
    if not extra_vcf_fields:
        os.unlink(extra_vcf_path)
    else:
        with open(extra_vcf_path + ".fields", "w") as o:
            # sorted so the file content does not depend on set ordering
            o.write("\n".join(sorted(extra_vcf_fields)))
def populate_from_vcf(self):
    """
    Load the usable records of ``self.vcf_reader`` into the database.

    Before reading records, the annotation sub-field names are parsed from
    the header ``Description`` text that follows the first ":":
      * ``self.args.anno_type`` in ("snpEff", "all"): the ANN header if
        present, otherwise the EFF header; a notice is printed when
        neither exists;
      * ``self.args.anno_type`` in ("VEP", "all"): the CSQ header if
        present.
    The resulting ``{header id: [field names]}`` mapping is passed to
    ``self._prepare_variation`` together with every record.

    Records with no ALT allele are dropped without being counted.  A record
    with several ALT alleles triggers ``self._multiple_alts_message()`` as
    long as ``self.seen_multi`` is false.  With ``self.args.passonly`` set,
    records whose FILTER is neither ``None`` nor ``"."`` are counted in
    ``self.skipped`` and not loaded.

    The extra fields returned by ``self._prepare_variation`` are merged
    into the variant and into each of its impacts.  Buffers are written
    with ``database.insert_variation`` /
    ``database.insert_variation_impacts`` every ``self.buffer_size``
    variants; after the loop the remainder is written only if the variant
    buffer is non-empty.
    """
    reader = self.vcf_reader

    def _paren_pipe_fields(header_id):
        # EFF and CSQ share one layout: after the first ":" the sub-field
        # names are separated by "|" or "(" and may carry brackets/quotes.
        layout = reader[header_id]["Description"].split(":", 1)[1].strip()
        return [name.strip(" [])'(\"")
                for name in re.split(r"\||\(", layout)]

    self.v_id = self._get_vid()
    self.counter = 0
    self.var_buffer = []
    self.var_impacts_buffer = []
    self.skipped = 0
    # need to keep the objects in memory since we just borrow it in python.
    obj_buffer = []

    anno_keys = {}
    if self.args.anno_type in ("snpEff", "all"):
        if "ANN" in reader:
            desc = reader["ANN"]["Description"]
            layout = desc.split(":", 1)[1].strip('" ')
            anno_keys["ANN"] = [name.strip("\"'")
                                for name in re.split(r"\s*\|\s*", layout)]
        elif "EFF" in reader:
            anno_keys["EFF"] = _paren_pipe_fields("EFF")
        else:
            # parenthesised form prints the same text on Python 2 and 3
            print("snpEff header not found")
    if self.args.anno_type in ("VEP", "all"):
        if "CSQ" in reader:
            anno_keys["CSQ"] = _paren_pipe_fields("CSQ")

    # process and load each variant in the VCF file
    for var in reader:
        if not var.ALT:
            continue
        if len(var.ALT) > 1 and not self.seen_multi:
            self._multiple_alts_message()
        if self.args.passonly and (var.FILTER is not None
                                   and var.FILTER != "."):
            self.skipped += 1
            continue

        (variant, variant_impacts,
         extra_fields) = self._prepare_variation(var, anno_keys)
        variant.update(extra_fields)
        for impact in variant_impacts:
            impact.update(extra_fields)
        obj_buffer.append(var)

        # add the core variant info to the variant buffer
        self.var_buffer.append(variant)
        # add each of the impacts for this variant (1 per gene/transcript)
        self.var_impacts_buffer.extend(variant_impacts)

        # buffer full - time to insert into DB
        if len(self.var_buffer) >= self.buffer_size:
            database.insert_variation(self.c, self.metadata,
                                      self.var_buffer)
            sys.stderr.write("pid " + str(os.getpid()) + ": "
                             + str(self.counter)
                             + " variants processed.\n")
            database.insert_variation_impacts(self.c, self.metadata,
                                              self.var_impacts_buffer)
            # reset for the next batch
            obj_buffer = []
            self.var_buffer = []
            self.var_impacts_buffer = []
        self.v_id += 1
        self.counter += 1

    # final load to the database
    self.v_id -= 1
    if self.var_buffer:
        database.insert_variation(self.c, self.metadata, self.var_buffer)
        database.insert_variation_impacts(self.c, self.metadata,
                                          self.var_impacts_buffer)
    sys.stderr.write("pid " + str(os.getpid()) + ": " + str(self.counter)
                     + " variants processed.\n")
    if self.args.passonly:
        sys.stderr.write("pid " + str(os.getpid()) + ": "
                         + str(self.skipped)
                         + " skipped due to having the "
                         "FILTER field set.\n")