def create_vcf_for_annotation_all_samples(out_dir):
    """
    Write a VCF listing every hotspot variant that is not annotated yet.

    The variants come from ``hotspot_mongo.get_hotspot_vars()``; a variant is
    written only when ``hotspot_mongo.has_annotation`` returns a false value
    for it.  The file has a single sample column named ``ALL`` and every
    genotype is the missing call ``./.``.

    :param out_dir: directory in which ``all_samples.vcf`` is created
    :return: path of the VCF file that was written
    """
    out_vcf = '%s/%s.vcf' % (out_dir, 'all_samples')
    vcf_header = "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tALL\n"
    hotspot_vars = hotspot_mongo.get_hotspot_vars()
    with open(out_vcf, "w") as out_file:
        out_file.write(vcf_header)
        client, db = mongo.get_connection()
        try:
            for var in hotspot_vars:
                chrom, pos, ref, alt = var['CHROM'], var['POS'], var['REF'], var['ALT']
                if hotspot_mongo.has_annotation(chrom, pos, ref, alt, db):
                    continue
                # ID, QUAL, FILTER and INFO are left as '.'; ALT alleles are
                # comma-joined and the genotype is always the missing call.
                fields = [chrom, pos, '.', ref, ",".join(alt),
                          '.', '.', '.', 'GT', './.']
                out_file.write("\t".join([str(val) for val in fields]) + "\n")
        finally:
            # Release the connection even if a lookup or write fails.
            client.close()
    return out_vcf
def reconcile_hotspot_and_database(self, hotspot_file):
    """
    Reconcile every record of a hotspot VCF that the database does not know.

    Each record is looked up with ``hotspot_mongo.is_hotspot``; records for
    which that returns a false value are passed to ``self.__reconcile``.
    ``self.project_config`` is refreshed from ``config_mongo`` first.

    :param hotspot_file: path of the hotspot VCF file to read
    :return: None
    :raises ValueError: if a contig name is not numeric once the characters
        ``c``/``h``/``r`` are stripped (e.g. ``chrX``)
    """
    self.project_config = config_mongo.get_project_config()
    client, db = mongo.get_connection()
    try:
        # Keep the handle in a ``with`` block so it is closed once the reader
        # has been consumed (it was previously left open).
        with open(hotspot_file, 'r') as vcf_handle:
            for rec in vcf.Reader(vcf_handle):
                # str.strip("chr") removes the characters c, h and r from
                # both ends, which turns "chr7" into "7".
                chrom = int(rec.CHROM.strip("chr"))
                pos = int(rec.POS)
                ref = rec.REF
                alt = [str(allele) for allele in rec.ALT]
                if not hotspot_mongo.is_hotspot(chrom, pos, ref, alt, db):
                    self.__reconcile(chrom, pos, ref, alt, db)
    finally:
        client.close()
def sample_variants_csv(self, sample, type):
    """
    Export the variants of one sample to a CSV file.

    If ``sampleinfo_mongo.is_sample`` or ``variants_mongo.is_sample_loaded``
    returns a false value, the condition is logged and nothing is written.
    Otherwise one fully quoted row per variant is written to
    ``<self.output_files_dir>/<sample>.csv``, combining the sample's call
    with the first annotation and the statistics stored on the matching
    hotspot document.

    :param sample: sample identifier, also used as the CSV file name
    :param type: variant type forwarded to ``variants_mongo`` (the name
        shadows the builtin but is part of the existing signature)
    :return: path of the CSV file, or ``None`` when the sample is unknown
        or not loaded
    """
    if not sampleinfo_mongo.is_sample(sample) or not variants_mongo.is_sample_loaded(sample, type):
        self.__log_sample_doesnt_exist()
        return

    out_path = "%s/%s.csv" % (self.output_files_dir, sample)
    # Single-argument parenthesised print behaves the same on Python 2 and 3.
    print(out_path)

    header = ['CHROM', 'POS', 'REF', 'ALT', 'GT', 'RSID', 'Gene', 'ExonicFunc',
              'AAChange', 'FREQ', 'QC_Final', 'QC_Cov', 'QC_AF', 'In_Hotspot']

    # The ``with`` block flushes and closes the file before the path is
    # handed back to the caller.
    with open(out_path, "w") as out_file:
        csv_writer = csv.writer(out_file, delimiter=',', quotechar='"',
                                quoting=csv.QUOTE_NONNUMERIC)
        csv_writer.writerow(header)

        client, db = mongo.get_connection()
        try:
            total_loaded_samples = variants_mongo.count_samples()
            for var in variants_mongo.get_sample_vars(sample, type, db):
                chrom, pos, ref, alt = var['CHROM'], var['POS'], var['REF'], var['ALT']
                al1, al2 = genotypetools.get_genotype_alleles(ref, alt, var['GT_calc'])

                hotspot = hotspot_mongo.get_variant(chrom, pos, ref, alt, db)
                annot = hotspot['ANNOTATION'][0]

                # Keep only the text after the first 'p.' up to the next comma.
                aa_change = annot['AAChange_refGene']
                if 'p.' in aa_change:
                    aa_change = aa_change.split('p.')[1].split(",")[0]

                # Fraction of loaded samples counted as het, het-alt or hom.
                zygosity = hotspot['orig_stats']['zygosity']
                carriers = sum([zygosity['het_count'],
                                zygosity['het_alt_count'],
                                zygosity['hom_count']])
                freq = carriers / float(total_loaded_samples)

                if hotspot['orig_stats']['qc']['final_qc_count'] > 0:
                    in_hotspot = "TRUE"
                else:
                    in_hotspot = "FALSE"

                new_variant = {
                    'CHROM': chrom,
                    'POS': pos,
                    'REF': ref,
                    'ALT': ",".join(alt),
                    'GT': "/".join([al1, al2]),
                    'RSID': annot['snp137NonFlagged'],
                    'Gene': annot['Gene_refGene'],
                    'ExonicFunc': annot['ExonicFunc_refGene'],
                    'AAChange': aa_change,
                    'FREQ': freq,
                    'QC_Final': var['FINAL_QC'],
                    'QC_Cov': var['COV_QC'],
                    'QC_AF': var['AF_QC'],
                    'In_Hotspot': in_hotspot,
                }
                csv_writer.writerow([str(new_variant[field]) for field in header])
        finally:
            # The connection was previously never closed in this method.
            client.close()
    return out_path
def __load_sample_variants(self, sample, vcf_file):
    """
    Load every record of one sample's VCF file into the database.

    Each record is converted with ``self.get_variant_doc`` and stored through
    ``variants_mongo.add_variant``.  Depending on ``self.variant_type`` the
    document is additionally stored with ``hotspot_mongo.add_variant``
    (``'orig'``) or ``hotspot_mongo.add_hotspot_variant`` (``'hotspot'``).

    :param sample: sample identifier the VCF belongs to
    :param vcf_file: path of the VCF file to read
    :return: None
    """
    self.__log_loading_new_sample(sample, vcf_file)
    # The handle lives in a ``with`` block so it is closed after loading
    # (it was previously left open).
    with open(vcf_file, 'r') as vcf_handle:
        vcf_reader = vcf.Reader(vcf_handle)
        client, db = mongo.get_connection()
        try:
            for record in vcf_reader:
                variant_doc = self.get_variant_doc(record, sample, vcf_file)
                variants_mongo.add_variant(variant_doc, db)
                if self.variant_type == 'orig':
                    hotspot_mongo.add_variant(variant_doc, db)
                elif self.variant_type == 'hotspot':
                    hotspot_mongo.add_hotspot_variant(variant_doc, db)
        finally:
            client.close()
def add_sample_info(self, sample_info_file):
    """
    Register the samples listed in a whitespace-delimited table.

    The first line supplies the column names.  Every following non-blank
    line becomes one document whose keys are those column names, plus a
    ``PROJECT`` key taken from ``self.project_config['project_name']``, and
    is stored with ``sampleinfo_mongo.add_new_sample``.

    :param sample_info_file: path of the sample information file
    :return: None
    :raises SystemExit: with status 1 when the path is not a file
    :raises IndexError: when a line has more fields than the header
    """
    if not os.path.isfile(sample_info_file):
        self.__log_invalid_file(sample_info_file)
        sys.exit(1)

    client, db = mongo.get_connection()
    try:
        with open(sample_info_file, 'r') as infile:
            header = infile.readline().strip().split()
            for line in infile:
                # Split once per line instead of once per column.
                fields = line.strip().split()
                if not fields:
                    # A blank line would otherwise be stored as a sample
                    # that carries nothing but the PROJECT key.
                    continue
                new_sample = {header[i]: value for i, value in enumerate(fields)}
                new_sample["PROJECT"] = self.project_config['project_name']
                sampleinfo_mongo.add_new_sample(new_sample, db)
    finally:
        client.close()
def create_vcf_files(self):
    """
    Run ``vcftools.create_vcf_gt_orig_no_qc`` for every sample in batches.

    Samples come from ``sampleinfo_mongo.get_samples()``.  One worker process
    is started per sample; at most ten run at a time, and each full batch is
    joined before the next one starts.  The final, possibly partial, batch is
    joined as well, so the method only returns (and the connection is only
    closed) once every worker has finished.

    :return: None
    """
    num_processors = 10
    samples = sampleinfo_mongo.get_samples()
    client, db = mongo.get_connection()
    # NOTE(review): ``db`` is handed to child processes.  PyMongo documents
    # MongoClient as not fork-safe; confirm the worker opens its own
    # connection or does not rely on this handle.
    jobs = []
    try:
        try:
            for sample in samples:
                worker = Process(target=vcftools.create_vcf_gt_orig_no_qc,
                                 args=(sample, self.vtools_dir, db))
                worker.start()
                jobs.append(worker)
                if len(jobs) >= num_processors:
                    for job in jobs:
                        job.join()
                    del jobs[:]
        finally:
            # Wait for the trailing batch; it was previously never joined.
            for job in jobs:
                job.join()
    finally:
        client.close()
def save_annotations(self, annovar_vcf):
    """
    Store the annotations found in an ANNOVAR output VCF in the database.

    Leading ``##`` lines are skipped, the next line is taken as the column
    header (with its leading ``#`` removed), and every following non-blank
    line is parsed with ``self.__process_annovar_line`` and stored through
    ``annotate_mongo.save_annotation``.  A file whose first line is already
    the ``#CHROM`` header is handled the same way.

    :param annovar_vcf: path of the annotated VCF file
    :return: None
    """
    self.__log_saving_annotations()
    self.project_config = config_mongo.get_project_config()
    client, db = mongo.get_connection()
    try:
        with open(annovar_vcf, "r") as annov_in:
            line = annov_in.readline()
            # A "#CHROM" line does not start with "##", so this loop is a
            # no-op when the header is the very first line.
            while line.startswith('##'):
                line = annov_in.readline()
            header = line.strip().strip("#").split("\t")
            for line in annov_in:
                if not line.strip():
                    # Skip blank lines (e.g. a trailing newline) instead of
                    # handing them to the row parser.
                    continue
                chrom, pos, ref, alt, annotations = self.__process_annovar_line(line, header)
                annotate_mongo.save_annotation(chrom, pos, ref, alt, annotations, db)
    finally:
        client.close()
def save_annotations(self, annovar_vcf):
    """
    Save the annotations of an ANNOVAR output VCF in the database.

    Leading ``##`` lines are skipped, the next line is used as the column
    header (with its leading ``#`` removed), and every following non-blank
    line is stripped, parsed with ``self.__process_annovar_line`` and stored
    through ``annotate_mongo.save_annotation``.

    :param annovar_vcf: path of the annotated VCF file
    :return: None
    """
    self.__log_saving_annotations()
    self.project_config = config_mongo.get_project_config()
    client, db = mongo.get_connection()
    try:
        with open(annovar_vcf, "r") as annov_in:
            line = annov_in.readline()
            # Skip the "##" meta lines.  A "#CHROM" first line does not match
            # "##", so one code path covers both cases that were previously
            # handled by two duplicated branches.
            while line.startswith('##'):
                line = annov_in.readline()
            header = line.strip().strip("#").split("\t")
            for line in annov_in:
                line = line.strip()
                if line == "":
                    continue
                chrom, pos, ref, alt, annotations = self.__process_annovar_line(line, header)
                annotate_mongo.save_annotation(chrom, pos, ref, alt, annotations, db)
    finally:
        # Close the connection even when parsing or saving fails.
        client.close()
def create_vcf_for_annotation(sample, type, out_dir):
    """
    Write a VCF with the variants of one sample that are not annotated yet.

    Variants come from ``variants_mongo.get_sample_vars(sample, type)``; a
    variant is written only when ``hotspot_mongo.has_annotation`` returns a
    false value for it.  INFO carries ``DP=<READ_DEPTH>`` and the genotype is
    the variant's ``GT_orig``, or ``./.`` when that is ``None``.

    :param sample: sample identifier; used as file name and sample column
    :param type: variant type forwarded to ``variants_mongo`` (the name
        shadows the builtin but is part of the existing signature)
    :param out_dir: directory in which ``<sample>.vcf`` is created
    :return: path of the VCF file that was written
    """
    sample_vars = variants_mongo.get_sample_vars(sample, type)
    out_vcf = '%s/%s.vcf' % (out_dir, sample)
    with open(out_vcf, "w") as out_file:
        vcf_header = "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t%s\n" % sample
        out_file.write(vcf_header)
        client, db = mongo.get_connection()
        try:
            for var in sample_vars:
                chrom, pos, ref, alt = var['CHROM'], var['POS'], var['REF'], var['ALT']
                if hotspot_mongo.has_annotation(chrom, pos, ref, alt, db):
                    continue
                gt = var['GT_orig']
                if gt is None:
                    gt = './.'
                # ID, QUAL and FILTER are left as '.'.
                fields = [chrom, pos, '.', ref, ",".join(alt),
                          '.', '.', 'DP=%s' % var['READ_DEPTH'], 'GT', gt]
                out_file.write("\t".join([str(val) for val in fields]) + "\n")
        finally:
            # Release the connection even if a lookup or write fails.
            client.close()
    return out_vcf
def __parallel_process_vcf_files(self, vcf_files, num_processors):
    """
    Load several VCF files in parallel worker processes.

    ``variants_mongo.drop_variants_index`` and ``hotspot_mongo.index_hotspot``
    are called before loading.  One process running
    ``self.__load_sample_variants`` is started per entry; whenever
    ``num_processors`` workers are outstanding they are all joined before
    more are started.  ``variants_mongo.index_variants`` is called only after
    every worker, including the final partial batch, has finished.

    :param vcf_files: sequence of ``(sample, vcf_file)`` entries; it is
        iterated, not modified
    :param num_processors: maximum number of workers running at once
    :return: None
    """
    client, db = mongo.get_connection()
    try:
        variants_mongo.drop_variants_index(db)
        hotspot_mongo.index_hotspot(db)
        jobs = []
        try:
            for entry in vcf_files:
                sample, vcf_file = entry[0], entry[1]
                worker = Process(target=self.__load_sample_variants,
                                 args=(sample, vcf_file))
                worker.start()
                jobs.append(worker)
                # ">=" keeps the pool bounded even for num_processors < 1.
                if len(jobs) >= num_processors:
                    for job in jobs:
                        job.join()
                    del jobs[:]
        finally:
            # Wait for the trailing batch; previously it was never joined, so
            # the index was rebuilt while loaders were still writing.
            for job in jobs:
                job.join()
        variants_mongo.index_variants(db)
    finally:
        client.close()
def __get_unsaved_hotspot_vcf_files(self):
    """
    Collect the hotspot output VCF files that still have to be loaded.

    Every ``*.vcf`` file in ``<hotspot_dir>/hotspot_output`` is considered,
    where ``hotspot_dir`` comes from ``self.project_config``.  The sample
    name is the file's base name up to the first dot.  A file is queued when
    ``sampleinfo_mongo.is_sample`` is true for the sample and
    ``variants_mongo.is_sample_loaded`` is false for ``self.variant_type``;
    every other file is logged as already loaded.

    :return: list of ``(sample, vcf_file)`` tuples to load
    """
    hotspot_dir = self.project_config['hotspot_dir']
    output_dir = hotspot_dir + "/hotspot_output"
    vcf_files = glob(output_dir + "/*.vcf")
    final_vcf_files = []
    client, db = mongo.get_connection()
    try:
        for vcf_file in vcf_files:
            sample = os.path.basename(vcf_file).split(".")[0]
            if sampleinfo_mongo.is_sample(sample, db) and not \
                    variants_mongo.is_sample_loaded(sample, self.variant_type, db):
                self.__log_adding_hotspot_sample_to_queue(sample, vcf_file)
                final_vcf_files.append((sample, vcf_file))
            else:
                self.__log_hotspot_sample_already_loaded(sample)
    finally:
        # Release the connection even if a lookup fails.
        client.close()
    return final_vcf_files
def load_all(self):
    """
    Load every pending VCF file for ``self.variant_type`` into the database.

    For ``'orig'`` the files come from ``sampleinfo_mongo.get_vcf_files()``;
    all paths are checked before anything is loaded, and samples that
    ``variants_mongo.is_sample_loaded`` reports as loaded are skipped.  For
    ``'hotspot'`` the list comes from
    ``self.__get_unsaved_hotspot_vcf_files()``.  The pending files are then
    loaded with ten worker processes.

    :return: None
    :raises SystemExit: with status 1 when a registered VCF path is not a file
    :raises ValueError: when ``self.variant_type`` is neither ``'orig'`` nor
        ``'hotspot'`` (previously an ``UnboundLocalError`` further down)
    """
    if self.variant_type == 'orig':
        vcf_files = sampleinfo_mongo.get_vcf_files()

        # Check that all VCF paths are valid before starting, and before a
        # connection is opened that sys.exit() would leave unclosed.
        for sample in vcf_files:
            vcf_file = vcf_files[sample]
            if not os.path.isfile(vcf_file):
                self.__log_invalid_vcf_file(vcf_file)
                sys.exit(1)

        pending_vcf_files = []
        client, db = mongo.get_connection()
        try:
            for sample in vcf_files:
                # Single-argument parenthesised print is the same on
                # Python 2 and 3.
                print(sample)
                vcf_file = vcf_files[sample]
                if variants_mongo.is_sample_loaded(sample, self.variant_type, db):
                    self.__log_sample_already_loaded(sample)
                    continue
                self.__log_adding_sample_to_queue(sample, vcf_file)
                pending_vcf_files.append((sample, vcf_file))
        finally:
            client.close()
    elif self.variant_type == 'hotspot':
        pending_vcf_files = self.__get_unsaved_hotspot_vcf_files()
    else:
        raise ValueError("unsupported variant_type: %r" % (self.variant_type,))

    num_processors = 10
    self.__parallel_process_vcf_files(pending_vcf_files, num_processors)
    self.__log_successfully_loaded()