def run(args):
    """Count gene/transcript types found in a gencode file and save them.

    Args:
        args: object exposing ``results_file``, ``input_table`` and
            ``gencode_file`` attributes (presumably an argparse namespace --
            confirm against the caller).

    Side effects:
        Terminates the process with status 0 when ``args.results_file``
        already exists. Otherwise creates the parent folder of the results
        file when it is missing and delegates writing to ``save_results``.
    """
    # Function-scope import: `exit()` is injected by the `site` module for
    # interactive sessions and is not guaranteed to exist in every runtime.
    import sys

    class Callback(object):
        """Tallies how often each gene_type / transcript_type is seen."""

        def __init__(self):
            # Both map a type name to {"count": <int>}; the nested dict shape
            # is kept because the callback is handed to save_results().
            self.gene_type = {}
            self.transcript_type = {}

        def __call__(self, gencode):
            gene_entry = self.gene_type.setdefault(
                gencode.gene_type, {"count": 0})
            gene_entry["count"] += 1

            transcript_entry = self.transcript_type.setdefault(
                gencode.transcript_type, {"count": 0})
            transcript_entry["count"] += 1

    if os.path.exists(args.results_file):
        logging.info("Output file already exists.")
        sys.exit(0)

    input_lines = read_input_table(args.input_table)

    callback = Callback()
    GencodeFile.parse_gencode_file(
        args.gencode_file, callback, only_genes=False)

    # An empty folder component means the results file lives in the current
    # directory, so there is nothing to create.
    folder = os.path.split(args.results_file)[0]
    if folder and not os.path.exists(folder):
        os.makedirs(folder)

    save_results(args.results_file, callback, input_lines)
def parse_input_file(db_output_path, pheno_input_path, gencode_input_path, pb8k_callback):
    """Convert a PB8K pheno file into weight-db entries.

    The pheno file is streamed through ``pb8k_callback``; the genes it
    collected (``pb8k_callback.genes``) are then matched by name against the
    gencode file, post-processed with ``fix_row`` and written to the database
    at ``db_output_path``.

    Args:
        db_output_path: path handed to ``Utilities.connect``.
        pheno_input_path: tab-delimited, compressed PB8K pheno file.
        gencode_input_path: gencode annotation file.
        pb8k_callback: callable fed to the CSV iterator; must expose a
            ``genes`` mapping of gene name -> mapping of rows (inner keys are
            not used here).
    """
    logging.info("Opening PB8K pheno file")
    file_iterator = Utilities.CSVFileIterator(
        pheno_input_path, delimiter="\t", header=TF1.HEADER, compressed=True)
    file_iterator.iterate(pb8k_callback)

    logging.info("Opening gencode file")

    def fixed_row(gencode, row):
        # Same row, with the gencode `ensemble_version` placed in the second
        # column.
        F = Utilities.WDBIF
        return (row[F.SNP], gencode.ensemble_version, row[F.GENE_NAME],
                row[F.REFERENCE_ALLELE], row[F.EFFECT_ALLELE], row[F.WEIGHT],
                row[F.N_SNP], row[F.GENE_R2], row[F.GENE_PVALUE],
                row[F.GENE_QVALUE])

    class FixGeneCallback(object):
        """Keeps only genes present in gencode, rewriting their rows."""

        def __init__(self, genes):
            self.genes = genes
            self.selected = {}

        def __call__(self, gencode):
            if gencode.name in self.genes:
                rows = self.genes[gencode.name]
                # .values() instead of .iteritems(): the keys were unused and
                # iteritems() does not exist on Python 3 dicts.
                self.selected[gencode.name] = [
                    fixed_row(gencode, row) for row in rows.values()]

    gencode_callback = FixGeneCallback(pb8k_callback.genes)
    GencodeFile.parse_gencode_file(gencode_input_path, gencode_callback)
    genes = gencode_callback.selected

    logging.info("Fixing rows")
    genes = fix_row(genes)

    logging.info("Saving entries")
    connection = Utilities.connect(db_output_path)
    Utilities.setup_db(connection)
    Utilities.insert_entries(connection, genes)
def parse_input_file(TF, connection, input_file, gencode_file, fdr_filter=None, use_variance=None, sample_size=None, only_best_snp=None):
    """Load a gzipped pheno table, annotate it with gencode and store it.

    Args:
        TF: column-index descriptor; ``FDR``, ``SNPName`` and ``HUGO`` are
            read here, the rest is consumed by ``row_from_comps``.
        connection: database handle passed to ``Utilities.insert_entries``.
        input_file: gzip-compressed, whitespace-separated table whose first
            line is skipped.
        gencode_file: gencode annotation file.
        fdr_filter: when truthy, rows whose FDR exceeds it are skipped.
        use_variance: when truthy, passed to ``VarianceFile.load_variance``;
            the fourth field of each row is then scaled by
            ``sqrt(variance / sample_size)`` and rows whose SNP has no
            variance entry are dropped.
        sample_size: divisor used in the variance scaling.
        only_best_snp: forwarded untouched to ``process_row``.
    """
    genes = {}

    logging.info("Opening pheno phile")
    with gzip.open(input_file) as pheno:
        for line_number, line in enumerate(pheno):
            # The first line is skipped.
            if line_number == 0:
                continue

            comps = line.strip().split()

            if fdr_filter:
                fdr = comps[TF.FDR]
                if float(fdr) > fdr_filter:
                    snp = comps[TF.SNPName]
                    logging.log(9,"Snp %s doesn't pass fdr filter: %s", snp, fdr)
                    continue

            # The HUGO field may hold several comma-separated genes; a value
            # without commas splits into a single-element list, so one loop
            # covers both cases.
            for gene_name in comps[TF.HUGO].split(","):
                row = row_from_comps(gene_name, comps, TF)
                process_row(gene_name, row, genes, only_best_snp)

    logging.info("Opening gencode file")

    class GenCodeCallback(object):
        """Selects genes known to gencode, inserting `ensemble_version`."""

        def __init__(self, genes):
            self.genes = genes
            self.selected = {}

        def __call__(self, gencode):
            name = gencode.name
            if name not in self.genes:
                return
            version = gencode.ensemble_version
            self.selected[name] = [
                (row[0], version, row[1], row[2], row[3], row[4])
                for row in self.genes[name]
            ]

    callback = GenCodeCallback(genes)
    GencodeFile.parse_gencode_file(gencode_file, callback)
    genes = callback.selected

    if use_variance:
        logging.info("Opening variance file")
        variances = VarianceFile.load_variance(use_variance)
        for key in list(genes.keys()):
            scaled_rows = []
            for r in genes[key]:
                snp = r[0]
                if snp in variances:
                    std = math.sqrt(variances[snp]/sample_size)
                    scaled_rows.append(
                        [r[0], r[1], r[2], str(float(r[3])*std), r[4], r[5]])
            genes[key] = scaled_rows

    Utilities.insert_entries(connection, genes)
def run(self):
    """Build predicted gene expression from GTEx data and save it.

    Reads ``self.args.weight_db``, ``self.args.gtex_snp`` and
    ``self.args.gtex_geno``; when ``self.args.gencode_file`` is set, entries
    are re-keyed so only keys known to gencode remain. Does nothing when
    ``self.args.output`` already exists.
    """
    if os.path.exists(self.args.output):
        logging.info("%s already exists. Delete it if you want it done again", self.args.output)
        return

    logging.info("Loading %s", self.args.weight_db)
    weight_db_logic = WeightDBUtilities.WeightDBEntryLogic(self.args.weight_db)
    logging.info("Loaded %s genes", len(weight_db_logic.gene_data_for_gene))

    logging.info("Building snp dict from %s", self.args.gtex_snp)
    snp_dict = GTExSNPFile.build_snp_dict(self.args.gtex_snp, weight_db_logic)
    logging.info("Got %d snps in dictionary", len(snp_dict))

    logging.info("Building gene expression")
    gene_expression, individuals = GTExGenoFile.build_gene_expression(
        self.args.gtex_geno, weight_db_logic, snp_dict)
    logging.info("Loaded %d gene expression", len(gene_expression))

    if self.args.gencode_file:
        logging.info("Translating gene names to ensemble id")
        ensemble_to_name, name_to_ensemble = \
            GencodeFile.ensemble_to_name_relationships(self.args.gencode_file)
        logging.info("Loaded %d (%d) names", len(ensemble_to_name), len(name_to_ensemble))

        # Iterate over a snapshot of the keys: entries are deleted and
        # re-inserted inside the loop, which raises RuntimeError when
        # iterating a live Python 3 key view.
        for k in list(gene_expression.keys()):
            if k in ensemble_to_name:
                # Key is already recognised by gencode; keep as is.
                continue

            expression = gene_expression[k]
            del gene_expression[k]
            if k in name_to_ensemble:
                # Key is a gene name: store the entry under the id that
                # gencode maps it to.
                gene_expression[name_to_ensemble[k]] = expression
            # Otherwise the key is unknown to gencode and stays dropped.

    logging.info("Saving gene expression for %d genes", len(gene_expression))
    save_expression(self.args.output, gene_expression, individuals)
def parse_folder(folder, db, gencode_file):
    """Load TWAS-format weights found under ``folder`` into ``db``.

    Every directory entry of ``folder`` is treated as a gene name. Entries
    that the gencode file does not know by name are skipped (logged at level
    9); for the rest, one row per SNP of the entry's ``.wgt.map`` file is
    built and everything is inserted with ``Utilities.insert_entries``.

    Args:
        folder: directory whose entries are named after genes.
        db: database handle passed to ``Utilities.insert_entries``.
        gencode_file: gencode annotation file.
    """
    contents = os.listdir(folder)

    logging.info("Processing gencode file")

    class GencodeCallback(object):
        """Records `ensemble_version` for genes present in the folder."""

        def __init__(self, contents):
            self.contents = set(contents)
            self.genes = {}

        def __call__(self, gencode):
            name = gencode.name
            if name in self.contents:
                self.genes[name] = gencode.ensemble_version

    callback = GencodeCallback(contents)
    GencodeFile.parse_gencode_file(gencode_file, callback)
    gene_names = callback.genes

    logging.info("processing folder")
    genes = {}
    for content in contents:
        if content not in gene_names:
            logging.log(9, "Gene %s not in gencode", content)
            continue

        sub_path = TWASFormat.build_subpaths(folder, content)
        snps = TWASFormat.load_map(sub_path + ".wgt.map")
        weights = TWASFormat.build_weights(sub_path)
        gene_id = gene_names[content]

        # weights[i] pairs with the i-th SNP of the map file; indexing (not
        # zip) keeps the IndexError when weights are shorter than the map.
        genes[content] = [
            (
                snp[TWASFormat.MTF.snp],
                gene_id,
                content,
                weights[i],
                snp[TWASFormat.MTF.a1],
                snp[TWASFormat.MTF.a2],
            )
            for i, snp in enumerate(snps)
        ]

    Utilities.insert_entries(db, genes)
def parse_input_file(db_output_path, gtex_pheno_input_path, gencode_input_path, gtex_snp_path, gtex_callback):
    """Convert a GTEx eQTL pheno file into weight-db entries.

    Steps, in order:
      1. Stream the pheno file through ``gtex_callback``.
      2. Keep genes whose id appears in the gencode file, filling in the
         gencode ``gene_name``.
      3. Walk the GTEx SNP file and re-key each surviving row by the SNP id
         read from it, taking reference/effect alleles from that file.
      4. Run ``fix_rows`` and insert everything into the database.

    Args:
        db_output_path: path handed to ``Utilities.connect``.
        gtex_pheno_input_path: tab-delimited, compressed GTEx pheno file.
        gencode_input_path: gencode annotation file.
        gtex_snp_path: tab-delimited, compressed GTEx SNP annotation file.
        gtex_callback: callable fed to the CSV iterator; must expose
            ``genes`` (gene id -> {variant id -> row}) and ``pvalues``.
    """
    logging.info("Opening GTEx pheno file %s", os.path.basename(os.path.normpath(gtex_pheno_input_path)))
    file_iterator = Utilities.CSVFileIterator(
        gtex_pheno_input_path, delimiter="\t", header=GTEXEQTLF.HEADER,
        compressed=True)
    file_iterator.iterate(gtex_callback)
    logging.info("%d found at GTEx file", len(gtex_callback.genes))

    logging.info("Opening gencode file")

    def gencode_fixed_row(gencode, row):
        # Same row, with the gencode `gene_name` placed in the third column.
        F = Utilities.WDBIF
        return (row[F.SNP], row[F.GENE], gencode.gene_name,
                row[F.REFERENCE_ALLELE], row[F.EFFECT_ALLELE], row[F.WEIGHT],
                row[F.N_SNP], row[F.GENE_R2], row[F.GENE_PVALUE],
                row[F.GENE_QVALUE])

    class FixGeneCallback(object):
        """Keeps only genes present in gencode, rewriting their rows."""

        def __init__(self, genes):
            self.genes = genes
            self.selected = {}

        def __call__(self, gencode):
            if gencode.gene_id in self.genes:
                rows = self.genes[gencode.gene_id]
                # .items() instead of .iteritems() so this also runs on
                # Python 3.
                self.selected[gencode.gene_id] = {
                    k: gencode_fixed_row(gencode, row)
                    for k, row in rows.items()}

    gencode_callback = FixGeneCallback(gtex_callback.genes)
    GencodeFile.parse_gencode_file(gencode_input_path, gencode_callback)
    genes = gencode_callback.selected
    logging.info("%d survived after gencode file", len(genes))

    # Keep the p-values, then drop the local references to both callbacks
    # before the SNP file is read.
    pvalues = gtex_callback.pvalues
    del gencode_callback
    del gtex_callback

    logging.info("Opening GTEX Snp file")

    def snp_fixed_row(row, rsid, ref_allele, eff_allele):
        # Same row, with SNP id and alleles replaced by the given values.
        F = Utilities.WDBIF
        return (rsid, row[F.GENE], row[F.GENE_NAME], ref_allele, eff_allele,
                row[F.WEIGHT], row[F.N_SNP], row[F.GENE_R2],
                row[F.GENE_PVALUE], row[F.GENE_QVALUE])

    class FixSNPCallback(object):
        """Re-keys rows by the SNP id read from the SNP file."""

        def __init__(self, genes):
            self.genes = genes
            self.selected = {}
            # NOTE(review): a variant listed under several genes ends up
            # mapped only to the last gene iterated, so the other genes lose
            # that SNP. Behaviour kept as-is; confirm whether it is intended.
            self.variant_to_gene = {}
            for gene, rows in self.genes.items():
                for variant in rows:
                    self.variant_to_gene[variant] = gene

        def __call__(self, i, comps):
            # The first line is the header row.
            if i == 0:
                return

            F = GTExSNPFile.SNPTF
            variant = comps[F.VariantID]
            ref = comps[F.Ref_b37]
            eff = comps[F.Alt]
            snp = comps[F.RS_ID_dbSNP142_CHG37p13]

            if variant in self.variant_to_gene:
                gene = self.variant_to_gene[variant]
                row = self.genes[gene][variant]
                self.selected.setdefault(gene, {})[snp] = \
                    snp_fixed_row(row, snp, ref, eff)

    snp_callback = FixSNPCallback(genes)
    # No `header=GTExSNPFile.SNPTF.HEADER` here: the header in the SNP file
    # is just wrong, so it is not passed to the iterator.
    snp_iterator = Utilities.CSVFileIterator(
        gtex_snp_path, delimiter="\t", compressed=True)
    snp_iterator.iterate(snp_callback)

    genes = snp_callback.selected
    logging.info("%d survived after snp file", len(genes))
    del snp_callback

    logging.info("Fixing rows")
    fix_rows(genes, pvalues)

    logging.info("Saving entries")
    connection = Utilities.connect(db_output_path)
    Utilities.setup_db(connection)
    Utilities.insert_entries(connection, genes)