def parse_input_file(TF, connection, input_file, gencode_file, fdr_filter=None, use_variance=None, sample_size=None, only_best_snp=None):
    """Parse a gzipped input table, keep the genes found in gencode, and insert them.

    Steps, in order:
      1. Read `input_file` (gzip) line by line, skipping the first line.
         Each remaining line is whitespace-split; rows are built with
         `row_from_comps` and accumulated into a per-gene dict by
         `process_row`. A comma-separated gene field yields one row per gene.
      2. Run `GencodeFile.parse_gencode_file` with a callback that keeps only
         the genes whose name the callback is invoked with, and splices
         `gencode.ensemble_version` in as the second field of every row.
      3. If `use_variance` is truthy, rescale the fourth field of every kept
         row by sqrt(variance / sample_size) and drop rows whose first field
         (the SNP) has no entry in the loaded variances.
      4. Hand the resulting dict to `Utilities.insert_entries`.

    Args:
        TF: object exposing the column indexes `FDR`, `SNPName` and `HUGO`
            (also forwarded to `row_from_comps`).
        connection: forwarded untouched to `Utilities.insert_entries`.
        input_file: path of the gzipped input table.
        gencode_file: path forwarded to `GencodeFile.parse_gencode_file`.
        fdr_filter: when truthy, lines whose FDR column is greater than this
            value are skipped. A falsy value (None, 0) disables the filter.
        use_variance: when truthy, passed to `VarianceFile.load_variance`
            (presumably a file path -- confirm against that helper).
        sample_size: divisor applied to each variance; must be a non-zero
            number whenever a row is actually rescaled.
        only_best_snp: forwarded untouched to `process_row`.

    Returns:
        None. The outcome is the call to `Utilities.insert_entries`.
    """
    genes = {}

    logging.info("Opening pheno phile")
    with gzip.open(input_file) as handle:
        for i, line in enumerate(handle):
            if i == 0:
                # The first line is never parsed as data.
                continue

            comps = line.strip().split()

            if fdr_filter:
                fdr = comps[TF.FDR]
                if float(fdr) > fdr_filter:
                    snp = comps[TF.SNPName]
                    logging.log(9, "Snp %s doesn't pass fdr filter: %s", snp, fdr)
                    continue

            # split(",") on a comma-free name yields a one-element list, so a
            # single loop covers both the single-gene and multi-gene cases.
            for gene in comps[TF.HUGO].split(","):
                row = row_from_comps(gene, comps, TF)
                process_row(gene, row, genes, only_best_snp)

    logging.info("Opening gencode file")

    class GenCodeCallback(object):
        """Collects, for each gencode entry seen, the matching gene rows."""

        def __init__(self, genes):
            self.genes = genes
            self.selected = {}

        def __call__(self, gencode):
            if gencode.name in self.genes:
                rows = self.genes[gencode.name]
                # Insert the gencode version right after the first field.
                self.selected[gencode.name] = [
                    (row[0], gencode.ensemble_version, row[1], row[2], row[3], row[4])
                    for row in rows
                ]

    callback = GenCodeCallback(genes)
    GencodeFile.parse_gencode_file(gencode_file, callback)
    genes = callback.selected

    if use_variance:
        logging.info("Opening variance file")
        variances = VarianceFile.load_variance(use_variance)
        # Snapshot the keys: values are reassigned while looping.
        for key in list(genes):
            new_rows = []
            for r in genes[key]:
                snp = r[0]
                if snp not in variances:
                    continue
                # float() forces true division; under Python 2 an int variance
                # divided by an int sample size would otherwise be floored.
                std = math.sqrt(variances[snp] / float(sample_size))
                new_rows.append([r[0], r[1], r[2], str(float(r[3]) * std), r[4], r[5]])
            genes[key] = new_rows

    Utilities.insert_entries(connection, genes)
def run(self):
    """Generate one slice per gene inside the configured output folder.

    Reads `output_folder`, `variance_path`, `pb8k_path` and `sample_size`
    from `self.args`. If the output folder already exists nothing is done;
    otherwise it is created, variances are loaded, `gather_pb8k` is run and
    every (gene, rows) pair of its second result is passed to `save_slice`.

    Returns:
        None.
    """
    args = self.args

    if os.path.exists(args.output_folder):
        logging.info("Output folder already exists, delete it if you want it done again")
        return

    os.makedirs(args.output_folder)

    logging.info("Loading variance")
    variances = VarianceFile.load_variance(args.variance_path)

    logging.info("Processing GWAS")
    # gather_pb8k returns a pair; only its second element is used here.
    _, slices = gather_pb8k(args.pb8k_path, variances, args.sample_size)

    logging.info("Writing slices")
    for gene_name, gene_rows in slices.iteritems():
        save_slice(args.output_folder, gene_name, gene_rows)
def __init__(self, variance_file_path, sample_size, TF=TF1):
    """Store the table format and sample size, and load the variances.

    Args:
        variance_file_path: passed to `VarianceFile.load_variance`; the
            result is kept in `self.vars`.
        sample_size: stored as-is in `self.sample_size`.
        TF: table-format object stored in `self.TF` (defaults to `TF1`).
    """
    self.TF = TF
    self.sample_size = sample_size

    logging.info("Opening variance file")
    self.vars = VarianceFile.load_variance(variance_file_path)