Exemplo n.º 1
0
def parse_input_file(TF, connection, input_file, gencode_file, fdr_filter=None, use_variance=None, sample_size=None, only_best_snp=None):
    """Collect per-gene rows from a gzipped table and hand them to the database layer.

    The first line of ``input_file`` is skipped; every other line is split on
    whitespace. ``TF`` supplies the column indices read here (``FDR``,
    ``SNPName``, ``HUGO``) and is forwarded to ``row_from_comps``. Only genes
    that ``GencodeFile.parse_gencode_file`` reports by name are kept.

    Args:
        TF: Table-format descriptor providing column indices.
        connection: Passed through to ``Utilities.insert_entries``.
        input_file: Path of the gzip-compressed input table.
        gencode_file: Path handed to ``GencodeFile.parse_gencode_file``.
        fdr_filter: When truthy, lines whose FDR column is greater than this
            value are skipped.
        use_variance: When truthy, passed to ``VarianceFile.load_variance``.
            Rows whose first element is missing from the loaded variances are
            dropped; for the rest, the element at index 3 is multiplied by
            ``sqrt(variance / sample_size)`` and stored as a string.
        sample_size: Divisor for each variance; only read when
            ``use_variance`` is truthy.
        only_best_snp: Forwarded unchanged to ``process_row``.
    """
    rows_by_gene = {}
    logging.info("Opening pheno phile")
    with gzip.open(input_file) as handle:
        lines = iter(handle)
        next(lines, None)  # skip the first line of the file
        for raw_line in lines:
            fields = raw_line.strip().split()

            if fdr_filter and float(fields[TF.FDR]) > fdr_filter:
                logging.log(9, "Snp %s doesn't pass fdr filter: %s", fields[TF.SNPName], fields[TF.FDR])
                continue

            # A comma-separated gene field produces one row per listed gene;
            # a field without commas splits into just itself.
            for gene_name in fields[TF.HUGO].split(","):
                gene_row = row_from_comps(gene_name, fields, TF)
                process_row(gene_name, gene_row, rows_by_gene, only_best_snp)

    logging.info("Opening gencode file")

    selected = {}

    def keep_known_gene(gencode):
        # Retain only genes seen in the input table, inserting the gencode
        # version as the second element of each row.
        name = gencode.name
        if name not in rows_by_gene:
            return
        selected[name] = [
            (row[0], gencode.ensemble_version, row[1], row[2], row[3], row[4])
            for row in rows_by_gene[name]
        ]

    GencodeFile.parse_gencode_file(gencode_file, keep_known_gene)

    if use_variance:
        logging.info("Opening variance file")
        variances = VarianceFile.load_variance(use_variance)
        for name in list(selected):
            rescaled = []
            for entry in selected[name]:
                snp = entry[0]
                if snp in variances:
                    std = math.sqrt(variances[snp]/sample_size)
                    rescaled.append([entry[0], entry[1], entry[2], str(float(entry[3])*std), entry[4], entry[5]])
            selected[name] = rescaled

    Utilities.insert_entries(connection, selected)
Exemplo n.º 2
0
    def run(self):
        """Generate the slices for ``self.args.output_folder``.

        If the output folder already exists, a message is logged and nothing
        else happens. Otherwise the folder is created, variances are loaded
        from ``self.args.variance_path``, ``gather_pb8k`` is run on
        ``self.args.pb8k_path`` and ``save_slice`` is called once for every
        ``(gene, rows)`` pair in its results.
        """
        output_folder = self.args.output_folder
        if os.path.exists(output_folder):
            logging.info("Output folder already exists, delete it if you want it done again")
            return

        os.makedirs(output_folder)

        logging.info("Loading variance")
        variances = VarianceFile.load_variance(self.args.variance_path)

        logging.info("Processing GWAS")
        # Only the second element of the returned pair is used here.
        _, results = gather_pb8k(self.args.pb8k_path, variances, self.args.sample_size)

        logging.info("Writing slices")
        # items() is available on both Python 2 and Python 3 mappings, whereas
        # iteritems() exists only on Python 2 (assumes results is dict-like).
        for gene, rows in results.items():
            save_slice(output_folder, gene, rows)
Exemplo n.º 3
0
 def __init__(self, variance_file_path, sample_size, TF=TF1):
     """Remember the table format and sample size, and load the variances.

     Args:
         variance_file_path: Passed to ``VarianceFile.load_variance``; the
             result is stored on ``self.vars``.
         sample_size: Stored unchanged on ``self.sample_size``.
         TF: Table-format descriptor stored on ``self.TF``; defaults to ``TF1``.
     """
     self.TF = TF
     logging.info("Opening variance file")
     loaded_variances = VarianceFile.load_variance(variance_file_path)
     self.vars, self.sample_size = loaded_variances, sample_size