def testWeightDBEntryLogic(self):
        """Check the indexes WeightDBEntryLogic builds from tests/_td/test.db.

        Verifies ``weights_by_gene`` and ``gene_data_for_gene`` against the
        fixtures returned by ``expected_weights_results()`` and
        ``expected_extra_results()``, then checks ``genes_for_an_rsid``.
        """
        weight_db_entry_logic = WeightDBUtilities.WeightDBEntryLogic(
            "tests/_td/test.db")

        expected_weights = expected_weights_results()
        expected_extra = expected_extra_results()

        # Both per-gene indexes must have as many entries as the extra fixture.
        self.assertEqual(len(weight_db_entry_logic.weights_by_gene),
                         len(expected_extra))
        self.assertEqual(len(weight_db_entry_logic.gene_data_for_gene),
                         len(expected_extra))

        for e in expected_extra:
            # assertIn reports the missing gene on failure, unlike
            # assertTrue(x in y), which only says "False is not true".
            self.assertIn(e.gene, weight_db_entry_logic.weights_by_gene)
            self.assertIn(e.gene, weight_db_entry_logic.gene_data_for_gene)

            actual_gene_data = weight_db_entry_logic.gene_data_for_gene[e.gene]
            self.assertExtra([actual_gene_data], [e])

            # Only the weight entries matter here; the keys are not compared.
            actual_weights = list(
                weight_db_entry_logic.weights_by_gene[e.gene].values())
            e_w = [w for w in expected_weights if w.gene == e.gene]
            self.assertWeights(actual_weights, e_w)

        self.assertEqual(len(weight_db_entry_logic.genes_for_an_rsid), 6)
        for rsid, genes in weight_db_entry_logic.genes_for_an_rsid.items():
            expected = [w.gene for w in expected_weights if w.rsid == rsid]
            self.assertEqual(expected, genes)
Exemplo n.º 2
0
    def run(self):
        """Build betas for every GWAS file found in ``self.gwas_folder``.

        The weight model is loaded only when ``self.args.weight_db_path`` is
        truthy; otherwise ``None`` is passed to ``buildBetas``.

        Raises:
            Exceptions.ReportableException: when no file in the GWAS folder
                matches ``self.gwas_regexp``.
        """
        weight_db_logic = None
        if self.args.weight_db_path:
            logging.info("Loading weight model")
            # NOTE(review): the guard reads self.args.weight_db_path while the
            # load uses self.weight_db_path - presumably the same value; confirm.
            weight_db_logic = WeightDBUtilities.WeightDBEntryLogic(
                self.weight_db_path)

        gwas_files = Utilities.contentsWithRegexpFromFolder(
            self.gwas_folder, self.gwas_regexp)

        # The output folder is created before the emptiness check below, so it
        # exists even when this method ends up raising.
        output_exists = os.path.exists(self.output_folder)
        if not output_exists:
            os.makedirs(self.output_folder)

        if not len(gwas_files):
            message = "No GWAS files found on %s with pattern %s" % (
                self.gwas_folder,
                self.gwas_regexp.pattern,
            )
            raise Exceptions.ReportableException(message)

        for gwas_file in gwas_files:
            try:
                self.buildBetas(weight_db_logic, gwas_file)
            except Exceptions.BadFilename as e:
                # Unrelated files may live in the folder; log and move on.
                logging.info("Wrong file name: %s, skipping", e.msg)
Exemplo n.º 3
0
    def run(self):
        """Compute results from the covariance file and save them.

        Creates the parent folder of ``self.output_file`` when needed. If the
        output file already exists, logs a hint and returns without computing
        anything.
        """
        parent_folder, _ = os.path.split(self.output_file)
        if parent_folder and not os.path.exists(parent_folder):
            os.makedirs(parent_folder)

        if os.path.exists(self.output_file):
            logging.info(
                "Results path %s already exists, delete it if you want it to be calculated again",
                self.output_file)
            return

        # NOTE(review): people_by_id is built here but not used later in this
        # method; the load is kept so its side effects are unchanged.
        people_by_id = None
        if os.path.exists(self.selected_dosage_folder):
            logging.info("Loading people")
            samples_path = Utilities.samplesInputPath(
                self.selected_dosage_folder)
            if samples_path is not None:
                people_by_id = {}
                for person in Person.Person.loadPeople(samples_path):
                    people_by_id[person.id] = person

        logging.info("Loading weights from database: %s",
                     self.weight_db_path)
        weight_db_logic = WeightDBUtilities.WeightDBEntryLogic(
            self.weight_db_path)

        # The normalization half of the pair is currently ignored.
        results, _normalization = self.resultsFromCovarianceFile(
            weight_db_logic)

        self.saveEntries(self.output_file, results)

        logging.info("Successfully ran MetaXcan analysis")
Exemplo n.º 4
0
    def run(self):
        """Load the weight database and build the requested output files.

        Returns early, after logging a hint, when neither
        ``self.correlation_output`` nor ``self.covariance_output`` is set.
        """
        output_requested = self.correlation_output or self.covariance_output
        if not output_requested:
            logging.info("Provide --correlation_output or --covariance_output or both")
            return

        logging.info("Loading Weights")
        self.buildFiles(WeightDBUtilities.WeightDBEntryLogic(self.db_path))
Exemplo n.º 5
0
    def run(self):
        """Load the weight database, build the output files, and log the time.

        Returns early, after logging a hint, when neither
        ``self.correlation_output`` nor ``self.covariance_output`` is set.
        """
        started_at = timer()

        output_requested = self.correlation_output or self.covariance_output
        if not output_requested:
            logging.info(
                "Provide --correlation_output or --covariance_output or both")
            return

        logging.info("Loading Weights")
        weight_db_logic = WeightDBUtilities.WeightDBEntryLogic(self.db_path)

        logging.info("Building files")
        self.buildFiles(weight_db_logic)

        # Elapsed time is measured from method entry, so it includes loading.
        elapsed = timer() - started_at
        logging.info("Ran successfully in %s seconds", str(elapsed))
Exemplo n.º 6
0
    def run(self):
        """Build the variance DB from the contents of the phase data folder.

        Does nothing, beyond logging a hint, when ``self.output_file`` already
        exists. Otherwise the optional weight database is loaded (only when
        ``self.weight_db`` is truthy), the parent folder of the output file is
        created if needed, and ``buildVarianceDB`` is called for each entry
        returned by ``Utilities.contentsWithPatternsFromFolder`` for the
        pattern list ``["gz"]``.
        """
        # Check for an existing result first so a re-run does not pay for
        # loading the weight database and listing the data folder.
        if os.path.exists(self.output_file):
            logging.info(
                "Variance output already exists, delete it if you want stuff to be figured out again"
            )
            return

        weight_db_logic = WeightDBUtilities.WeightDBEntryLogic(
            self.weight_db) if self.weight_db else None
        contents = Utilities.contentsWithPatternsFromFolder(
            self.data_folder_phase, ["gz"])

        # dirname is "" for a bare file name; os.makedirs("") would raise, so
        # only create a folder when there is one to create.
        output_dir = os.path.dirname(self.output_file)
        if output_dir and not os.path.exists(output_dir):
            os.makedirs(output_dir)

        for content in contents:
            self.buildVarianceDB(weight_db_logic, content)
Exemplo n.º 7
0
    def run(self):
        """Load the weight model and build betas for each matching GWAS file.

        Files for which ``buildBetas`` raises ``Exceptions.BadFilename`` are
        logged and skipped.
        """
        logging.info("Loading weight model")
        weight_db_logic = WeightDBUtilities.WeightDBEntryLogic(
            self.weight_db_path)

        gwas_files = Utilities.contentsWithRegexpFromFolder(
            self.gwas_folder, self.gwas_regexp)

        output_exists = os.path.exists(self.output_folder)
        if not output_exists:
            os.makedirs(self.output_folder)

        for gwas_file in gwas_files:
            try:
                self.buildBetas(weight_db_logic, gwas_file)
            except Exceptions.BadFilename as e:
                # Unrelated files may live in the folder; log and move on.
                logging.info("Wrong file name: %s, skipping", e.msg)
Exemplo n.º 8
0
    def testPrediXcanLoader(self):
        """Load the chr1 dosage fixture and check the first six SNPs."""
        weight_db = WeightDBUtilities.WeightDBEntryLogic("tests/_td/test.db")
        loader = PrediXcanFormatUtilities.PrediXcanFormatDosageLoader(
            "tests/_td/filtered_dosage/chr1.dosage.gz", weight_db)
        snps, snps_by_rsid = loader.load()

        self.assertEqual(sorted(snps_by_rsid.keys()),
                         ["rs1", "rs2", "rs3", "rs4", "rs5", "rs6"])

        # One row per SNP, in load order:
        # (ref_allele, eff_allele, position, data)
        expected_snps = [
            ("T", "C", 1, [0, 0, 0, 2]),
            ("A", "G", 2, [1, 1, 1, 1]),
            ("G", "A", 3, [0, 1, 0, 1]),
            ("T", "C", 4, [0, 0, 0, 0]),
            ("C", "T", 5, [0, 0, 1, 1]),
            ("C", "T", 6, [0, 0, 0, 2]),
        ]
        # Index explicitly (rather than zip) so a short snps list still fails.
        for index, expected_row in enumerate(expected_snps):
            ref_allele, eff_allele, position, data = expected_row
            snp = snps[index]
            self.assertEqual(snp.ref_allele, ref_allele)
            self.assertEqual(snp.eff_allele, eff_allele)
            self.assertEqual(snp.position, position)
            self.assertEqual(snp.data, data)
Exemplo n.º 9
0
    def buildBetas(self, db_filename):
        """Compute betas, then z-scores, for a single weight database.

        Fills in ``self.args.weight_db_path``, ``self.args.covariance``,
        ``self.args.output_folder`` and ``self.args.output_file`` for this
        database, runs ``M03_betas.GetBetas.buildBetas`` on every GWAS file
        found, and finally runs ``M04_zscores.CalculateZScores``.

        Args:
            db_filename: path to the weight database file.

        Raises:
            Exceptions.ReportableException: when no GWAS file is found, since
                the report name is derived from the first GWAS file.
        """
        filebase = os.path.basename(db_filename).replace(".db", "")
        output_folder = os.path.abspath(self.args.output_directory)

        logging.info("Processing betas for %s", db_filename)
        self.args.weight_db_path = os.path.abspath(db_filename)
        cov_directory = self.args.covariance_directory
        if cov_directory.upper() == "SAME":
            # "SAME" means: look for the covariance next to the database.
            cov_directory = "/".join(self.args.weight_db_path.split("/")[0:-1])

        extComponents = self.args.covariance_suffix.split("..")

        if len(extComponents) > 1:
            # Suffix given as "<cov ext>..<db ext>": strip the db extension
            # from the database name and append the covariance extension.
            covext = "..".join(extComponents[0:-1])
            dbext = extComponents[-1]
            filebase = db_filename.replace(dbext, "")
            self.args.covariance = "%s/%s%s" % (
                cov_directory, filebase.split("/")[-1], covext)
        else:
            # Use the whole base name; strip("/")[-1] would keep only its
            # last character.
            self.args.covariance = "%s/%s%s" % (
                cov_directory, filebase.split("/")[-1],
                self.args.covariance_suffix)
        file_prefix = filebase.split("/")[-1].split(".")[0]
        beta_output = os.path.join(output_folder, file_prefix)
        logging.info("Writing betas to %s", beta_output)

        self.args.output_folder = beta_output

        logging.info("Loading weight model")
        weight_db_logic = WeightDBUtilities.WeightDBEntryLogic(
            self.args.weight_db_path)

        betaScript = M03_betas.GetBetas(self.args)
        names = Utilities.contentsWithRegexpFromFolder(
            self.args.gwas_folder, betaScript.gwas_regexp)

        if not names:
            # Without at least one GWAS file there is no report prefix, and
            # building the output file name below would fail on None.
            raise Exceptions.ReportableException(
                "No GWAS files found in %s" % (self.args.gwas_folder,))

        if not os.path.exists(beta_output):
            os.makedirs(beta_output)
        betaScript.output_folder = beta_output

        report_prefix = None
        for name in names:
            name = name + ".gz"
            if report_prefix is None:
                # The report is named after the first GWAS file.
                report_prefix = name.split("/")[-1].split(".")[0]
            try:
                betaScript.buildBetas(weight_db_logic, name)
            except Exceptions.BadFilename as e:
                # Unrelated files may live in the folder; log and move on.
                logging.info("Wrong file name: %s, skipping", e.msg)

        suffix = ".csv"
        self.args.output_file = os.path.join(
            output_folder, report_prefix + "-" + file_prefix + suffix)

        # ZScores
        logging.info("Calculating ZScores for %s", filebase)
        zscoreScript = M04_zscores.CalculateZScores(self.args)
        zscoreScript.folder_beta = betaScript.output_folder
        zscoreScript.run()
Exemplo n.º 10
0
    def run(self):
        """Merge every weight database under ``self.db_path`` and store it.

        Files in ``self.db_path`` whose names end in ".db" but not in
        "sqtl.db" are loaded, and their ``weights_by_gene`` entries are merged
        into the first loaded database object. Weights already present for a
        gene/rsid pair are kept. The merged object is pickled to
        "db_weight_logic.pickle" (relative to the working directory), gene
        info is saved, and unless ``self.store_pickle_only`` is set
        ``buildFiles`` is run on the merged object.

        Returns early, after logging, when ``self.covariance_output`` is not
        set or when no database was loaded.
        """
        if not self.covariance_output:
            logging.info("Provide --covariance_output or both")
            return

        # list all the databases in the path
        for db_name in sorted(os.listdir(self.db_path)):
            if db_name.endswith(".db") and not db_name.endswith("sqtl.db"):
                self.db_file_list.append(db_name)

        # load the database and build the separate db entry logic
        logging.info("Loading Weights")
        for count, db_name in enumerate(self.db_file_list, 1):
            # os.path.join also works when db_path has no trailing separator.
            filename = os.path.join(self.db_path, db_name)
            self.db_logic_dict[db_name] = WeightDBUtilities.WeightDBEntryLogic(
                filename)
            logging.info("Building file" + str(count))

        if not self.db_logic_dict:
            # Nothing to merge; picking a "first" database would fail.
            logging.info("No weight databases found in %s", self.db_path)
            return

        # merge the info from different databases into the first one loaded
        tmp_logic_object = next(iter(self.db_logic_dict.values()))
        merged_weights = tmp_logic_object.weights_by_gene
        for count, db_logic in enumerate(self.db_logic_dict.values(), 1):
            logging.info("Scanning file" + str(count))
            num_gene = len(db_logic.weights_by_gene)
            genes = db_logic.weights_by_gene.items()
            for count_gene, (gene, gene_weights) in enumerate(genes, 1):
                if count_gene % 150 == 0:
                    logging.info("Percentage of genes processed  " + str(
                        round(float(count_gene) / float(num_gene), 2) * 100))
                if gene in merged_weights:
                    known_weights = merged_weights[gene]
                    for rsid, weight in gene_weights.items():
                        # Keep the weight that was merged first.
                        known_weights.setdefault(rsid, weight)
                else:
                    merged_weights[gene] = gene_weights

        # summary of gene count and snp count
        logging.info("Total Genes:" + str(len(merged_weights)))
        rsid_count = sum(len(weights) for weights in merged_weights.values())
        logging.info("Total SNPs:" + str(rsid_count))

        # store the pickle file; the with-block closes it even if dump fails
        with open("db_weight_logic.pickle", "wb") as pickle_out:
            pickle.dump(tmp_logic_object, pickle_out)

        # store the gene info
        self.saveGeneInfo(tmp_logic_object)

        # build the covariance files unless only the pickle was requested
        if not self.store_pickle_only:
            self.buildFiles(tmp_logic_object)

        logging.info("Ran successfully")