예제 #1
0
def run_metaxcan(args, context):
    """Run the MetaXcan gene-level association for every gene shared between
    the prediction model and the GWAS, returning (and optionally saving) the
    formatted results dataframe."""
    logging.info("Started metaxcan association")

    # Progress is reported as the fraction of model SNPs seen in the GWAS.
    total_snps = len(context.get_model_snps())
    reporter = Utilities.PercentReporter(logging.INFO, total_snps)
    snps_used = set()

    i_genes, _ = context.get_data_intersection()

    gene_results = []
    for gene in i_genes:
        logging.log(7, "Processing gene %s", gene)
        association, snps = AssociationCalculation.association(gene, context, return_snps=True)
        gene_results.append(association)
        snps_used.update(snps)
        reporter.update(len(snps_used), "%d %% of model's snps found so far in the gwas study")

    reporter.update(len(snps_used), "%d %% of model's snps used", force=True)

    frame = AssociationCalculation.dataframe_from_results(gene_results)
    frame = MetaxcanUtilities.format_output(frame, context, args.remove_ens_version)

    if args.output_file:
        Utilities.ensure_requisite_folders(args.output_file)
        frame.to_csv(args.output_file, index=False)

    return frame
예제 #2
0
def run(args):
    """Convert a plain-text predicted-expression table to an HDF5 file.

    Expects `args.input` to be a tab-separated table whose first two columns
    are sample identifiers (the second named "IID"), followed by one column
    per gene. Writes datasets "pred_expr", "genes" and "samples" into
    `args.output`. Skips the run entirely if the output already exists.
    """
    if os.path.exists(args.output):
        logging.info("Output exists, delete it or move it if you want it generated again")
        return

    Utilities.ensure_requisite_folders(args.output)

    logging.info("Reading input")
    data = pandas.read_table(args.input)

    logging.info("Opening output")
    # 50MB chunk cache to keep the per-gene chunked writes fast.
    f = h5py_cache.File(args.output, 'w', chunk_cache_mem_size=int(50 * (1024 ** 2)))

    n_genes = data.shape[1]-2  # first two columns are sample ids, not genes
    n_samples = data.shape[0]
    n_genes_chunk = np.min((n_genes, 10))

    logging.info("Processing expression")
    p = f.create_dataset("pred_expr", shape=(n_genes, n_samples),
                                        chunks=(n_genes_chunk, n_samples),
                                        dtype=np.dtype('float32'), scaleoffset=4, compression='gzip')
    g = f.create_dataset("genes", (n_genes,), dtype="S30")

    for i, gene in enumerate(data.columns.values[2:]):
        p[i, :] = data[gene].to_numpy()
        g[i] = np.string_(gene)

    logging.info("saving samples")
    s = f.create_dataset("samples", (n_samples,), dtype="S25")
    # BUG FIX: was `xrange`, which is Python-2-only and raises NameError
    # under Python 3 (the rest of this code uses Python-3-era pandas APIs).
    for i in range(0, n_samples):
        s[i] = np.string_(data["IID"][i])
    f.close()
    logging.info("Done")
예제 #3
0
def run(args):
    """Run the joint cross-model analysis over every gene in the context and
    write the results as a tab-separated file to args.output."""
    start = timer()
    if os.path.exists(args.output):
        logging.info("%s already exists, you have to move it or delete it if you want it done again", args.output)
        return
    logging.info("Creating context")
    context = CrossModelUtilities.context_from_args(args)

    n_genes = context.get_n_genes()
    reporter = Utilities.PercentReporter(logging.INFO, n_genes)

    logging.info("Processing")

    reporter.update(0, "%d %% of model's genes processed so far")
    gene_results = []
    for index, gene in enumerate(context.get_genes()):
        logging.log(7, "Gene %d/%d: %s", index + 1, n_genes, gene)
        gene_results.append(JointAnalysis.joint_analysis(context, gene))
        reporter.update(index, "%d %% of model's genes processed so far")

    frame = JointAnalysis.format_results(gene_results)
    Utilities.ensure_requisite_folders(args.output)
    frame.to_csv(args.output, index=False, sep="\t")

    end = timer()
    logging.info("Ran multi tissue in %s seconds" % (str(end - start)))
    def run(self):
        """Gather allele statistics per autosome from GWAS dosage files and
        1000 Genomes legend files, writing them as a chromosome-sorted CSV
        to self.output_file. Skips the run if the output already exists.
        """
        if os.path.exists(self.output_file):
            logging.info("File %s already exists, delete it if you want it calculated again", self.output_file)
            return

        # NOTE(review): the message references self.weight_db but the logic is
        # built from self.db_path -- confirm both refer to the same database.
        logging.info("Opening %s", self.weight_db)
        weight_db_logic = WeightDBUtilities.WeightDBEntryLogic(self.db_path)

        # Autosomes only. BUG FIX: was `xrange`, which is Python-2-only and
        # raises NameError under Python 3.
        CHROMOSOMES = ["chr"+str(x) for x in range(1, 23)]

        dosage_names = Utilities.dosageNamesFromFolder(self.data_folder_gwas_dosage)
        legend_names = Utilities.legendNamesFromFolder(self.data_folder_phase)

        findings={}
        for chromosome in CHROMOSOMES:
            logging.info("Processing chromosome %s", chromosome)
            # Find this chromosome's dosage file, load it, and fold its
            # allele statistics into `findings`.
            dosage_name = Utilities.removeNameWithPatterns(dosage_names, [chromosome+"."])
            dosage = self.loadDosageFile(self.data_folder_gwas_dosage, dosage_name)
            self.processDosage(chromosome, weight_db_logic, dosage, findings)

            legend_name = Utilities.removeNameEndingWith(legend_names, chromosome)
            self.processLegendName(chromosome, weight_db_logic, dosage, findings, legend_name)


        with open(self.output_file, "w") as file:
            file.write(AlleleStats.CSVHeader())

            # Emit findings ordered by chromosome so the output is stable.
            def sortByChromosome(finding):
                return finding.chromosome
            entries = sorted(findings.values(), key=sortByChromosome)
            for finding in entries:
                line = finding.toCSVLine()
                file.write(line)
예제 #5
0
def run(args, _gwas=None):
    """Run the MetaXcan association end to end and write the results CSV to
    args.output_file, honoring args.overwrite for pre-existing outputs."""
    start = timer()
    if not args.overwrite and os.path.exists(args.output_file):
        logging.info("%s already exists, move it or delete it if you want it done again", args.output_file)
        return
    logging.info("Started metaxcan association")

    context = MetaxcanUtilities.build_context(args, _gwas)

    # Progress is measured against the full set of model SNPs.
    total_snps = len(context.get_model_snps())
    reporter = Utilities.PercentReporter(logging.INFO, total_snps)
    found = set()

    i_genes, _ = context.get_data_intersection()

    per_gene = []
    for gene in i_genes:
        association, snps = AssociationCalculation.association(gene, context, return_snps=True)
        per_gene.append(association)
        found.update(snps)
        reporter.update(len(found), "%d %% of model's snps found so far in the gwas study")

    Utilities.ensure_requisite_folders(args.output_file)

    reporter.update(len(found), "%d %% of model's snps used", force=True)
    frame = AssociationCalculation.dataframe_from_results(zip(*per_gene))
    frame = MetaxcanUtilities.format_output(frame, context, args.keep_ens_version)
    frame.to_csv(args.output_file, index=False)
    end = timer()
    logging.info("Sucessfully processed metaxcan association in %s seconds"%(str(end - start)))
예제 #6
0
def run(args):
    """Joint cross-model analysis: process every gene in the context and
    save a tab-separated results file to args.output."""
    started = timer()
    if os.path.exists(args.output):
        logging.info(
            "%s already exists, you have to move it or delete it if you want it done again",
            args.output)
        return
    logging.info("Creating context")
    analysis_context = CrossModelUtilities.context_from_args(args)

    gene_count = analysis_context.get_n_genes()
    progress = Utilities.PercentReporter(logging.INFO, gene_count)

    logging.info("Processing")
    progress.update(0, "%d %% of model's genes processed so far")
    collected = []
    for index, gene in enumerate(analysis_context.get_genes()):
        logging.log(7, "Gene %d/%d: %s", index + 1, gene_count, gene)
        collected.append(JointAnalysis.joint_analysis(analysis_context, gene))
        progress.update(index, "%d %% of model's genes processed so far")

    frame = JointAnalysis.format_results(collected)
    Utilities.ensure_requisite_folders(args.output)
    frame.to_csv(args.output, index=False, sep="\t")

    finished = timer()
    logging.info("Ran multi tissue in %s seconds" % (str(finished - started)))
예제 #7
0
File: PrediXcan.py  Project: ly-0/MetaXcan
def run(args):
    """Run multi-tissue PrediXcan association over all genes in the context
    and save the p-value-sorted results to args.output.

    Exactly one of args.hdf5_expression_file / args.expression_file must be
    provided; otherwise the run aborts with an informational message.
    """
    start = timer()
    if os.path.exists(args.output):
        logging.info("%s already exists, you have to move it or delete it if you want it done again", args.output)
        return

    # Exactly one expression input source is allowed.
    if (args.hdf5_expression_file and args.expression_file) or \
        (not args.hdf5_expression_file and not args.expression_file):
        logging.info("Provide either hdf5 expression file or plain text expression file")
        return

    with PrediXcanUtilities.p_context_from_args(args) as context:
        genes = context.get_genes()
        n_genes = len(genes)
        reporter = Utilities.PercentReporter(logging.INFO, n_genes)
        reporter.update(0, "%d %% of model's genes processed so far", force=True)
        results = []
        # BUG FIX: initialize `i` so the final reporter.update below does not
        # raise NameError when the context yields no genes.
        i = 0
        for i, gene in enumerate(genes):
            logging.log(7, "Processing gene %s", gene)
            r = PrediXcanAssociation.predixcan_association(gene, context)
            results.append(r)
            reporter.update(i, "%d %% of model's genes processed so far")
        reporter.update(i, "%d %% of model's genes processed so far")
        results = PrediXcanAssociation.dataframe_from_results(results)
        results = results.fillna("NA")
        results = results.sort_values(by="pvalue")

        Utilities.save_dataframe(results, args.output)

    end = timer()
    logging.info("Ran multi tissue predixcan in %s seconds" % (str(end - start)))
예제 #8
0
    def testContentsWithPatternsFromFolders(self):
        """contentsWithPatternsFromFolder must honor every pattern at once."""
        # No file in the fixture matches both "sample" and "Fail".
        found = set(Utilities.contentsWithPatternsFromFolder("tests/_td/dosage_set_1", ["sample", "Fail"]))
        self.assertEqual(found, set([]))

        # Exactly one file matches both "set" and "sample".
        found = set(Utilities.contentsWithPatternsFromFolder("tests/_td/dosage_set_1", ["set", "sample"]))
        self.assertEqual(found, {"set.sample"})
예제 #9
0
    def testCheckSubdirectorySanity(self):
        """A directory is not a sane subdirectory of itself, and the check
        must be direction-sensitive."""
        self.assertFalse(Utilities.checkSubdirectorySanity("tests", "tests"))
        self.assertTrue(Utilities.checkSubdirectorySanity("tests", "tests/_td"))
        self.assertFalse(Utilities.checkSubdirectorySanity("tests/_td", "tests"))
예제 #10
0
    def testCheckSubdirectorySanity(self):
        """checkSubdirectorySanity: same path and parent-of-base are insane;
        a true child of the base is sane."""
        cases = [
            ("tests", "tests", False),
            ("tests", "tests/_td", True),
            ("tests/_td", "tests", False),
        ]
        for base, candidate, expected in cases:
            sane = Utilities.checkSubdirectorySanity(base, candidate)
            self.assertEqual(sane, expected)
예제 #11
0
def _run(args, subset=None, append=None):
    """Build a standardized feature matrix from the expression folder and
    persist its covariances to args.output."""
    logging.info("Loading expressions")
    manager = FeatureMatrix.build_manager(
        args.expression_folder,
        filters=args.expression_filters,
        standardize=True,
        subset=subset)

    logging.info("Saving")
    Utilities.ensure_requisite_folders(args.output)
    manager.save_covariances(args.output, append=append)

    logging.info("Ran.")
예제 #12
0
def _run(args, subset=None, append=None):
    """Load a standardized expression feature matrix, then write its
    covariances to args.output (optionally appending)."""
    logging.info("Loading expressions")
    expression_manager = FeatureMatrix.build_manager(args.expression_folder, filters=args.expression_filters, standardize=True, subset=subset)

    logging.info("Saving")
    Utilities.ensure_requisite_folders(args.output)
    expression_manager.save_covariances(args.output, append=append)

    logging.info("Ran.")
예제 #13
0
    def run(self):
        """Run the MetaXcan covariance-file analysis and save its entries to
        self.output_file, creating the output folder on demand."""
        output_dir = os.path.split(self.output_file)[0]
        if len(output_dir) and not os.path.exists(output_dir):
            os.makedirs(output_dir)

        if os.path.exists(self.output_file):
            logging.info(
                "Results path %s already exists, delete it if you want it to be calculated again",
                self.output_file)
            return

        people_by_id = None
        if os.path.exists(self.selected_dosage_folder):
            logging.info("Loading people")
            samples_path = Utilities.samplesInputPath(self.selected_dosage_folder)
            if samples_path is not None:
                loaded = Person.Person.loadPeople(samples_path)
                people_by_id = {person.id: person for person in loaded}

        logging.info("Loading weights from database: %s" % (self.weight_db_path))
        weight_db_logic = WeightDBUtilities.WeightDBEntryLogic(self.weight_db_path)

        # Normalization is ignored at the moment. Not sure if it will return.
        results, normalization = self.resultsFromCovarianceFile(weight_db_logic)

        self.saveEntries(self.output_file, results)

        logging.info("Successfully ran MetaXcan analysis")
예제 #14
0
def get_name_prefix(args):
    """Derive a report prefix from the first GWAS file found in the folder:
    the file's basename up to its first dot."""
    regexp = re.compile(args.gwas_file_pattern) if args.gwas_file_pattern else None
    first_name = Utilities.contentsWithRegexpFromFolder(args.gwas_folder, regexp)[0]
    return first_name.split("/")[-1].split(".")[0]
예제 #15
0
def readGWAS(args):
    """Read every GWAS file in args.gwas_folder matching the configured
    pattern, build betas for each, and return them concatenated as one
    dataframe.

    Raises Exceptions.ReportableException when no file matches the pattern.
    """
    start = timer()
    validate(args)
    regexp = re.compile(args.gwas_file_pattern) if args.gwas_file_pattern else None
    names = Utilities.contentsWithRegexpFromFolder(args.gwas_folder, regexp)
    names.sort() #cosmetic, because different filesystems/OS yield folders in different order

    if len(names) == 0:
        msg = "No GWAS files found on %s with pattern %s" % (args.gwas_folder, args.gwas_file_pattern,)
        raise Exceptions.ReportableException(msg)

    # BUG FIX: was a Python-2 `print` statement, which is a SyntaxError under
    # Python 3 (this very function already uses the call form below).
    print("INFO: Reading GWAS data")
    gwas_format = GWASUtilities.gwas_format_from_args(args)
    GWAS.validate_format_basic(gwas_format)
    GWAS.validate_format_for_strict(gwas_format)
    # Model loading is currently disabled; betas are built without a model.
    model = None
    r = pandas.DataFrame()
    for name in names:
        b = build_betas(args, model, gwas_format, name)
        r = pandas.concat([r,b])
    end = timer()
    logging.info("Successfully parsed input gwas in %s seconds"%(str(end-start)))
    print("Successfully parsed input gwas in %s seconds"%(str(end-start)))
    return r
예제 #16
0
    def run(self):
        """Compute results from the covariance file and save both the entries
        and the chosen normalization to self.output_file."""
        parent = os.path.split(self.output_file)[0]
        if len(parent) and not os.path.exists(parent):
            os.makedirs(parent)

        if os.path.exists(self.output_file):
            logging.info("Results path %s already exists, delete it if you want it to be calculated again", self.output_file)
            return

        logging.info("Loading people")
        people_by_id = None
        if os.path.exists(self.selected_dosage_folder):
            samples_path = Utilities.samplesInputPath(self.selected_dosage_folder)
            if samples_path is not None:
                loaded = Person.Person.loadPeople(samples_path)
                people_by_id = {person.id: person for person in loaded}

        logging.info("Loading weights from database: %s" % (self.weight_db_path))
        weight_db_logic = WeightDBUtilities.WeightDBEntryLogic(self.weight_db_path)

        results = None
        normalization = None
        results, normalization = self.resultsFromCovarianceFile(weight_db_logic)

        self.saveEntries(self.output_file, results, normalization)
예제 #17
0
    def processIMPUTEFiles(self):
        """Filter IMPUTE haplotype/dosage files down to the selected people
        and snps, writing each in the configured output format.

        Raises Exceptions.InvalidOutputFormat for unknown output formats.
        """
        logging.info("Loading people")
        names = Utilities.hapNamesFromFolder(self.dosage_folder)
        all_people = Person.Person.loadPeople(self.samples_input)

        selected_people = Person.Person.loadPeople(self.samples_output, delim=" ")
        selected_people_by_id = {p.id:p for p in selected_people}

        logging.info("Loading snps")
        snp_data_set = DataSet.DataSetFileUtilities.loadFromCompressedFile(self.snp_list)
        snp_dict = {rsid:True for rsid in snp_data_set.data}

        for name in names:
            output = os.path.join(self.output_folder, name)
            # Renamed local from `filter` to avoid shadowing the builtin.
            builder = ThousandGenomesUtilities.IMPUTEFilteredDosageFileBuilder()
            builder.base_path = self.dosage_folder
            builder.name = name
            builder.output_pattern = output
            builder.snp_dict = snp_dict
            builder.all_people = all_people
            builder.selected_people_by_id = selected_people_by_id

            if self.output_format == Formats.IMPUTE:
                builder.buildIMPUTE()
            elif self.output_format == Formats.PrediXcan:
                # PrediXcan output needs the chromosome parsed from the name.
                search = self.chromosome_in_name_regex.search(name)
                exitIf(search is None, Exceptions.InvalidInputFormat, \
                             "No files found in '%s' that match the pattern, '%s'" \
                             % (self.dosage_folder, self.chromosome_in_name_regex.pattern))
                # Renamed local from `chr` to avoid shadowing the builtin.
                chromosome = search.group(1)
                builder.chromosome_name = chromosome
                builder.buildPrediXcan()
            else:
                raise Exceptions.InvalidOutputFormat(self.output_format)
예제 #18
0
    def test_gtex_geno_lines_generator(self):
        """The line generator must reproduce the merge of the genotype file
        with its snp annotation file."""
        lines = []
        for i, line in enumerate(
                GTExGenotype.gtex_geno_lines(
                    "tests/_td/genotype/gtex_like.txt.gz",
                    "tests/_td/genotype/gtex_snp.txt.gz")):
            lines.append(line)

        header = GTExGenotype.gtex_geno_header("tests/_td/genotype/gtex_like.txt.gz")
        gtex_ids = header[1:]
        columns = ["rsid", "chromosome", "position", "ref_allele", "alt_allele", "frequency"] + gtex_ids
        dataframe = Utilities.to_dataframe(lines, columns, to_numeric="ignore")

        # Build the expected frame directly from the two raw inputs.
        gtex_snp = pandas.read_table("tests/_td/genotype/gtex_snp.txt.gz")
        expected = pandas.read_table("tests/_td/genotype/gtex_like.txt.gz")
        expected = pandas.merge(expected, gtex_snp, left_on="Id", right_on="VariantID")

        compare_data_frames(dataframe, expected, gtex_ids)
    def buildFiles(self, weight_db_logic):
        """Write correlation and/or covariance files for every dosage in the
        data folder, skipping any output that already exists."""
        def _prepare(path):
            # True when `path` should be generated; creates its folder and
            # writes the header when so.
            if path is None:
                return False
            if os.path.exists(path):
                logging.info("%s already exists, delete it if you want it figured out again", path)
                return False
            parent = os.path.dirname(path)
            if not os.path.exists(parent):
                os.makedirs(parent)
            self.writeFileHeader(path)
            return True

        do_correlations = _prepare(self.correlation_output)
        do_covariances = _prepare(self.covariance_output)

        if not do_covariances and not do_correlations:
            return

        for name in Utilities.dosageNamesFromFolder(self.data_folder):
            snps, snps_by_rsid = self.getSNPS(name, weight_db_logic)
            if do_correlations:
                self.addToCorrelationFile(weight_db_logic, name, snps, snps_by_rsid)

            if do_covariances:
                self.addToCovarianceFile(weight_db_logic, name, snps, snps_by_rsid)
예제 #20
0
    def buildFiles(self, weight_db_logic):
        """Generate correlation/covariance output files from every dosage in
        the data folder; pre-existing outputs are left untouched."""
        write_correlations = self.correlation_output is not None
        if write_correlations and os.path.exists(self.correlation_output):
            logging.info("%s already exists, delete it if you want it figured out again", self.correlation_output)
            write_correlations = False
        elif write_correlations:
            target_dir = os.path.dirname(self.correlation_output)
            if not os.path.exists(target_dir):
                os.makedirs(target_dir)
            self.writeFileHeader(self.correlation_output)

        write_covariances = self.covariance_output is not None
        if write_covariances and os.path.exists(self.covariance_output):
            logging.info("%s already exists, delete it if you want it figured out again", self.covariance_output)
            write_covariances = False
        elif write_covariances:
            target_dir = os.path.dirname(self.covariance_output)
            if not os.path.exists(target_dir):
                os.makedirs(target_dir)
            self.writeFileHeader(self.covariance_output)

        if not write_covariances and not write_correlations:
            return

        for name in Utilities.dosageNamesFromFolder(self.data_folder):
            snps, snps_by_rsid = self.getSNPS(name, weight_db_logic)
            if write_correlations:
                self.addToCorrelationFile(weight_db_logic, name, snps, snps_by_rsid)

            if write_covariances:
                self.addToCovarianceFile(weight_db_logic, name, snps, snps_by_rsid)
예제 #21
0
    def processPrediXcanFiles(self):
        """Filter PrediXcan-format dosage files down to the selected people
        and snps, writing each in the configured output format.

        Raises Exceptions.InvalidOutputFormat for unknown output formats.
        """
        logging.info("Loading people")
        all_people = Person.Person.loadPeople(self.samples_input, '\t', False)
        selected_people = Person.Person.loadPeople(self.samples_output)
        selected_people_by_id = {p.id: p for p in selected_people}
        logging.info("%d total people, %d selected", len(all_people),
                     len(selected_people_by_id))

        logging.info("Loading snps")
        snp_data_set = DataSet.DataSetFileUtilities.loadFromCompressedFile(
            self.snp_list)
        snp_dict = {k: True for k in snp_data_set.data}
        # BUG FIX: was a Python-2 `print` statement (SyntaxError in Python 3).
        print(len(snp_dict.keys()))

        contents = Utilities.contentsWithPatternsFromFolder(
            self.dosage_folder, ["dosage.txt.gz"])
        for content_name in contents:
            input_path = os.path.join(self.dosage_folder, content_name)
            fileBuilder = PrediXcanFormatUtilities.PrediXcanFormatFilteredFilesProcess(
                input_path, self.output_folder, content_name, all_people,
                selected_people_by_id, snp_dict)
            if self.output_format == Formats.IMPUTE:
                fileBuilder.buildIMPUTE()
            # BUG FIX: this was a second `if`, so the IMPUTE branch fell
            # through to the `else` below and raised InvalidOutputFormat.
            elif self.output_format == Formats.PrediXcan:
                fileBuilder.buildPrediXcan()
            else:
                raise Exceptions.InvalidOutputFormat(self.output_format)
예제 #22
0
def run_additional(args, context):
    """Compute per-gene additional statistics for the model/GWAS intersection
    and return (and optionally save) them as a formatted dataframe."""
    logging.info("Started metaxcan additional stats")
    i_genes, _ = context.get_data_intersection()

    stats = [AssociationCalculation.additional_stats(gene, context) for gene in i_genes]

    frame = AssociationCalculation.dataframe_from_aditional_stats(stats)
    frame = MetaxcanUtilities.format_additional_output(frame, context, args.remove_ens_version)

    if args.additional_output:
        Utilities.ensure_requisite_folders(args.additional_output)
        frame.to_csv(args.additional_output, index=False)

    return frame
def run(args):
    """Interactive debugging helper: load weights, covariances and betas,
    then (for args.gene_name) preprocess that gene's data, optionally
    dropping into an IPython shell via embed()."""
    logging.info("Loading weight db")
    weight_db_logic = WeightDBUtilities.WeightDBEntryLogic(args.weight_db_path)

    logging.info("Loading covariance file")
    covariance_contents = MatrixUtilities.loadMatrixFromFile(args.covariance)

    logging.info("Choosing method")
    beta_contents = Utilities.contentsWithPatternsFromFolder(args.beta_folder, [])
    zscore_calculation, normalization = MethodGuessing.chooseZscoreSchemeFromFiles(args.beta_folder, beta_contents, covariance_contents, weight_db_logic)

    logging.info("Processing")
    betas = {}
    for content in beta_contents:
        logging.info("Loading betas")
        beta_path = os.path.join(args.beta_folder, content)
        beta_sets = KeyedDataSet.KeyedDataSetFileUtilities.loadDataSetsFromCompressedFile(beta_path, header="")
        # Keyed by set name; renamed loop variable from `set` to avoid
        # shadowing the builtin.
        betas[content] = {s.name: s for s in beta_sets}

    if args.gene_name:
        try:
            gene_data, weights, covariance_matrix, valid_rsids, beta_sets = get_gene_data(args.gene_name, weight_db_logic, covariance_contents, betas)
            weight_values, variances = ZScoreCalculation.preProcess(covariance_matrix, valid_rsids, weights, beta_sets)
            if args.interactive:
                embed()
            logging.info("Processed gene data")
        except Exception as e:
            # Broad catch is deliberate: report the failure and drop into an
            # interactive shell for debugging instead of crashing.
            logging.info("Couldn't get gene data: %s", e)
            embed()
0
    def run(self):
        """Build betas for every GWAS file found in the gwas folder.

        Loads the weight database when one is configured, creates the output
        folder on demand, and skips files whose names don't parse.

        Raises:
            Exceptions.ReportableException: when no GWAS file matches the
                configured pattern.
        """
        if self.args.weight_db_path:
            logging.info("Loading weight model")
            # NOTE(review): the guard checks self.args.weight_db_path but the
            # load reads self.weight_db_path -- confirm both stay in sync.
            weight_db_logic = WeightDBUtilities.WeightDBEntryLogic(
                self.weight_db_path)
        else:
            weight_db_logic = None

        names = Utilities.contentsWithRegexpFromFolder(self.gwas_folder,
                                                       self.gwas_regexp)

        if not os.path.exists(self.output_folder):
            os.makedirs(self.output_folder)

        if len(names) == 0:
            raise Exceptions.ReportableException(
                "No GWAS files found on %s with pattern %s" % (
                    self.gwas_folder,
                    self.gwas_regexp.pattern,
                ))

        for name in names:
            try:
                self.buildBetas(weight_db_logic, name)
            # This just means that there is some extra stuff inside that directory,
            # so I'm thinking we want to ignore it.
            except Exceptions.BadFilename as e:
                logging.info("Wrong file name: %s, skipping", e.msg)
                pass
0
 def torture_dosage(metadata, dosage, gtex_ids):
     """Assemble a dosage dataframe aligned to `metadata`, merged on rsid and
     indexed by sequential row number."""
     rows = [dosage[rsid] for rsid in metadata.rsid]
     frame = Utilities.to_dataframe(rows, gtex_ids, to_numeric="ignore")
     frame["rsid"] = list(metadata.rsid)
     frame = pandas.merge(metadata, frame, on="rsid")
     frame["number"] = list(range(0, len(frame)))
     return frame.set_index("number")
예제 #26
0
 def torture_dosage(metadata, dosage, gtex_ids):
     """Build a metadata-aligned dosage frame, merged on rsid and given a
     sequential "number" index."""
     d = Utilities.to_dataframe([dosage[r] for r in metadata.rsid], gtex_ids, to_numeric="ignore")
     d["rsid"] = list(metadata.rsid)
     d = pandas.merge(metadata, d, on="rsid")
     d["number"] = range(0, len(d))
     return d.set_index("number")
예제 #27
0
def run(args):
    """Build GWAS betas for every input file.

    Reads GWAS summary files either from args.gwas_folder (filtered by
    args.gwas_file_pattern) or from the single args.gwas_file. When
    args.output_folder is set, each processed file is written there as a
    (gzipped) tsv and nothing is returned; otherwise all per-file results
    are concatenated and returned as one dataframe.

    Raises:
        Exceptions.ReportableException: when a folder is given but no file
            matches the pattern.
    """
    start = timer()
    validate(args)

    if args.gwas_folder:
        regexp = re.compile(
            args.gwas_file_pattern) if args.gwas_file_pattern else None
        names = Utilities.contentsWithRegexpFromFolder(args.gwas_folder,
                                                       regexp)
        names.sort(
        )  #cosmetic, because different filesystems/OS yield folders in different order

        if len(names) == 0:
            msg = "No GWAS files found on %s with pattern %s" % (
                args.gwas_folder,
                args.gwas_file_pattern,
            )
            raise Exceptions.ReportableException(msg)
    else:
        names = [args.gwas_file]

    gwas_format = GWASUtilities.gwas_format_from_args(args)
    GWAS.validate_format_basic(gwas_format)
    GWAS.validate_format_for_strict(gwas_format)
    # The prediction model is optional; betas can be built without one.
    model = PredictionModel.load_model(
        args.model_db_path,
        args.model_db_snp_key) if args.model_db_path else None

    if args.output_folder:
        if not os.path.exists(args.output_folder):
            os.makedirs(args.output_folder)

        for name in names:
            output_path = os.path.join(args.output_folder, name)
            if not ".gz" in output_path:
                output_path += ".gz"
            # Skip any output already produced by a previous run.
            if os.path.exists(output_path):
                logging.info(
                    "%s already exists, delete it if you want it to be done again",
                    output_path)
                continue

            b = build_betas(args, model, gwas_format, name)
            # Gzip the output only when the input name suggests gzip.
            c = "gzip" if ".gz" in name else None
            b.to_csv(output_path, sep="\t", index=False, compression=c)
        end = timer()
        logging.info("Successfully ran GWAS input processing in %s seconds" %
                     (str(end - start)))
    else:
        r = pandas.DataFrame()
        for name in names:
            b = build_betas(args, model, gwas_format, name)
            r = pandas.concat([r, b])
        end = timer()
        logging.info("Successfully parsed input gwas in %s seconds" %
                     (str(end - start)))

        return r
예제 #28
0
def run_metaxcan(args, context):
    """Run the MetaXcan association (with optional additional statistics and
    an optional early-exit cap via args.MAX_R), returning the formatted
    results dataframe and optionally saving it to args.output_file."""
    logging.info("Started metaxcan association")
    model_snps = context.get_model_snps()
    total_snps = len(model_snps)
    snps_found = set()
    reporter = Utilities.PercentReporter(logging.INFO, total_snps)

    i_genes, i_snps = context.get_data_intersection()

    results = []
    additional = []
    for i, gene in enumerate(i_genes):
        if args.MAX_R and i + 1 > args.MAX_R:
            # BUG FIX: logging.log requires a level as its first argument;
            # passing only the message raised TypeError on early exit.
            logging.log(9, "Early exit condition met")
            break
        logging.log(9, "Processing gene %i:%s", i, gene)
        r, snps = AssociationCalculation.association(gene,
                                                     context,
                                                     return_snps=True)
        results.append(r)
        snps_found.update(snps)
        reporter.update(
            len(snps_found),
            "%d %% of model's snps found so far in the gwas study")
        if args.additional_output:
            stats_ = AssociationCalculation.additional_stats(gene, context)
            additional.append(stats_)

    reporter.update(len(snps_found), "%d %% of model's snps used", force=True)

    results = AssociationCalculation.dataframe_from_results(results)
    results = MetaxcanUtilities.format_output(results, context,
                                              args.remove_ens_version)

    if args.additional_output:
        additional = AssociationCalculation.dataframe_from_aditional_stats(
            additional)
        results = MetaxcanUtilities.merge_additional_output(
            results, additional, context, args.remove_ens_version)

    if args.output_file:
        Utilities.ensure_requisite_folders(args.output_file)
        results.to_csv(args.output_file, index=False)

    return results
예제 #29
0
def get_name_prefix(args):
    """Report prefix derived either from the first GWAS file in the folder
    or from the single GWAS file argument."""
    if args.gwas_folder:
        regexp = re.compile(args.gwas_file_pattern) if args.gwas_file_pattern else None
        first = Utilities.contentsWithRegexpFromFolder(args.gwas_folder, regexp)[0]
        return get_result_prefix(args, first)
    return get_result_prefix(args, args.gwas_file)
예제 #30
0
def get_name_prefix(args):
    """Pick the source name for the report prefix: the first folder match
    when a gwas folder is configured, the single gwas file otherwise."""
    if not args.gwas_folder:
        return get_result_prefix(args, args.gwas_file)
    pattern = re.compile(args.gwas_file_pattern) if args.gwas_file_pattern else None
    names = Utilities.contentsWithRegexpFromFolder(args.gwas_folder, pattern)
    return get_result_prefix(args, names[0])
예제 #31
0
def run(args):
    """Run multi-tissue PrediXcan simulations and save result tables.

    Writes up to three outputs next to ``args.output_prefix``: the
    multi-tissue results (sorted by pvalue), per-gene additional stats,
    and single-tissue PrediXcan results when produced. Skips the whole
    run if the main output already exists.
    """
    start = timer()

    folder, prefix = os.path.split(args.output_prefix)
    results_name = args.output_prefix + "__mt_results.txt"
    predixcan_results_name = args.output_prefix + "__p_results.txt"
    additional_name = args.output_prefix + "__additional.txt"

    if os.path.exists(results_name):
        logging.info(
            "%s already exists, you have to move it or delete it if you want it done again",
            results_name)
        return

    #for reproducibility
    numpy.random.seed(100)

    results = []
    additional = []
    predixcan_results = []

    n_max = args.max_n_results
    logging.info("Acquiring context")
    with MultiPredixcanSimulations.context_from_args(args) as context:
        logging.info("processing")
        _c, _cp, _e = context.get_mp_simulation(None)
        for i, gene in enumerate(context.get_genes()):
            if n_max and i + 1 > n_max:
                logging.info("Max runs met")
                break
            logging.log(9, "%d Gene %s", i, gene)
            r, add, p = MultiPredixcanSimulations.simulate(gene, context)
            if r is None:
                logging.log(9, "%s could not be simulated", gene)
                continue
            results.append(r)
            additional.append(add)

            # single-tissue results are optional; collect only when produced
            if p is not None:
                predixcan_results.append(p)

    results = MultiPrediXcanAssociation.dataframe_from_results(
        results, _c).sort_values(by="pvalue")
    additional = pandas.concat(additional)

    Utilities.ensure_requisite_folders(results_name)
    Utilities.save_dataframe(results, results_name)
    Utilities.save_dataframe(additional, additional_name)

    if len(predixcan_results):
        predixcan_results = pandas.concat(predixcan_results)
        Utilities.save_dataframe(predixcan_results, predixcan_results_name)
    logging.info("Finished")
    # FIX: `start` was captured but elapsed time was never reported,
    # unlike the other run() entry points in this codebase.
    end = timer()
    logging.info("Ran multi tissue predixcan simulations in %s seconds" % (str(end - start)))
예제 #32
0
    def resultsFromCovarianceFile(self, weight_db_logic):
        """Build per-gene z-score entries from a covariance file plus beta files.

        Iterates every beta file in self.folder_beta, computes a z-score for
        each gene whose covariance and weights are available, and returns
        (results, normalization_constant) where results maps gene -> entry
        built by self.buildEntry.

        NOTE(review): uses Python 2 idioms (dict.iteritems, iterator.next()).
        """
        results = {}

        logging.info("Loading covariance file from %s", self.covariance)
        covariance_contents = MatrixUtilities.loadMatrixFromFile(self.covariance)
        #Keep only covariances present in gene models
        covariance_contents = {k:v for k,v in covariance_contents.iteritems() if k in weight_db_logic.weights_by_gene}

        beta_contents = Utilities.contentsWithPatternsFromFolder(self.folder_beta, [])
        zscore_calculation, normalization = self.selectMethod(self.folder_beta, beta_contents, covariance_contents, weight_db_logic)

        total_entries = len(weight_db_logic.genes_for_an_rsid)
        snps_found = set()
        reporter = Utilities.PercentReporter(logging.INFO, total_entries)
        for beta_name in beta_contents:
            logging.info("Processing %s", beta_name)

            beta_path = os.path.join(self.folder_beta, beta_name)

            # Load every keyed data set in the compressed beta file, indexed by name.
            beta_sets = KeyedDataSet.KeyedDataSetFileUtilities.loadDataSetsFromCompressedFile(beta_path, header="")
            beta_sets = {set.name:set for set in beta_sets }
            # Grab an arbitrary set; it is only used to test rsid membership below.
            key, check = beta_sets.iteritems().next()
            normalization.update(beta_sets)

            for gene, entry in covariance_contents.iteritems():
                #So, new covariance files might actually have more genes than those in the database
                if not gene in weight_db_logic.weights_by_gene:
                    logging.log(8, "Gene %s not in weights", gene)
                    continue

                weights = weight_db_logic.weights_by_gene[gene]
                present = [rsid for rsid,weight in weights.iteritems() if rsid in check.values_by_key]
                if len(present) == 0:
                    logging.log(5, "No rsid in beta file for %s", gene)
                    continue

                # A gene may appear across several beta files; only the first one wins.
                if gene in results:
                    logging.info("Gene %s already processed", gene)
                    continue

                covariance_matrix = entry[0]
                valid_rsids = entry[1]

                logging.log(7, "Calculating z score for %s", gene)

                pre_zscore, n, VAR_g, effect_size = zscore_calculation(gene, weights, beta_sets, covariance_matrix, valid_rsids)
                results[gene] = self.buildEntry(gene, weight_db_logic, weights, pre_zscore, n, VAR_g, effect_size)

                snps_found.update(present)
                reporter.update(len(snps_found), "%d %% of model's snps found so far in the gwas study")

        #second pass, for genes not in any beta file
        self.fillBlanks(results, covariance_contents, weight_db_logic, zscore_calculation)
        normalization_constant = normalization.calculateNormalization()
        return results, normalization_constant
예제 #33
0
    def resultsFromCovarianceFile(self, weight_db_logic):
        """Compute per-gene z-score entries from covariance and beta files.

        Older variant: progress is proxied by a gene counter rather than by
        snps found, and the covariance contents are not pre-filtered against
        the weight db.

        NOTE(review): unlike the sibling implementation, this indexes
        weight_db_logic.weights_by_gene[gene] without a membership check —
        a covariance gene absent from the db would raise KeyError; confirm
        inputs are guaranteed consistent.
        NOTE(review): Python 2 idioms (dict.iteritems, iterator.next()).
        """
        results = {}

        logging.info("Loading covariance file")
        covariance_contents = MatrixUtilities.loadMatrixFromFile(self.covariance)

        beta_contents = Utilities.contentsWithPatternsFromFolder(self.folder_beta, [])
        zscore_calculation, normalization = self.selectMethod(self.folder_beta, beta_contents, covariance_contents, weight_db_logic)

        total_entries = len(covariance_contents)
        reporter = Utilities.PercentReporter(logging.INFO, total_entries)
        i=0
        for beta_name in beta_contents:
            logging.info("Processing %s", beta_name)

            beta_path = os.path.join(self.folder_beta, beta_name)

            # Load every keyed data set in the compressed beta file, indexed by name.
            beta_sets = KeyedDataSet.KeyedDataSetFileUtilities.loadDataSetsFromCompressedFile(beta_path, header="")
            beta_sets = {set.name:set for set in beta_sets }
            # Arbitrary set used only for rsid membership checks below.
            key, check = beta_sets.iteritems().next()
            normalization.update(beta_sets)

            for gene, entry in covariance_contents.iteritems():
                weights = weight_db_logic.weights_by_gene[gene]
                # Process the gene only if at least one of its rsids is in this beta file.
                process = False
                for rsid, weight in weights.iteritems():
                    if rsid in check.values_by_key:
                        process = True
                        break

                if not process:
                    logging.log(5, "No rsid in beta file for %s", gene)
                    continue

                # A gene may appear in several beta files; keep the first result.
                if gene in results:
                    logging.info("Gene %s already processed", gene)
                    continue

                reporter.update(i, "%d %% of model's snp information found so far in the gwas study") #proxied by percenteage of genes

                covariance_matrix = entry[0]
                valid_rsids = entry[1]

                logging.log(7, "Calculating z score for %s", gene)

                pre_zscore, n, VAR_g = zscore_calculation(gene, weights, beta_sets, covariance_matrix, valid_rsids)
                results[gene] = self.buildEntry(gene, weight_db_logic, weights, pre_zscore, n, VAR_g)
                i+=1

        #second pass, for genes not in any beta file
        self.fillBlanks(results, covariance_contents, weight_db_logic, zscore_calculation)
        normalization_constant = normalization.calculateNormalization()
        return results, normalization_constant
예제 #34
0
def run(args):
    """Run multi-tissue PrediXcan simulations and save result tables.

    Writes up to three outputs next to ``args.output_prefix``: multi-tissue
    results (sorted by pvalue), per-gene additional stats, and optional
    single-tissue PrediXcan results. Skips everything if the main output
    already exists.
    """
    # NOTE(review): `start` is captured but elapsed time is never logged;
    # sibling run() functions report it at the end.
    start = timer()

    folder, prefix = os.path.split(args.output_prefix)
    results_name = args.output_prefix + "__mt_results.txt"
    predixcan_results_name = args.output_prefix + "__p_results.txt"
    additional_name = args.output_prefix + "__additional.txt"

    if os.path.exists(results_name):
        logging.info("%s already exists, you have to move it or delete it if you want it done again", results_name)
        return

    #for reproducibility
    numpy.random.seed(100)

    results = []
    additional = []
    predixcan_results = []

    n_max = args.max_n_results
    logging.info("Acquiring context")
    with MultiPredixcanSimulations.context_from_args(args) as context:
        logging.info("processing")
        _c, _cp, _e = context.get_mp_simulation(None)
        for i, gene in enumerate(context.get_genes()):
            # Optional cap on the number of simulated genes.
            if n_max and i+1>n_max:
                logging.info("Max runs met")
                break
            logging.log(9, "%d Gene %s", i, gene)
            r, add, p = MultiPredixcanSimulations.simulate(gene, context)
            if r is None:
                logging.log(9, "%s could not be simulated", gene)
                continue
            results.append(r)
            additional.append(add)

            # Single-tissue results are optional; collect only when produced.
            if p is not None:
                predixcan_results.append(p)

    results = MultiPrediXcanAssociation.dataframe_from_results(results, _c).sort_values(by="pvalue")
    additional = pandas.concat(additional)

    Utilities.ensure_requisite_folders(results_name)
    Utilities.save_dataframe(results, results_name)
    Utilities.save_dataframe(additional, additional_name)

    if len(predixcan_results):
        predixcan_results = pandas.concat(predixcan_results)
        Utilities.save_dataframe(predixcan_results, predixcan_results_name)
    logging.info("Finished")
 def getSNPS(self, name, weight_db_logic):
     """Load dosage snps for *name* using the configured input format.

     Returns (snps, snps_by_rsid), or None when the format is unsupported.
     """
     dosageLoader = None
     if self.input_format == Formats.IMPUTE:
         dosageLoader = ThousandGenomesUtilities.IMPUTEDosageLoader(self.data_folder, name) #outdated code
     elif self.input_format == Formats.PrediXcan:
         dosage_file = Utilities.dosageName(name)
         dosage_path = os.path.join(self.data_folder, dosage_file)
         dosageLoader = PrediXcanFormatUtilities.PrediXcanFormatDosageLoader(dosage_path, weight_db_logic)
     else:
         logging.info("Invalid input format: %s", self.input_format)
         return
     snps, snps_by_rsid = dosageLoader.load()
     return snps, snps_by_rsid
예제 #36
0
 def getSNPS(self, name, weight_db_logic):
     """Resolve the dosage loader for *name* and return (snps, snps_by_rsid).

     Returns None when the configured input format is not recognized.
     """
     dosageLoader = None
     fmt = self.input_format
     if fmt == Formats.IMPUTE:
         dosageLoader = ThousandGenomesUtilities.IMPUTEDosageLoader(self.data_folder, name) #outdated code
     elif fmt == Formats.PrediXcan:
         path = os.path.join(self.data_folder, Utilities.dosageName(name))
         dosageLoader = PrediXcanFormatUtilities.PrediXcanFormatDosageLoader(path, weight_db_logic)
     else:
         logging.info("Invalid input format: %s", fmt)
         return
     return dosageLoader.load()
예제 #37
0
def run(args):
    """Build per-chromosome SNP prediction covariances and write one output file.

    Each chromosome's covariances are appended to args.snp_covariance_output
    (header and write-mode only on the first chromosome). Skips the whole run
    if the output already exists.
    """
    if os.path.exists(args.snp_covariance_output):
        logging.info("%s already exists, you have to move it or delete it if you want it done again", args.snp_covariance_output)
        return

    start = timer()

    logging.info("Loading models...")
    model_manager = PredictionModel.load_model_manager(args.models_folder, name_pattern=args.models_pattern)
    all_snps = model_manager.get_rsids()

    logging.info("processing genotype")
    for chromosome, metadata, dosage in GenotypeUtilities.genotype_by_chromosome_from_args(args, all_snps):
        logging.log(9, "Processing chromosome %s", str(chromosome))
        # FIX: collect per-gene frames and concatenate once at the end;
        # pandas.concat inside the loop is quadratic in the number of genes.
        gene_frames = []

        context = GenotypeAnalysis.GenotypeAnalysisContext(metadata, dosage, model_manager)
        genes = context.get_genes()
        reporter = Utilities.PercentReporter(9, len(genes))
        reporter.update(0, "%d %% of genes processed so far in chromosome " + str(chromosome))
        for i, gene in enumerate(genes):
            logging.log(6, "%d/%d:%s", i+1, len(genes), gene)
            cov_data = GenotypeAnalysis.get_prediction_covariance(context, gene)
            cov_data = MatrixManager._flatten_matrix_data([cov_data])
            cov_data = Utilities.to_dataframe(cov_data, GenotypeAnalysis.COVARIANCE_COLUMNS, to_numeric="ignore", fill_na="NA")
            gene_frames.append(cov_data)

            reporter.update(i, "%d %% of genes processed so far in chromosome " + str(chromosome))

        reporter.update(len(genes), "%d %% of genes processed so far in chromosome " + str(chromosome))

        covariance_results = pandas.concat(gene_frames) if gene_frames else pandas.DataFrame()

        logging.log(9, "writing chromosome results")
        Utilities.save_dataframe(covariance_results, args.snp_covariance_output,
                                    mode="w" if chromosome == 1 else "a",
                                    header=chromosome == 1)

    end = timer()
    logging.info("Ran covariance builder in %s seconds" % (str(end - start)))
예제 #38
0
def run(args):
    """Run multi-tissue PrediXcan association over every gene in the context.

    Requires exactly one of hdf5_expression_folder / expression_folder.
    Saves the sorted results table plus optional coefficient and loadings
    outputs. Skips the run entirely if args.output already exists.
    """
    start = timer()
    if os.path.exists(args.output):
        logging.info(
            "%s already exists, you have to move it or delete it if you want it done again",
            args.output)
        return

    # Exactly one expression source must be provided.
    if (args.hdf5_expression_folder and args.expression_folder) or \
        (not args.hdf5_expression_folder and not args.expression_folder):
        logging.info(
            "Provide either hdf5 expression folder or plain text expression folder"
        )
        return

    with MultiPrediXcanUtilities.mp_context_from_args(args) as context:
        genes = context.get_genes()
        n_genes = len(genes)
        reporter = Utilities.PercentReporter(logging.INFO, n_genes)
        reporter.update(0,
                        "%d %% of model's genes processed so far",
                        force=True)

        results = []
        callbacks = {}
        if args.coefficient_output:
            callbacks["coefficient"] = MultiPrediXcanAssociation.SaveCoefs()
        if args.loadings_output:
            callbacks["loadings"] = MultiPrediXcanAssociation.SaveLoadings()

        for i, gene in enumerate(genes):
            logging.log(7, "Processing gene %i/%i: %s", i + 1, n_genes, gene)
            r = MultiPrediXcanAssociation.multi_predixcan_association(
                gene, context, callbacks.values())
            results.append(r)
            reporter.update(i, "%d %% of model's genes processed so far")
        # FIX: the final report used the loop variable `i`, which is one short
        # of n_genes and undefined (NameError) when `genes` is empty.
        reporter.update(n_genes, "%d %% of model's genes processed so far",
                        force=True)
        results = MultiPrediXcanAssociation.dataframe_from_results(
            results, context)
        results = results.fillna("NA")
        results = results.sort_values(by="pvalue")

        Utilities.save_dataframe(results, args.output)
        if args.coefficient_output:
            Utilities.save_dataframe(callbacks["coefficient"].get(),
                                     args.coefficient_output)
        if args.loadings_output:
            Utilities.save_dataframe(callbacks["loadings"].get(),
                                     args.loadings_output)

    end = timer()
    logging.info("Ran multi tissue predixcan in %s seconds" %
                 (str(end - start)))
예제 #39
0
    def __init__(self, args):
        """Capture dosage filtering/conversion settings from parsed args."""
        self.dosage_folder = args.dosage_folder
        self.snp_list = args.snp_list
        self.output_folder = args.output_folder
        self.input_format = args.input_format
        self.output_format = args.output_format
        self.population_group_filters = args.population_group_filters
        # Pre-compile individual filters so they can be applied repeatedly.
        self.individual_filters = list(map(re.compile, args.individual_filters))

        self.chromosome_in_name_regex = re.compile(args.file_pattern)

        self.samples_input = Utilities.samplesInputPath(self.dosage_folder)
        self.samples_output = os.path.join(
            self.output_folder, os.path.split(self.samples_input)[1])
예제 #40
0
    def run(self):
        """Build the variance DB from phased data files.

        Does nothing when the correlations output already exists.
        """
        # FIX: check for existing output before the expensive weight-db load
        # and folder scan, which the original did first and then discarded.
        if os.path.exists(self.output_file):
            logging.info("Correlations output already exists, delete it if you want stuff to be figured out again")
            return

        weight_db_logic = WeightDBUtilities.WeightDBEntryLogic(self.weight_db) if self.weight_db else None
        contents = Utilities.contentsWithPatternsFromFolder(self.data_folder_phase, ["gz"])

        # `dir` renamed: it shadowed the builtin.
        output_dir = os.path.dirname(self.output_file)
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        for content in contents:
            self.buildVarianceDB(weight_db_logic, content)
예제 #41
0
def run(args):
    """Build per-gene SNP covariances and stream them to a gzipped TSV.

    Output columns: GENE, RSID1, RSID2, VALUE. Skips the whole run when the
    output file already exists.

    NOTE(review): gzip.open(..., "w") with str writes implies Python 2 text
    semantics; under Python 3 this would need mode "wt" or byte strings —
    confirm target interpreter.
    """
    if os.path.exists(args.snp_covariance_output):
        logging.info("%s already exists, you have to move it or delete it if you want it done again", args.snp_covariance_output)
        return

    start = timer()

    logging.info("Loading models...")
    model_manager = PredictionModel.load_model_manager(args.models_folder, name_pattern=args.models_pattern, name_filter=args.models_filter)
    all_snps = model_manager.get_rsids()
    Utilities.ensure_requisite_folders(args.snp_covariance_output)
    with gzip.open(args.snp_covariance_output, "w") as o:
        o.write("GENE\tRSID1\tRSID2\tVALUE\n")
        logging.info("processing genotype")

        for chromosome, metadata, dosage in GenotypeUtilities.genotype_by_chromosome_from_args(args, all_snps):
            logging.log(9, "Processing chromosome %s", str(chromosome))

            context = GenotypeAnalysis.GenotypeAnalysisContext(metadata, dosage, model_manager)
            genes = context.get_genes()
            reporter = Utilities.PercentReporter(9, len(genes))
            reporter.update(0, "%d %% of genes processed so far in chromosome " + str(chromosome))
            for i,gene in enumerate(genes):
                logging.log(6, "%d/%d:%s", i+1, len(genes), gene)
                cov_data = GenotypeAnalysis.get_prediction_covariance(context, gene)
                # Flatten the (gene, rsids, matrix) structure into row tuples
                # and write them immediately instead of accumulating in memory.
                cov_data = MatrixManager._flatten_matrix_data([cov_data])
                for e in cov_data:
                    l = "{}\t{}\t{}\t{}\n".format(e[0], e[1], e[2], e[3])
                    o.write(l)

                reporter.update(i, "%d %% of genes processed so far in chromosome "+str(chromosome))

            reporter.update(len(genes), "%d %% of genes processed so far in chromosome " + str(chromosome))

    end = timer()
    logging.info("Ran covariance builder in %s seconds" % (str(end - start)))
예제 #42
0
    def __init__(self, args):
        """Store dosage-filtering configuration taken from *args*."""
        self.dosage_folder = args.dosage_folder
        self.snp_list = args.snp_list
        self.output_folder = args.output_folder
        self.input_format = args.input_format
        self.output_format = args.output_format
        self.population_group_filters = args.population_group_filters
        # Compile each individual filter expression up front.
        compiled = []
        for expression in args.individual_filters:
            compiled.append(re.compile(expression))
        self.individual_filters = compiled

        self.chromosome_in_name_regex = re.compile(args.file_pattern)

        self.samples_input = Utilities.samplesInputPath(self.dosage_folder)
        samples_name = os.path.split(self.samples_input)[1]
        self.samples_output = os.path.join(self.output_folder, samples_name)
예제 #43
0
    def test_gtex_geno_lines_generator(self):
        """Cross-check the geno line generator against a pandas merge of the raw files."""
        generated = GTExGenotype.gtex_geno_lines("tests/_td/genotype/gtex_like.txt.gz", "tests/_td/genotype/gtex_snp.txt.gz")
        data = [line for line in generated]

        header = GTExGenotype.gtex_geno_header("tests/_td/genotype/gtex_like.txt.gz")
        gtex_ids = header[1:]
        columns = ["rsid", "chromosome", "position", "ref_allele", "alt_allele", "frequency"] + gtex_ids
        dataframe = Utilities.to_dataframe(data, columns, to_numeric="ignore")

        # Expected: the raw genotype table joined with the snp annotation.
        gtex_snp = pandas.read_table("tests/_td/genotype/gtex_snp.txt.gz")
        dataframe_2 = pandas.read_table("tests/_td/genotype/gtex_like.txt.gz")
        dataframe_2 = pandas.merge(dataframe_2, gtex_snp, left_on="Id", right_on="VariantID")

        compare_data_frames(dataframe, dataframe_2, gtex_ids)
예제 #44
0
    def run(self):
        """Build betas for every GWAS file found in the configured folder."""
        logging.info("Loading weight model")
        weight_db_logic = WeightDBUtilities.WeightDBEntryLogic(self.weight_db_path)

        names = Utilities.contentsWithRegexpFromFolder(self.gwas_folder, self.gwas_regexp)

        if not os.path.exists(self.output_folder):
            os.makedirs(self.output_folder)

        for name in names:
            try:
                self.buildBetas(weight_db_logic, name)
            except Exceptions.BadFilename as e:
                # Files that don't match the expected naming scheme are just
                # extra clutter in the directory; skip them.
                logging.info("Wrong file name: %s, skipping", e.msg)
예제 #45
0
def run(args):
    """Parse GWAS input into beta files or a single combined dataframe.

    With ``args.output_folder`` set, writes one gzipped beta file per GWAS
    input and returns None; otherwise returns all per-file results
    concatenated into one dataframe.

    Raises Exceptions.ReportableException when a folder is given but no
    files match the pattern.
    """
    start = timer()
    validate(args)

    if args.gwas_folder:
        regexp = re.compile(args.gwas_file_pattern) if args.gwas_file_pattern else None
        names = Utilities.contentsWithRegexpFromFolder(args.gwas_folder, regexp)
        names.sort() #cosmetic, because different filesystems/OS yield folders in different order

        if len(names) == 0:
            msg = "No GWAS files found on %s with pattern %s" % (args.gwas_folder, args.gwas_file_pattern,)
            raise Exceptions.ReportableException(msg)
    else:
        names = [args.gwas_file]

    gwas_format = GWASUtilities.gwas_format_from_args(args)
    GWAS.validate_format_basic(gwas_format)
    GWAS.validate_format_for_strict(gwas_format)
    model = PredictionModel.load_model(args.model_db_path, args.model_db_snp_key) if args.model_db_path else None

    if args.output_folder:
        if not os.path.exists(args.output_folder):
            os.makedirs(args.output_folder)

        for name in names:
            output_path = os.path.join(args.output_folder, name)
            if ".gz" not in output_path:
                output_path += ".gz"
            if os.path.exists(output_path):
                logging.info("%s already exists, delete it if you want it to be done again", output_path)
                continue

            b = build_betas(args, model, gwas_format, name)
            c = "gzip" if ".gz" in name else None
            b.to_csv(output_path, sep="\t", index=False, compression=c)
        end = timer()
        logging.info("Successfully ran GWAS input processing in %s seconds" %(str(end - start)))
    else:
        # FIX: accumulate frames and concatenate once; repeated pandas.concat
        # in a loop is quadratic in the number of files.
        frames = [build_betas(args, model, gwas_format, name) for name in names]
        r = pandas.concat(frames) if frames else pandas.DataFrame()
        end = timer()
        logging.info("Successfully parsed input gwas in %s seconds"%(str(end-start)))

        return r
예제 #46
0
    def run(self):
        """Build the variance DB from phased input files.

        Does nothing when the variance output already exists.
        """
        # FIX: do the cheap existence check before loading the weight db and
        # scanning the folder; the original loaded both and then returned.
        if os.path.exists(self.output_file):
            logging.info(
                "Variance output already exists, delete it if you want stuff to be figured out again"
            )
            return

        weight_db_logic = WeightDBUtilities.WeightDBEntryLogic(
            self.weight_db) if self.weight_db else None
        contents = Utilities.contentsWithPatternsFromFolder(
            self.data_folder_phase, ["gz"])

        # `dir` renamed: it shadowed the builtin.
        output_dir = os.path.dirname(self.output_file)
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        for content in contents:
            self.buildVarianceDB(weight_db_logic, content)
예제 #47
0
    def run(self):
        """Compute per-gene covariance diagnostics and write them as CSV.

        For each gene in the covariance file: snp count, w'Γw, det(Γ), w'w,
        mean variance, min/max eigenvalues, and the number of eigenvalues
        below 1e-7 (near-singular directions).
        """
        logging.info("Loading weight db")
        weight_db_logic = WeightDBUtilities.WeightDBEntryLogic(self.weight_db_path)

        logging.info("Loading covariance file")
        # `file`/`file` renamed throughout: they shadowed the builtin.
        covariance_name = Utilities.contentsWithPatternsFromFolder(self.folder_covariance, [".gz"])[0]
        path = os.path.join(self.folder_covariance, covariance_name)
        covariance_contents = MatrixUtilities.loadMatrixFromFile(path)

        logging.info("Getting stats")
        results = []
        for gene, entry in covariance_contents.iteritems():
            covariance_matrix = entry[0]
            valid_rsids = entry[1]

            weights = weight_db_logic.weights_by_gene_name[gene]
            weight_values, variances = ZScoreCalculation.preProcess(covariance_matrix, valid_rsids, weights)

            w_w = numpy.dot(numpy.transpose(weight_values), weight_values)
            dot_product = numpy.dot(numpy.dot(numpy.transpose(weight_values), covariance_matrix), weight_values)
            det = numpy.linalg.det(covariance_matrix)

            eigenvalues, eigenvectors = numpy.linalg.eigh(covariance_matrix)
            eigenmax = numpy.amax(eigenvalues)
            eigenmin = numpy.amin(eigenvalues)
            # Count near-singular directions of the covariance matrix.
            n_small = sum(1 for eigen in eigenvalues if eigen < 1e-7)
            diag = covariance_matrix.diagonal()
            mean_var = numpy.mean(diag)

            line = (gene, str(len(weight_values)), str(float(dot_product)), str(float(det)), str(float(w_w)), str(float(mean_var)), str(float(eigenmin)), str(float(eigenmax)), str(n_small))
            results.append(line)

        logging.info("saving results")
        with open(self.output_file, "w") as output:
            header = ",".join(["gene", "m_snp_count", "w_gamma_w", "det", "w_w", "mean_var", "eigenmin", "eigenmax", "n_eigen_e-7"])+"\n"
            output.write(header)
            for line in results:
                output.write(",".join(line)+"\n")
예제 #48
0
def model_structure(args):
    """Load a prediction model and index its weights by rsid.

    Returns (m, weights, extra) where ``m`` maps rsid ->
    (non_effect_allele, effect_allele, {gene: weight}). Optionally slices
    the model into sub-batches and/or restricts it to ``args.only_entries``.
    """
    model = PredictionModel.load_model(args.model_db_path,
                                       args.model_db_snp_key)
    m = {}
    weights, extra = model.weights, model.extra
    if args.sub_batches is not None and args.sub_batch is not None:
        logging.info("slicing models")
        extra = Utilities.sub_batch(extra, args.sub_batches, args.sub_batch)
        weights = weights[weights.gene.isin(extra.gene)].reset_index(drop=True)

    if args.only_entries:
        # FIX: build the filter set once instead of materializing it twice.
        only = set(args.only_entries)
        extra = extra[extra.gene.isin(only)]
        weights = weights[weights.gene.isin(only)]

    for i in weights.itertuples():
        if i.rsid not in m:
            m[i.rsid] = (i.non_effect_allele, i.effect_allele, {})
        m[i.rsid][2][i.gene] = i.weight
    return m, weights, extra
예제 #49
0
    def run(self):
        """Process every GWAS file in the folder through buildBetas."""
        logging.info("Loading weight model")
        weight_db_logic = WeightDBUtilities.WeightDBEntryLogic(
            self.weight_db_path)

        gwas_names = Utilities.contentsWithRegexpFromFolder(self.gwas_folder,
                                                            self.gwas_regexp)

        if not os.path.exists(self.output_folder):
            os.makedirs(self.output_folder)

        for gwas_name in gwas_names:
            try:
                self.buildBetas(weight_db_logic, gwas_name)
            except Exceptions.BadFilename as e:
                # Extra files that don't match the expected naming scheme are
                # expected in the folder; just skip them.
                logging.info("Wrong file name: %s, skipping", e.msg)
예제 #50
0
    def processPrediXcanFiles(self):
        logging.info("Loading people")
        all_people = Person.Person.loadPeople(self.samples_input, '\t', False)
        selected_people = Person.Person.loadPeople(self.samples_output)
        selected_people_by_id = {p.id:p for p in selected_people}
        logging.info("%d total people, %d selected", len(all_people), len(selected_people_by_id))

        logging.info("Loading snps")
        snp_data_set = DataSet.DataSetFileUtilities.loadFromCompressedFile(self.snp_list)
        snp_dict = {k:True for k in snp_data_set.data}
        print len(snp_dict.keys())

        contents = Utilities.contentsWithPatternsFromFolder(self.dosage_folder, ["dosage.txt.gz"])
        for content_name in contents:
            input_path = os.path.join(self.dosage_folder, content_name)
            fileBuilder = PrediXcanFormatUtilities.PrediXcanFormatFilteredFilesProcess(input_path, self.output_folder, content_name, all_people, selected_people_by_id, snp_dict)
            if self.output_format == Formats.IMPUTE:
                fileBuilder.buildIMPUTE()
            if self.output_format == Formats.PrediXcan:
                fileBuilder.buildPrediXcan()
            else:
                raise Exceptions.InvalidOutputFormat(self.output_format)
예제 #51
0
    def load(self):
        """Parse the PrediXcan dosage file at self.path into SNP data sets.

        Returns (snps, snps_by_rsid): the ordered list of DataSetSNP objects
        and a dict keyed by rsid. When a weight db is configured, rows whose
        rsid is not in the model are skipped; rows with non-SNP alleles are
        skipped as well.

        NOTE(review): Python 2 code — `map(float, ...)` must yield a list for
        the dosage data; under Python 3 this would be a lazy map object.
        """
        #print "INFO: Loading dosage files"
        #logging.info("Loading %s dosage", self.path)
        class PrediXcanCollector(object):
            # Row-callback used by CSVFileIterator; accumulates parsed snps.
            def __init__(self, snps=[], snps_by_rsid={}, weight_db_logic=None):
                self.snps = snps
                self.snps_by_rsid = snps_by_rsid
                self.weight_db_logic = weight_db_logic

            def __call__(self, i, components):
                rsid = components[PDTF.RSID]
                # Skip rows not present in the weight model, when one is given.
                if self.weight_db_logic and not rsid in self.weight_db_logic.genes_for_an_rsid:
                    logging.log(5, "rsid %s not in weight db, skip it", rsid)
                    return

                position = components[PDTF.POSITION]

                # Both alleles must be valid single-nucleotide values.
                ref_allele = components[PDTF.ALLELE_0]
                if not ref_allele in Utilities.VALID_ALLELES:
                    logging.log(9, "wrong ref allele, rsid %s is not an SNP", rsid)
                    return
                eff_allele = components[PDTF.ALLELE_1]
                if not eff_allele in Utilities.VALID_ALLELES:
                    logging.log(9, "wrong eff allele, rsid %s is not an SNP", rsid)
                    return
                dosages = map(float,components[PDTF.FIRST_DATA_COLUMN:]) #dosages may be inputed
                #Should we flip based on weight_db at this point?

                snp = DataSetSNP.DataSetSNP(name=rsid, index=i, data=dosages, position=int(position), ref_allele=ref_allele, eff_allele=eff_allele)
                # Duplicates are logged but the later row still overwrites the dict entry.
                if snp.name in self.snps_by_rsid:
                    old = self.snps_by_rsid[snp.name]
                    logging.info("Duplicated rsid: (%s,%s) %s", old.name, old.position, " ".join(components))
                self.snps.append(snp)
                self.snps_by_rsid[snp.name] = snp
        loader = Utilities.CSVFileIterator(self.path, compressed=True)
        collector = PrediXcanCollector(weight_db_logic=self.weight_db_logic)
        loader.iterate(collector)
        return collector.snps, collector.snps_by_rsid
예제 #52
0
    def run(self):
        """Build betas for each GWAS file, optionally using a weight model.

        Raises Exceptions.ReportableException when no files match the
        configured pattern.
        """
        weight_db_logic = None
        if self.args.weight_db_path:
            logging.info("Loading weight model")
            weight_db_logic = WeightDBUtilities.WeightDBEntryLogic(self.weight_db_path)

        names = Utilities.contentsWithRegexpFromFolder(self.gwas_folder, self.gwas_regexp)

        if not os.path.exists(self.output_folder):
            os.makedirs(self.output_folder)

        if not names:
            raise Exceptions.ReportableException("No GWAS files found on %s with pattern %s" %(self.gwas_folder, self.gwas_regexp.pattern,))

        for name in names:
            try:
                self.buildBetas(weight_db_logic, name)
            except Exceptions.BadFilename as e:
                # Unrelated files in the folder are expected; skip them.
                logging.info("Wrong file name: %s, skipping", e.msg)
예제 #53
0
    def buildBetas(self, db_filename):
        """Run the beta-building and z-score pipeline for one model db file.

        Derives covariance and output paths from the db file name, mutates
        self.args in place with those paths, builds betas for every GWAS
        file, then runs the z-score calculation over the produced betas.

        NOTE(review): self.args is mutated (weight_db_path, covariance,
        output_file) — repeated calls rely on these being overwritten each
        time; confirm no caller depends on the original values.
        """
        filebase = os.path.basename(db_filename).replace(".db", "")
        output_folder = self.args.output_folder
        logging.info("Processing betas for %s" % (db_filename))
        self.args.weight_db_path = os.path.abspath(db_filename)
        # NOTE(review): covariance path uses covariance_directory but output
        # uses output_directory — looks intentional, verify both args exist.
        self.args.covariance = os.path.join(self.args.covariance_directory, filebase) + ".cov.txt.gz"
        self.args.output_file = os.path.join(self.args.output_directory, filebase) + ".csv"

        logging.info("Loading weight model")
        weight_db_logic = WeightDBUtilities.WeightDBEntryLogic(self.args.weight_db_path)

        betaScript = M03_betas.GetBetas(self.args)
        names = Utilities.contentsWithRegexpFromFolder(self.args.gwas_folder, betaScript.gwas_regexp)

        if not os.path.exists(self.args.output_folder):
            os.makedirs(self.args.output_folder)
        # Each model db gets its own sub-folder of beta outputs.
        betaScript.output_folder = os.path.join(output_folder, filebase)
        if not os.path.exists(betaScript.output_folder):
            os.makedirs(betaScript.output_folder)

        for name in names:
            try:
                betaScript.buildBetas(weight_db_logic,name)


            # This just means that there is some extra stuff inside that directory,
            # so I'm thinking we want to ignore it.
            except Exceptions.BadFilename as e:
                logging.info("Wrong file name: %s, skipping", e.msg)
                pass

        # ZScores
        logging.info("Calculating ZScores for %s" % (filebase))
        zscoreScript = M04_zscores.CalculateZScores(self.args)
        zscoreScript.folder_beta = betaScript.output_folder
        zscoreScript.run()
예제 #54
0
 def testSamplesInputPath(self):
     """samplesInputPath should locate the .sample file inside the folder."""
     found = Utilities.samplesInputPath("tests/_td/dosage_set_1")
     self.assertEqual(found, "tests/_td/dosage_set_1/set.sample")
예제 #55
0
 def testContentsWithRegexpFromFolder(self):
     """Regexp listing should return only the matching sample file."""
     matched = Utilities.contentsWithRegexpFromFolder("tests/_td/dosage_set_1", re.compile(".*sample"))
     self.assertEqual(matched, ["set.sample"])
예제 #56
0
 def testNamesWithPatternFromFolders(self):
     """Pattern listing should strip the extension and return base names."""
     found = Utilities.namesWithPatternFromFolder("tests/_td/dosage_set_1/", ".sample")
     self.assertEqual(found, ["set"])