def testContentsWithPatternsFromFolders(self):
    """Check `Utilities.contentsWithPatternsFromFolder` on the dosage fixture folder.

    The fixture is expected to yield nothing for the patterns
    ["sample", "Fail"] and exactly "set.sample" for ["set", "sample"].
    """
    folder = "tests/_td/dosage_set_1"

    # Order and duplicates are irrelevant to this test, so compare as sets.
    unmatched = Utilities.contentsWithPatternsFromFolder(folder, ["sample", "Fail"])
    self.assertEqual(set(unmatched), set())

    matched = Utilities.contentsWithPatternsFromFolder(folder, ["set", "sample"])
    self.assertEqual(set(matched), {"set.sample"})
def run(args):
    """Load weights, covariances and betas, then optionally inspect a single gene.

    Args:
        args: object exposing `weight_db_path`, `covariance`, `beta_folder`,
            `gene_name` and `interactive`.

    When `args.gene_name` is set, the gene's data is fetched through
    `get_gene_data` and pre-processed. `embed()` is called afterwards if
    `args.interactive` is truthy, and also whenever fetching or pre-processing
    fails (presumably an interactive shell -- confirm against the file's
    imports).
    """
    logging.info("Loading weight db")
    weight_db_logic = WeightDBUtilities.WeightDBEntryLogic(args.weight_db_path)

    logging.info("Loading covariance file")
    covariance_contents = MatrixUtilities.loadMatrixFromFile(args.covariance)

    logging.info("Choosing method")
    beta_contents = Utilities.contentsWithPatternsFromFolder(args.beta_folder, [])
    zscore_calculation, normalization = MethodGuessing.chooseZscoreSchemeFromFiles(
        args.beta_folder, beta_contents, covariance_contents, weight_db_logic)

    logging.info("Processing")
    # betas: file name -> {data set name -> data set}
    betas = {}
    for content in beta_contents:
        logging.info("Loading betas")
        beta_path = os.path.join(args.beta_folder, content)
        beta_sets = KeyedDataSet.KeyedDataSetFileUtilities.loadDataSetsFromCompressedFile(
            beta_path, header="")
        # Index by name; the loop variable no longer shadows the builtin `set`.
        beta_sets = {data_set.name: data_set for data_set in beta_sets}
        betas[content] = beta_sets

    if args.gene_name:
        # NOTE: the seemingly unused locals below (and `e`) are kept on purpose:
        # embed() is called from this scope, so they are what the user inspects.
        try:
            gene_data, weights, covariance_matrix, valid_rsids, beta_sets = get_gene_data(
                args.gene_name, weight_db_logic, covariance_contents, betas)
            weight_values, variances = ZScoreCalculation.preProcess(
                covariance_matrix, valid_rsids, weights, beta_sets)
            if args.interactive:
                embed()
            logging.info("Processed gene data")
        except Exception as e:
            # Record the failure with its traceback instead of discarding it.
            logging.exception("Couldn't get gene data")
            embed()
def processPrediXcanFiles(self):
    """Convert every dosage file in `self.dosage_folder` to the requested output format.

    Reads the people listed in `self.samples_input` and `self.samples_output`
    and the snps in `self.snp_list`, then runs one
    `PrediXcanFormatFilteredFilesProcess` per "dosage.txt.gz" entry found in
    the dosage folder.

    Raises:
        Exceptions.InvalidOutputFormat: if `self.output_format` is neither
            `Formats.IMPUTE` nor `Formats.PrediXcan`. It is raised while
            handling the first dosage file, so an empty folder never raises.
    """
    logging.info("Loading people")
    all_people = Person.Person.loadPeople(self.samples_input, '\t', False)
    selected_people = Person.Person.loadPeople(self.samples_output)
    selected_people_by_id = {p.id: p for p in selected_people}
    logging.info("%d total people, %d selected", len(all_people),
                 len(selected_people_by_id))

    logging.info("Loading snps")
    snp_data_set = DataSet.DataSetFileUtilities.loadFromCompressedFile(
        self.snp_list)
    # Dict used as a membership lookup for the snps of interest.
    snp_dict = {k: True for k in snp_data_set.data}
    # Parenthesised form prints the same text under Python 2 and Python 3.
    print(len(snp_dict))

    contents = Utilities.contentsWithPatternsFromFolder(
        self.dosage_folder, ["dosage.txt.gz"])
    for content_name in contents:
        input_path = os.path.join(self.dosage_folder, content_name)
        fileBuilder = PrediXcanFormatUtilities.PrediXcanFormatFilteredFilesProcess(
            input_path, self.output_folder, content_name, all_people,
            selected_people_by_id, snp_dict)
        # The branches are mutually exclusive. With two independent `if`s the
        # IMPUTE case fell through to the `else` and raised after building.
        if self.output_format == Formats.IMPUTE:
            fileBuilder.buildIMPUTE()
        elif self.output_format == Formats.PrediXcan:
            fileBuilder.buildPrediXcan()
        else:
            raise Exceptions.InvalidOutputFormat(self.output_format)
def resultsFromCovarianceFile(self, weight_db_logic):
    """Build per-gene results from the covariance file and the beta files.

    Args:
        weight_db_logic: object exposing `weights_by_gene` and
            `genes_for_an_rsid`.

    Returns:
        A `(results, normalization_constant)` tuple: `results` maps each gene
        to the entry produced by `self.buildEntry`, and the constant comes
        from `normalization.calculateNormalization()`.
    """
    logging.info("Loading covariance file from %s", self.covariance)
    loaded_covariances = MatrixUtilities.loadMatrixFromFile(self.covariance)
    # Keep only covariances present in gene models
    covariance_contents = {
        gene: entry
        for gene, entry in loaded_covariances.iteritems()
        if gene in weight_db_logic.weights_by_gene
    }

    beta_contents = Utilities.contentsWithPatternsFromFolder(self.folder_beta, [])
    zscore_calculation, normalization = self.selectMethod(
        self.folder_beta, beta_contents, covariance_contents, weight_db_logic)

    # Progress is reported as the share of the model's rsids seen so far.
    reporter = Utilities.PercentReporter(
        logging.INFO, len(weight_db_logic.genes_for_an_rsid))
    snps_found = set()
    results = {}

    for beta_name in beta_contents:
        logging.info("Processing %s", beta_name)
        beta_path = os.path.join(self.folder_beta, beta_name)
        loaded_sets = KeyedDataSet.KeyedDataSetFileUtilities.loadDataSetsFromCompressedFile(
            beta_path, header="")
        beta_sets = {data_set.name: data_set for data_set in loaded_sets}
        # Any one data set serves to test which rsids this file has.
        unused_name, check = beta_sets.iteritems().next()
        normalization.update(beta_sets)

        for gene, entry in covariance_contents.iteritems():
            # So, new covariance files might actually have more genes than
            # those in the database
            if gene not in weight_db_logic.weights_by_gene:
                logging.log(8, "Gene %s not in weights", gene)
                continue

            weights = weight_db_logic.weights_by_gene[gene]
            present = [
                rsid
                for rsid, unused_weight in weights.iteritems()
                if rsid in check.values_by_key
            ]
            if not present:
                logging.log(5, "No rsid in beta file for %s", gene)
                continue

            if gene in results:
                logging.info("Gene %s already processed", gene)
                continue

            covariance_matrix = entry[0]
            valid_rsids = entry[1]
            logging.log(7, "Calculating z score for %s", gene)
            pre_zscore, n, VAR_g, effect_size = zscore_calculation(
                gene, weights, beta_sets, covariance_matrix, valid_rsids)
            results[gene] = self.buildEntry(
                gene, weight_db_logic, weights, pre_zscore, n, VAR_g, effect_size)

            snps_found.update(present)
            reporter.update(
                len(snps_found),
                "%d %% of model's snps found so far in the gwas study")

    # second pass, for genes not in any beta file
    self.fillBlanks(results, covariance_contents, weight_db_logic, zscore_calculation)
    normalization_constant = normalization.calculateNormalization()
    return results, normalization_constant
def resultsFromCovarianceFile(self, weight_db_logic):
    """Build per-gene results from the covariance file and the beta files.

    Args:
        weight_db_logic: object exposing `weights_by_gene`.

    Returns:
        A `(results, normalization_constant)` tuple: `results` maps each gene
        to the entry produced by `self.buildEntry`, and the constant comes
        from `normalization.calculateNormalization()`.
    """
    logging.info("Loading covariance file")
    covariance_contents = MatrixUtilities.loadMatrixFromFile(self.covariance)
    beta_contents = Utilities.contentsWithPatternsFromFolder(self.folder_beta, [])
    zscore_calculation, normalization = self.selectMethod(
        self.folder_beta, beta_contents, covariance_contents, weight_db_logic)

    reporter = Utilities.PercentReporter(logging.INFO, len(covariance_contents))
    results = {}
    genes_done = 0

    for beta_name in beta_contents:
        logging.info("Processing %s", beta_name)
        beta_path = os.path.join(self.folder_beta, beta_name)
        loaded_sets = KeyedDataSet.KeyedDataSetFileUtilities.loadDataSetsFromCompressedFile(
            beta_path, header="")
        beta_sets = {data_set.name: data_set for data_set in loaded_sets}
        # Any one data set serves to test which rsids this file has.
        unused_name, check = beta_sets.iteritems().next()
        normalization.update(beta_sets)

        for gene, entry in covariance_contents.iteritems():
            weights = weight_db_logic.weights_by_gene[gene]

            # Stops at the first rsid of the gene that the beta file knows about.
            has_rsid = any(
                rsid in check.values_by_key
                for rsid, unused_weight in weights.iteritems())
            if not has_rsid:
                logging.log(5, "No rsid in beta file for %s", gene)
                continue

            if gene in results:
                logging.info("Gene %s already processed", gene)
                continue

            # Progress is proxied by the number of genes processed so far.
            reporter.update(
                genes_done,
                "%d %% of model's snp information found so far in the gwas study")

            covariance_matrix = entry[0]
            valid_rsids = entry[1]
            logging.log(7, "Calculating z score for %s", gene)
            pre_zscore, n, VAR_g = zscore_calculation(
                gene, weights, beta_sets, covariance_matrix, valid_rsids)
            results[gene] = self.buildEntry(
                gene, weight_db_logic, weights, pre_zscore, n, VAR_g)
            genes_done += 1

    # second pass, for genes not in any beta file
    self.fillBlanks(results, covariance_contents, weight_db_logic, zscore_calculation)
    normalization_constant = normalization.calculateNormalization()
    return results, normalization_constant
def run(self):
    """Build the variance DB for every "gz" entry in `self.data_folder_phase`.

    Returns immediately (after logging) when `self.output_file` already
    exists. Otherwise creates the output file's parent directory if needed
    and calls `self.buildVarianceDB` once per matching folder entry.
    `self.weight_db` is optional: when falsy, `None` is passed as the weight
    db logic.
    """
    # Check first: nothing else is needed when the output is already there.
    if os.path.exists(self.output_file):
        logging.info("Correlations output already exists, delete it if you want stuff to be figured out again")
        return

    weight_db_logic = WeightDBUtilities.WeightDBEntryLogic(self.weight_db) if self.weight_db else None
    contents = Utilities.contentsWithPatternsFromFolder(self.data_folder_phase, ["gz"])

    # dirname() is "" for a bare file name, and os.makedirs("") raises, so
    # only create a directory when there is one to create.
    output_dir = os.path.dirname(self.output_file)
    if output_dir and not os.path.exists(output_dir):
        os.makedirs(output_dir)

    for content in contents:
        self.buildVarianceDB(weight_db_logic, content)
def run(self):
    """Build the variance DB for every "gz" entry in `self.data_folder_phase`.

    Returns immediately (after logging) when `self.output_file` already
    exists. Otherwise creates the output file's parent directory if needed
    and calls `self.buildVarianceDB` once per matching folder entry.
    `self.weight_db` is optional: when falsy, `None` is passed as the weight
    db logic.
    """
    # Check first: nothing else is needed when the output is already there.
    if os.path.exists(self.output_file):
        logging.info(
            "Variance output already exists, delete it if you want stuff to be figured out again"
        )
        return

    weight_db_logic = WeightDBUtilities.WeightDBEntryLogic(
        self.weight_db) if self.weight_db else None
    contents = Utilities.contentsWithPatternsFromFolder(
        self.data_folder_phase, ["gz"])

    # dirname() is "" for a bare file name, and os.makedirs("") raises, so
    # only create a directory when there is one to create.
    output_dir = os.path.dirname(self.output_file)
    if output_dir and not os.path.exists(output_dir):
        os.makedirs(output_dir)

    for content in contents:
        self.buildVarianceDB(weight_db_logic, content)
def run(self):
    """Write per-gene covariance statistics to `self.output_file` as CSV.

    Uses the first ".gz" entry of `self.folder_covariance` as the covariance
    file and the weights from `self.weight_db_path`. For every gene in the
    covariance file one row is written with: snp count, w' * cov * w,
    determinant, w' * w, mean of the covariance diagonal, smallest and
    largest eigenvalue, and the number of eigenvalues below 1e-7.
    """
    logging.info("Loading weight db")
    weight_db_logic = WeightDBUtilities.WeightDBEntryLogic(self.weight_db_path)

    logging.info("Loading covariance file")
    # Only the first matching entry of the folder is used.
    covariance_name = Utilities.contentsWithPatternsFromFolder(
        self.folder_covariance, [".gz"])[0]
    covariance_path = os.path.join(self.folder_covariance, covariance_name)
    covariance_contents = MatrixUtilities.loadMatrixFromFile(covariance_path)

    logging.info("Getting stats")
    results = []
    for gene, entry in covariance_contents.iteritems():
        covariance_matrix = entry[0]
        valid_rsids = entry[1]
        weights = weight_db_logic.weights_by_gene_name[gene]
        weight_values, unused_variances = ZScoreCalculation.preProcess(
            covariance_matrix, valid_rsids, weights)

        weights_t = numpy.transpose(weight_values)
        w_w = numpy.dot(weights_t, weight_values)
        dot_product = numpy.dot(
            numpy.dot(numpy.transpose(weight_values), covariance_matrix),
            weight_values)
        det = numpy.linalg.det(covariance_matrix)

        eigenvalues, unused_eigenvectors = numpy.linalg.eigh(covariance_matrix)
        eigenmax = numpy.amax(eigenvalues)
        eigenmin = numpy.amin(eigenvalues)
        n_small = sum(1 for eigen in eigenvalues if eigen < 1e-7)

        mean_var = numpy.mean(covariance_matrix.diagonal())

        # Field order must match the header written below.
        row = (
            gene,
            str(len(weight_values)),
            str(float(dot_product)),
            str(float(det)),
            str(float(w_w)),
            str(float(mean_var)),
            str(float(eigenmin)),
            str(float(eigenmax)),
            str(n_small),
        )
        results.append(row)

    logging.info("saving results")
    columns = ["gene", "m_snp_count", "w_gamma_w", "det", "w_w", "mean_var",
               "eigenmin", "eigenmax", "n_eigen_e-7"]
    with open(self.output_file, "w") as output:
        output.write(",".join(columns) + "\n")
        for row in results:
            output.write(",".join(row) + "\n")
def processPrediXcanFiles(self):
    """Convert every dosage file in `self.dosage_folder` to the requested output format.

    Reads the people listed in `self.samples_input` and `self.samples_output`
    and the snps in `self.snp_list`, then runs one
    `PrediXcanFormatFilteredFilesProcess` per "dosage.txt.gz" entry found in
    the dosage folder.

    Raises:
        Exceptions.InvalidOutputFormat: if `self.output_format` is neither
            `Formats.IMPUTE` nor `Formats.PrediXcan`. It is raised while
            handling the first dosage file, so an empty folder never raises.
    """
    logging.info("Loading people")
    all_people = Person.Person.loadPeople(self.samples_input, '\t', False)
    selected_people = Person.Person.loadPeople(self.samples_output)
    selected_people_by_id = {p.id:p for p in selected_people}
    logging.info("%d total people, %d selected", len(all_people), len(selected_people_by_id))

    logging.info("Loading snps")
    snp_data_set = DataSet.DataSetFileUtilities.loadFromCompressedFile(self.snp_list)
    # Dict used as a membership lookup for the snps of interest.
    snp_dict = {k:True for k in snp_data_set.data}
    # Parenthesised form prints the same text under Python 2 and Python 3.
    print(len(snp_dict))

    contents = Utilities.contentsWithPatternsFromFolder(self.dosage_folder, ["dosage.txt.gz"])
    for content_name in contents:
        input_path = os.path.join(self.dosage_folder, content_name)
        fileBuilder = PrediXcanFormatUtilities.PrediXcanFormatFilteredFilesProcess(input_path, self.output_folder, content_name, all_people, selected_people_by_id, snp_dict)
        # The branches are mutually exclusive. With two independent `if`s the
        # IMPUTE case fell through to the `else` and raised after building.
        if self.output_format == Formats.IMPUTE:
            fileBuilder.buildIMPUTE()
        elif self.output_format == Formats.PrediXcan:
            fileBuilder.buildPrediXcan()
        else:
            raise Exceptions.InvalidOutputFormat(self.output_format)
def resultsFromCovarianceFile(self, weight_db_logic):
    """Build per-gene results from the covariance file and the beta files.

    Args:
        weight_db_logic: object exposing `weights_by_gene`.

    Returns:
        A `(results, normalization_constant)` tuple: `results` maps each gene
        to the entry produced by `self.buildEntry`, and the constant comes
        from `normalization.calculateNormalization()`.
    """
    logging.info("Loading covariance file from %s", self.covariance)
    covariance_contents = MatrixUtilities.loadMatrixFromFile(self.covariance)
    beta_contents = Utilities.contentsWithPatternsFromFolder(self.folder_beta, [])
    zscore_calculation, normalization = self.selectMethod(
        self.folder_beta, beta_contents, covariance_contents, weight_db_logic)

    reporter = Utilities.PercentReporter(logging.INFO, len(covariance_contents))
    results = {}
    genes_done = 0

    for beta_name in beta_contents:
        logging.info("Processing %s", beta_name)
        beta_path = os.path.join(self.folder_beta, beta_name)
        loaded_sets = KeyedDataSet.KeyedDataSetFileUtilities.loadDataSetsFromCompressedFile(
            beta_path, header="")
        beta_sets = {data_set.name: data_set for data_set in loaded_sets}
        # Any one data set serves to test which rsids this file has.
        unused_name, check = beta_sets.iteritems().next()
        normalization.update(beta_sets)

        for gene, entry in covariance_contents.iteritems():
            # So, new covariance files might actually have more genes than
            # those in the database
            if gene not in weight_db_logic.weights_by_gene:
                logging.log(8, "Gene %s not in weights", gene)
                continue

            weights = weight_db_logic.weights_by_gene[gene]

            # Stops at the first rsid of the gene that the beta file knows about.
            has_rsid = any(
                rsid in check.values_by_key
                for rsid, unused_weight in weights.iteritems())
            if not has_rsid:
                logging.log(5, "No rsid in beta file for %s", gene)
                continue

            if gene in results:
                logging.info("Gene %s already processed", gene)
                continue

            covariance_matrix = entry[0]
            valid_rsids = entry[1]
            logging.log(7, "Calculating z score for %s", gene)
            pre_zscore, n, VAR_g, effect_size = zscore_calculation(
                gene, weights, beta_sets, covariance_matrix, valid_rsids)
            results[gene] = self.buildEntry(
                gene, weight_db_logic, weights, pre_zscore, n, VAR_g, effect_size)

            # Progress is proxied by the number of genes processed so far.
            genes_done += 1
            reporter.update(
                genes_done,
                "%d %% of model's snp information found so far in the gwas study")

    # second pass, for genes not in any beta file
    self.fillBlanks(results, covariance_contents, weight_db_logic, zscore_calculation)
    normalization_constant = normalization.calculateNormalization()
    return results, normalization_constant
# Path of the weight database loaded by this script.
WEIGHT_DB_PATH = "data/DGN-WB_0.5.db"


def loadDosageFile(path):
    """Load `path` through `GWASDosageFileLoader` and return what `load()` yields.

    A `GWASSNPInfoLineCollector` is used as the loader's callback. The caller
    below indexes the result with `[0]`.
    """
    callback = GWASUtilities.GWASSNPInfoLineCollector()
    dosage_loader = GWASUtilities.GWASDosageFileLoader(path, True, callback)
    keyed_data_set = dosage_loader.load()
    return keyed_data_set


logging.info("Loading weight db")
weight_db_logic = WeightDBUtilities.WeightDBEntryLogic(WEIGHT_DB_PATH)

logging.info("Loading covariance file")
# NOTE(review): COV and BETA are not defined in the visible part of this
# script -- presumably module-level constants set earlier; confirm.
covariance_contents = MatrixUtilities.loadMatrixFromFile(COV)

logging.info("Loading betas")
beta_contents = Utilities.contentsWithPatternsFromFolder(BETA, [".gz"])

# NOTE(review): `results` is never appended to in the visible part of this script.
results = []
for beta_name in beta_contents:
    logging.info("Processing %s", beta_name)
    beta_path = os.path.join(BETA, beta_name)
    # Only the first element of the loaded result is used.
    beta_data = loadDosageFile(beta_path)[0]
    for snp, value in beta_data.values_by_key.iteritems():
        # Skip snps the weight db knows nothing about (logged at numeric level 7).
        if not snp in weight_db_logic.genes_for_an_rsid:
            logging.log(7, "rsid %s not found in DB", snp)
            continue
        genes = weight_db_logic.genes_for_an_rsid[snp]
        if not genes:
            logging.info("no gene for %s", snp)
            continue
        # Only the first gene listed for the snp is considered.
        gene = genes[0]