def processPrediXcanFiles(self):
    """Run the filtered-file process over every dosage file in the dosage folder.

    Loads the people listed in ``self.samples_input`` and
    ``self.samples_output`` plus the SNP list in ``self.snp_list``, then, for
    each entry returned by ``Utilities.contentsWithPatternsFromFolder`` for
    the ``"dosage.txt.gz"`` pattern, builds a
    ``PrediXcanFormatFilteredFilesProcess`` and calls the builder method that
    corresponds to ``self.output_format``.

    Raises:
        Exceptions.InvalidOutputFormat: if ``self.output_format`` is neither
            ``Formats.IMPUTE`` nor ``Formats.PrediXcan``. The check happens
            per file, so nothing is raised when the folder yields no entries.
    """
    logging.info("Loading people")
    # NOTE(review): the positional '\t' and False are presumably the delimiter
    # and a header/skip flag of loadPeople -- confirm against Person.loadPeople.
    all_people = Person.Person.loadPeople(self.samples_input, '\t', False)
    selected_people = Person.Person.loadPeople(self.samples_output)
    selected_people_by_id = {p.id: p for p in selected_people}
    logging.info("%d total people, %d selected",
                 len(all_people), len(selected_people_by_id))

    logging.info("Loading snps")
    snp_data_set = DataSet.DataSetFileUtilities.loadFromCompressedFile(
        self.snp_list)
    # Only membership matters downstream, so every key maps to True.
    snp_dict = {k: True for k in snp_data_set.data}
    # Parenthesised single-argument form emits the same stdout text under the
    # Python 2 print statement and the Python 3 print function.
    print(len(snp_dict))

    contents = Utilities.contentsWithPatternsFromFolder(
        self.dosage_folder, ["dosage.txt.gz"])
    for content_name in contents:
        input_path = os.path.join(self.dosage_folder, content_name)
        fileBuilder = PrediXcanFormatUtilities.PrediXcanFormatFilteredFilesProcess(
            input_path, self.output_folder, content_name,
            all_people, selected_people_by_id, snp_dict)

        # A single if/elif/else chain: with two independent `if`s the IMPUTE
        # case fell through to the trailing `else` and raised
        # InvalidOutputFormat right after a successful buildIMPUTE().
        if self.output_format == Formats.IMPUTE:
            fileBuilder.buildIMPUTE()
        elif self.output_format == Formats.PrediXcan:
            fileBuilder.buildPrediXcan()
        else:
            raise Exceptions.InvalidOutputFormat(self.output_format)
def processIMPUTEFiles(self):
    """Run the IMPUTE filtered-dosage builder over every name in the dosage folder.

    Names come from ``Utilities.hapNamesFromFolder(self.dosage_folder)``. For
    each one an ``IMPUTEFilteredDosageFileBuilder`` is configured with the
    loaded people, the SNP lookup and an output pattern under
    ``self.output_folder``, and the builder method matching
    ``self.output_format`` is called. For the PrediXcan format the chromosome
    name is taken from group 1 of ``self.chromosome_in_name_regex`` applied to
    the name.

    Raises:
        Exceptions.InvalidOutputFormat: if ``self.output_format`` is neither
            ``Formats.IMPUTE`` nor ``Formats.PrediXcan``.
    """
    logging.info("Loading people")
    names = Utilities.hapNamesFromFolder(self.dosage_folder)
    all_people = Person.Person.loadPeople(self.samples_input)
    selected_people_by_id = {}
    for person in Person.Person.loadPeople(self.samples_output, delim=" "):
        selected_people_by_id[person.id] = person

    logging.info("Loading snps")
    snp_data_set = DataSet.DataSetFileUtilities.loadFromCompressedFile(
        self.snp_list)
    # Membership-only lookup: every rsid maps to True.
    snp_dict = dict.fromkeys(snp_data_set.data, True)

    for name in names:
        output_pattern = os.path.join(self.output_folder, name)

        builder = ThousandGenomesUtilities.IMPUTEFilteredDosageFileBuilder()
        builder.base_path = self.dosage_folder
        builder.name = name
        builder.output_pattern = output_pattern
        builder.snp_dict = snp_dict
        builder.all_people = all_people
        builder.selected_people_by_id = selected_people_by_id

        if self.output_format == Formats.IMPUTE:
            builder.buildIMPUTE()
            continue

        if self.output_format == Formats.PrediXcan:
            match = self.chromosome_in_name_regex.search(name)
            message = "No files found in '%s' that match the pattern, '%s'" % (
                self.dosage_folder, self.chromosome_in_name_regex.pattern)
            # NOTE(review): exitIf presumably aborts with the given exception
            # type when the condition is true -- confirm against its definition.
            exitIf(match is None, Exceptions.InvalidInputFormat, message)
            builder.chromosome_name = match.group(1)
            builder.buildPrediXcan()
            continue

        raise Exceptions.InvalidOutputFormat(self.output_format)