class ResultFileReader(object):
    """Parser for the GFam result file.

    The result file is a text file listing, for each protein, a line with
    its id followed by one line per GO term::

        protein_id
        p-value: GO:XXXXXXX (function)
        p-value: GO:XXXXXXX (function)
        ...
        protein_id2
        p-value: ...
        ...

    Entries are looked up by protein id with ``reader[protein_id]``.
    """

    def __init__(self, file_name, significance=None):
        """Prepare a reader for the result file called `file_name`.

        If `significance` is given, only the entries whose p-value is
        strictly less than `significance` are reported. Otherwise a
        threshold of 1000.0 is used, which every valid p-value falls below.
        """
        # Raw strings keep ``\w`` and ``\d`` as regex classes instead of
        # (invalid) string escape sequences.
        # NOTE(review): the pattern presumably tells the indexed file which
        # lines start a new record -- confirm against IndexedReadOnlyFile.
        self.file = IndexedReadOnlyFile(file_name, r"^\w+")
        self.keys = self.file.get_keys()
        # Lines carrying a p-value are recognised by their leading digit.
        self.pval_regex = re.compile(r"^\d")
        # Maps GO term ids to the names seen so far while reading entries.
        self.go_names = dict()
        # Set once every key has been read, i.e. `go_names` is complete.
        self._go_names_complete = False
        self.alpha = significance if significance is not None else 1000.0

    def get_keys(self):
        """Return the protein ids found in the file."""
        return self.keys

    def get_result_as_dict(self):
        """Retrieve the whole dataset as a dictionary.

        The dictionary maps every protein id to the list returned by
        ``self[protein_id]``. Not recommended if the file is too large.
        """
        return dict((key, self.__getitem__(key)) for key in self.keys)

    def __getitem__(self, key):
        """Return the GO terms and p-values stored for `key`.

        `key` should be a real key of the file. The result is a list of
        ``(goterm, pvalue)`` tuples, in file order, restricted to the
        p-values strictly below the significance threshold. As a side
        effect, the names of the reported GO terms are cached in
        `go_names`.
        """
        list_go_terms = []
        for line in self.file[key]:
            if not self.pval_regex.match(line):
                # Not a "p-value: GO:XXXXXXX (function)" line.
                continue
            pvalue, goterm = line.split(' ', 2)[0:2]
            # Drop the trailing colon of the "p-value:" field.
            pvalue = float(pvalue[0:-1])
            if pvalue < self.alpha:
                list_go_terms.append((goterm, pvalue))
                # NOTE(review): the original layout of this block was lost;
                # names are assumed to be recorded only for the terms that
                # pass the significance filter -- confirm.
                if goterm not in self.go_names:
                    # The name is whatever follows the first "(", minus
                    # the last character of the line (the closing ")").
                    self.go_names[goterm] = line.split('(', 1)[1][0:-1]
        return list_go_terms

    def get_go_names(self):
        """Return the dictionary mapping GO term ids to their names.

        Every key of the file is read once so that the mapping is complete
        even if only some of the entries had been looked up before.
        """
        if not getattr(self, "_go_names_complete", False):
            for key in self.keys:
                self.__getitem__(key)
            self._go_names_complete = True
        return self.go_names
class ResultFileReader(object):
    """Class implementing a parser of the GFam result file.

    The result file is a text file with the following format, one item
    per line::

        protein_id
        p-value: GO:XXXXXXX (function)
        p-value: GO:XXXXXXX (function)
        ...
        protein_id2
        p-value: ...
        ...

    Indexing the reader with a protein id yields the GO terms of that
    protein.
    """

    def __init__(self, file_name, significance=None):
        """Prepare a ResultFileReader to read results from `file_name`.

        If a `significance` is specified, only the entries with a p-value
        strictly less than `significance` are considered. When it is
        omitted, the threshold defaults to 1000.0, so no valid p-value is
        filtered out.
        """
        # NOTE(review): the regex is presumably what marks the key lines of
        # the file -- confirm against IndexedReadOnlyFile.
        self.file = IndexedReadOnlyFile(file_name, r"^\w+")
        self.keys = self.file.get_keys()
        # A line holding a p-value starts with a digit.
        self.pval_regex = re.compile(r"^\d")
        # GO term id -> name, filled in as entries are read.
        self.go_names = dict()
        # True once all the keys were read and `go_names` is complete.
        self._go_names_complete = False
        if significance is None:
            self.alpha = 1000.0
        else:
            self.alpha = significance

    def get_keys(self):
        """Return the set of protein ids of the file."""
        return self.keys

    def get_result_as_dict(self):
        """Retrieve the whole dataset as a dictionary.

        Each protein id is mapped to the list of ``(goterm, pvalue)``
        tuples obtained by indexing the reader with it. Not recommended if
        the file is too large.
        """
        results = dict()
        for key in self.keys:
            results[key] = self.__getitem__(key)
        return results

    def __getitem__(self, key):
        """Get the GO terms and p-values for a certain `key`.

        `key` should be a real key in the file. Returns a list of
        ``(goterm, pvalue)`` tuples in the order of the file, keeping only
        the p-values strictly below the significance threshold. The names
        of the returned GO terms are remembered in `go_names`.
        """
        list_go_terms = []
        for line in self.file[key]:
            if self.pval_regex.match(line) is None:
                # Skip everything that is not a p-value line.
                continue
            fields = line.split(' ', 2)
            pvalue, goterm = fields[0:2]
            # The first field is "p-value:"; strip its trailing colon.
            pvalue = float(pvalue[0:-1])
            if pvalue >= self.alpha:
                continue
            list_go_terms.append((goterm, pvalue))
            # NOTE(review): the original indentation was lost; this assumes
            # names are stored only for terms passing the filter -- confirm.
            if goterm not in self.go_names:
                # Text after the first "(" without the line's last
                # character, i.e. the closing ")".
                name = line.split('(', 1)[1][0:-1]
                self.go_names[goterm] = name
        return list_go_terms

    def get_go_names(self):
        """Return the dictionary of GO term names, keyed by GO term id.

        All the keys of the file are read once to make sure the mapping is
        complete, even when some entries were already looked up before.
        """
        if not getattr(self, "_go_names_complete", False):
            for key in self.keys:
                self.__getitem__(key)
            self._go_names_complete = True
        return self.go_names