def create_parser_from_header(
        line: str, config: assocparser.AssocParserConfig,
        group="unknown", dataset="unknown", bio_entities=None) -> Optional[assocparser.AssocParser]:
    """
    Build the parser that matches the format declaration found in `line`.

    `line` is searched with `parser_version_regex`. Exactly one match is
    required; any other number of matches yields no parser. A `gpad`/`gpa`
    declaration produces a `gpadparser.GpadParser`, a `gaf` declaration
    produces a `gafparser.GafParser`, and any other file type yields `None`.

    The declared version is copied onto the new parser only when it is one
    of the versions checked here ("1.2"/"2.0" for GPAD, "2.1"/"2.2" for
    GAF). Otherwise the parser's `version` attribute is left exactly as its
    constructor set it.

    Arguments
    ---------
    line : str
        Header line expected to carry the format/version declaration
    config : assocparser.AssocParserConfig
        Configuration handed to the parser constructor
    group, dataset, bio_entities
        Passed through unchanged to the parser constructor

    Returns the constructed parser, or `None` when no parser was chosen.
    """
    matches = parser_version_regex.findall(line)
    if len(matches) != 1:
        return None

    # Each match is unpacked as a 3-tuple; the last element is not used here.
    file_format, declared_version, _ = matches[0]

    if file_format in ["gpad", "gpa"]:
        parser_class = gpadparser.GpadParser
        recognised_versions = ["1.2", "2.0"]
    elif file_format == "gaf":
        parser_class = gafparser.GafParser
        recognised_versions = ["2.1", "2.2"]
    else:
        return None

    built = parser_class(config=config, bio_entities=bio_entities, group=group, dataset=dataset)
    if declared_version in recognised_versions:
        built.version = declared_version
    return built
def parse_line(self, line):
    """
    Parse a single line of a GAF file.

    Returns an `assocparser.ParseResult`. For a header line the result
    carries a single `{"header": True, "line": ...}` dict. For a data line
    that passes every check it carries the one association built from the
    line. For a data line that fails a check it carries an empty list and
    its third positional argument is `True` (presumably the "skipped" flag;
    confirm against `assocparser.ParseResult`).

    Processing order for a data line:

    1. base-class `validate_line`; a truthy result is returned as is
    2. header / version detection (sets `self.version`)
    3. tab split and conversion through `to_association`
    4. optional qualifier upgrade for GAF 2.1 lines
    5. GO rule checks via `qc.test_go_rules`
    6. ID, reference, with/from, symbol and taxon validation

    Note: most applications will only need to call this directly if they
    require fine-grained control of parsing. For most purposes,
    :meth:`parse_file` can be used over the whole file.

    Arguments
    ---------
    line : str
        A single tab-separated line from a GAF file
    """
    # Returns assocparser.ParseResult
    parsed = super().validate_line(line)
    if parsed:
        return parsed

    if self.is_header(line):
        # Save off version info here
        if self.version is None:
            # We are still looking
            parsed = parser_version_regex.findall(line)
            if len(parsed) == 1:
                filetype, version, _ = parsed[0]
                if version == "2.2":
                    logger.info("Detected GAF version 2.2")
                    self.version = version
                else:
                    logger.info("Detected GAF version {}, so using 2.1".format(version))
                    self.version = self.default_version

                # Compute the cell component subclass closure
                self.make_internal_cell_component_closure()

        return assocparser.ParseResult(line, [{"header": True, "line": line.strip()}], False)

    # At this point, we should have gone through all the header, and a version number should be established
    if self.version is None:
        logger.warning("No version number found for this file so we will assume GAF version: {}".format(self.default_version))
        self.version = self.default_version
        self.make_internal_cell_component_closure()

    vals = [el.strip() for el in line.split("\t")]

    # GAF v1 is defined as 15 cols, GAF v2 as 17.
    # We treat everything as GAF2 by adding two blank columns.
    # NOTE(review): no padding happens in this method; presumably it is
    # done inside to_association -- confirm.
    # TODO: check header metadata to see if columns corresponds to declared dataformat version

    parsed = to_association(list(vals), report=self.report, qualifier_parser=self.qualifier_parser(), bio_entities=self.bio_entities)
    if parsed.associations == []:
        # Nothing usable came out of the conversion; hand its result back untouched.
        return parsed

    assoc = parsed.associations[0]

    # Qualifier is index 3
    # If we are 2.1, and qualifier has no relation
    # Also must have an ontology
    # Then upgrade
    # For https://github.com/geneontology/go-site/issues/1558
    if self.gaf_version() == "2.1" and (vals[3] == "" or vals[3] == "NOT") and self.config.ontology:
        assoc = self.upgrade_empty_qualifier(assoc)

    ## Run GO Rules, save split values into individual variables
    go_rule_results = qc.test_go_rules(assoc, self.config, group=self.group)
    for rule, result in go_rule_results.all_results.items():
        if result.result_type == qc.ResultType.WARNING:
            self.report.warning(line, assocparser.Report.VIOLATES_GO_RULE, "",
                                msg="{id}: {message}".format(id=rule.id, message=result.message), rule=int(rule.id.split(":")[1]))

        if result.result_type == qc.ResultType.ERROR:
            self.report.error(line, assocparser.Report.VIOLATES_GO_RULE, "",
                              msg="{id}: {message}".format(id=rule.id, message=result.message), rule=int(rule.id.split(":")[1]))
            # Skip the annotation
            return assocparser.ParseResult(line, [], True)

        if result.result_type == qc.ResultType.PASS:
            self.report.message(assocparser.Report.INFO, line, Report.RULE_PASS, "",
                                msg="Passing Rule", rule=int(rule.id.split(":")[1]))

    # From here on, work with the annotation object handed back by the rule run.
    assoc = go_rule_results.annotation  # type: association.GoAssociation
    split_line = assocparser.SplitLine(line=line, values=vals, taxon=str(assoc.object.taxon))

    if self.config.group_idspace is not None and assoc.provided_by not in self.config.group_idspace:
        self.report.warning(line, Report.INVALID_ID, assoc.provided_by,
                            "GORULE:0000027: assigned_by is not present in groups reference", taxon=str(assoc.object.taxon), rule=27)

    db = assoc.subject.id.namespace
    if self.config.entity_idspaces is not None and db not in self.config.entity_idspaces:
        # Are we a synonym?
        upgrade = self.config.entity_idspaces.reverse(db)
        if upgrade is not None:
            # If we found a synonym
            self.report.warning(line, Report.INVALID_ID_DBXREF, db, "GORULE:0000027: {} is a synonym for the correct ID {}, and has been updated".format(db, upgrade), taxon=str(assoc.object.taxon), rule=27)
            assoc.subject.id.namespace = upgrade

    ## --
    ## db + db_object_id. CARD=1
    ## --assigned_by
    if not self._validate_id(str(assoc.subject.id), split_line, allowed_ids=self.config.entity_idspaces):
        return assocparser.ParseResult(line, [], True)

    # Using a given gpi file to validate the gene object
    # if self.gpi is not None:
    #     entity = self.gpi.get(str(assoc.subject.id), None)
    #     if entity is not None:
    #         assoc.subject.label = entity["symbol"]
    #         assoc.subject.fullname = entity["name"]
    #         assoc.subject.synonyms = entity["synonyms"].split("|")
    #         assoc.subject.type = entity["type"]

    if not self._validate_id(str(assoc.object.id), split_line, context=ANNOTATION):
        # Diagnostics go through the module logger rather than stdout, so that
        # parsing never pollutes the output stream of a calling program.
        logger.warning("skipping because {} not validated!".format(assoc.object.id))
        return assocparser.ParseResult(line, [], True)

    valid_goid = self._validate_ontology_class_id(str(assoc.object.id), split_line)
    if valid_goid is None:
        return assocparser.ParseResult(line, [], True)
    # The validator returns the ID string to use; store it back as a Curie.
    assoc.object.id = association.Curie.from_str(valid_goid)

    references = self.validate_curie_ids(assoc.evidence.has_supporting_reference, split_line)
    if references is None:
        # Reporting occurs in above function call
        return assocparser.ParseResult(line, [], True)

    # With/From
    for wf in assoc.evidence.with_support_from:
        validated = self.validate_curie_ids(wf.elements, split_line)
        if validated is None:
            return assocparser.ParseResult(line, [], True)
    with_support_from = self._unroll_withfrom_and_replair_obsoletes(split_line, 'gaf')
    if with_support_from is None:
        return assocparser.ParseResult(line, [], True)
    assoc.evidence.with_support_from = with_support_from

    # validation (the return value is not used here)
    self._validate_symbol(assoc.subject.label, split_line)

    ## --
    ## taxon CARD={1,2}
    ## --
    ## if a second value is specified, this is the interacting taxon
    ## We do not use the second value
    valid_taxon = self._validate_taxon(str(assoc.object.taxon), split_line)
    valid_interacting = self._validate_taxon(str(assoc.interacting_taxon), split_line) if assoc.interacting_taxon else True
    # Both taxa are checked before returning so that each bad one gets its own report entry.
    if not valid_taxon:
        self.report.error(line, assocparser.Report.INVALID_TAXON, str(assoc.object.taxon), "Taxon ID is invalid", rule=27)
    if not valid_interacting:
        self.report.error(line, assocparser.Report.INVALID_TAXON, str(assoc.interacting_taxon), "Taxon ID is invalid", rule=27)
    if (not valid_taxon) or (not valid_interacting):
        return assocparser.ParseResult(line, [], True)

    # vals[6] is passed as the fourth ParseResult argument
    # (presumably the evidence column -- confirm against ParseResult).
    return assocparser.ParseResult(line, [assoc], False, vals[6])
def parse_line(self, line):
    """
    Parse one line of a GPAD file into an `assocparser.ParseResult`.

    A header line produces a result holding a single
    `{"header": True, "line": ...}` dict. A data line that passes every
    check produces a result holding the one association built from it. A
    data line that fails a check produces a result with an empty list and
    `True` as its third positional argument.

    Note: most applications will only need to call this directly if they
    require fine-grained control of parsing. For most purposes,
    :meth:`parse_file` can be used over the whole file.

    Arguments
    ---------
    line : str
        A single tab-separated line from a GPAD file
    """
    def rejected():
        # Result returned for every data line that fails a check.
        return assocparser.ParseResult(line, [], True)

    early_result = super().validate_line(line)
    if early_result:
        return early_result

    if self.is_header(line):
        if self.version is None:
            # Still looking for the version declaration.
            declarations = parser_version_regex.findall(line)
            if len(declarations) == 1:
                filetype, declared, _ = declarations[0]
                if declared != "2.0":
                    logger.info("Detected GPAD version {}, so defaulting to 1.2".format(declared))
                    self.version = self.default_version
                else:
                    logger.info("Detected GPAD version 2.0")
                    self.version = declared
        header_payload = {"header": True, "line": line.strip()}
        return assocparser.ParseResult(line, [header_payload], False)

    # The header is behind us; if it never declared a version, use the default.
    if self.version is None:
        logger.warning("No version number found for this file so we will assume GPAD version: {}".format(self.default_version))
        self.version = self.default_version

    columns = [cell.strip() for cell in line.split("\t")]

    conversion = to_association(list(columns), report=self.report, version=self.gpad_version(), bio_entities=self.bio_entities)
    if conversion.associations == []:
        return conversion

    assoc = conversion.associations[0]

    # NOTE(review): no `group` is passed to the rule run here -- confirm that is intended.
    rule_outcomes = qc.test_go_rules(assoc, self.config)
    for rule, outcome in rule_outcomes.all_results.items():
        outcome_type = outcome.result_type
        if outcome_type == qc.ResultType.WARNING:
            self.report.warning(
                line, assocparser.Report.VIOLATES_GO_RULE, "",
                msg="{id}: {message}".format(id=rule.id, message=outcome.message),
                rule=int(rule.id.split(":")[1]))
        elif outcome_type == qc.ResultType.ERROR:
            self.report.error(
                line, assocparser.Report.VIOLATES_GO_RULE, "",
                msg="{id}: {message}".format(id=rule.id, message=outcome.message),
                rule=int(rule.id.split(":")[1]))
            # A rule error means the annotation is dropped immediately.
            return rejected()
        elif outcome_type == qc.ResultType.PASS:
            self.report.message(
                assocparser.Report.INFO, line, Report.RULE_PASS, "",
                msg="Passing Rule",
                rule=int(rule.id.split(":")[1]))

    # Continue with the annotation object handed back by the rule run.
    assoc = rule_outcomes.annotation  # type: association.GoAssociation

    split_line = assocparser.SplitLine(line=line, values=columns, taxon="")

    # Subject first, then object; the second check only runs if the first passes.
    ids_ok = (
        self._validate_id(str(assoc.subject.id), split_line, context=ENTITY)
        and self._validate_id(str(assoc.object.id), split_line, context=ANNOTATION)
    )
    if not ids_ok:
        return rejected()

    valid_goid = self._validate_ontology_class_id(str(assoc.object.id), split_line)
    if valid_goid is None:
        return rejected()
    assoc.object.id = association.Curie.from_str(valid_goid)

    if not self._validate_id(str(assoc.evidence.type), split_line):
        return rejected()

    interacting = assoc.interacting_taxon
    if interacting and not self._validate_taxon(str(interacting), split_line):
        self.report.error(line, assocparser.Report.INVALID_TAXON, str(interacting), "Taxon ID is invalid", rule=27)
        return rejected()

    # TODO: ecomap is currently one-way only, so the evidence class is not
    # checked against it here.

    # Reference column
    if self.validate_curie_ids(assoc.evidence.has_supporting_reference, split_line) is None:
        return rejected()

    # With/From: stop at the first group whose IDs do not validate.
    if any(self.validate_curie_ids(support.elements, split_line) is None
           for support in assoc.evidence.with_support_from):
        return rejected()

    return assocparser.ParseResult(line, [assoc], False)
def parse_line(self, line):
    """
    Parse a single line of a GPI file.

    Returns a tuple `(processed_line, entities)`:

    * header line: `entities` holds one `{"header": True, "line": ...}` dict
    * valid data line: `entities` holds one dict describing the entity
    * invalid data line (wrong column count, or an object ID that fails
      validation): `entities` is empty

    GPI 1.2 lines are padded to 10 columns and converted with
    `line_as_2_0`; all other lines are treated as GPI 2.0 and padded to 11
    columns. A line that does not end up with exactly 11 columns is
    reported as `WRONG_NUMBER_OF_COLUMNS` instead of raising.

    Note: most applications will only need to call this directly if they
    require fine-grained control of parsing. For most purposes,
    :meth:`parse_file` can be used over the whole file.

    Arguments
    ---------
    line : str
        A single tab-separated line from a GPI file
    """
    if self.is_header(line):
        if self.version is None:
            parsed = parser_version_regex.findall(line)
            if len(parsed) == 1:
                filetype, version, _ = parsed[0]
                if version == "2.0":
                    logger.info("Detected GPI version 2.0")
                    self.version = version
                else:
                    logger.info("Detected version {}, so using 1.2".format(version))
                    self.version = self.default_version

        return (line, [{"header": True, "line": line.strip()}])

    # Past the header: if no version was declared, fall back to the default.
    if self.version is None:
        logger.warning("No version number found for this file so we will assum GPI version: {}".format(self.default_version))
        self.version = self.default_version

    vals = line.split("\t")

    if len(vals) < 7:
        self.report.error(line, assocparser.Report.WRONG_NUMBER_OF_COLUMNS, "")
        return line, []

    # If we are 1.2, then we can upconvert into a 2.0 "line", and validate from there
    if self.gpi_version() == "1.2":
        if len(vals) < 10:
            vals.extend([""] * (10 - len(vals)))
        # Convert a 1.2 set of values to a 2.0 set of values
        vals = self.line_as_2_0(vals)
    else:
        # We are gpi 2.0
        if len(vals) < 11:
            vals.extend([""] * (11 - len(vals)))

    # The unpacking below needs exactly 11 values. Report a malformed line
    # (e.g. too many columns) instead of letting ValueError abort the parse.
    if len(vals) != 11:
        self.report.error(line, assocparser.Report.WRONG_NUMBER_OF_COLUMNS, "",
                          msg="Expected 11 columns after version normalization, found {}".format(len(vals)))
        return line, []

    vals = [el.strip() for el in vals]

    # End Housekeeping
    #=================================================================

    [
        object_id,
        db_object_symbol,
        db_object_name,
        synonyms,
        entity_types,
        taxon,
        encoded_by,
        parents,
        contained_complex_members,
        xrefs,
        properties
    ] = vals

    split_line = assocparser.SplitLine(line=line, values=vals, taxon=taxon)

    ## --
    ## db + db_object_id. CARD=1
    ## --
    if not self._validate_id(object_id, split_line):
        return line, []

    fullnames = self.list_field(db_object_name)

    ## --
    ## db_object_synonym CARD=0..*
    ## --
    synonyms = self.list_field(synonyms)

    types = self.list_field(entity_types)

    # The related IDs below are validated for their side effects only:
    # a failed check does not drop the entity.
    encoded_by = self.list_field(encoded_by)
    for encoded in encoded_by:
        self._validate_id(encoded, split_line)

    parents = [self._normalize_id(x) for x in self.list_field(parents)]
    for p in parents:
        self._validate_id(p, split_line)

    contained_complex_members = self.list_field(contained_complex_members)
    for members in contained_complex_members:
        self._validate_id(members, split_line)

    xref_ids = self.list_field(xrefs)

    obj = {
        'id': object_id,
        'label': db_object_symbol,
        'full_name': fullnames,
        'synonyms': synonyms,
        'type': types,
        'parents': parents,
        'encoded_by': encoded_by,
        'contained_complex_members': contained_complex_members,
        'xrefs': xref_ids,
        'taxon': {
            'id': self._taxon_id(taxon, split_line)
        },
        # Stored as the raw stripped column text; not split with list_field.
        'properties': properties
    }
    return line, [obj]