예제 #1
0
def create_parser_from_header(
        line: str,
        config: assocparser.AssocParserConfig,
        group="unknown",
        dataset="unknown",
        bio_entities=None) -> Optional[assocparser.AssocParser]:
    """Build the association parser announced by a version header line.

    `line` is searched with `parser_version_regex`. When it yields exactly
    one match, the file type found in that match selects the parser class:
    "gpad" / "gpa" select `gpadparser.GpadParser`, "gaf" selects
    `gafparser.GafParser`. The parser is constructed with the given
    `config`, `bio_entities`, `group` and `dataset`.

    The parser's `version` attribute is assigned only when the version in
    the header is one this function recognises for that file type
    ("1.2" / "2.0" for GPAD, "2.1" / "2.2" for GAF); otherwise the parser
    is returned exactly as its constructor left it.

    Returns `None` when the line does not contain exactly one version
    match, or when the file type is neither GPAD nor GAF.
    """
    matches = parser_version_regex.findall(line)
    if len(matches) != 1:
        return None

    filetype, version, _ = matches[0]

    if filetype in ("gpad", "gpa"):
        parser_class = gpadparser.GpadParser
        recognised_versions = ("1.2", "2.0")
    elif filetype == "gaf":
        parser_class = gafparser.GafParser
        recognised_versions = ("2.1", "2.2")
    else:
        # Some other file type was declared: no parser for it here.
        return None

    parser = parser_class(config=config,
                          bio_entities=bio_entities,
                          group=group,
                          dataset=dataset)
    if version in recognised_versions:
        parser.version = version
    return parser
예제 #2
0
    def parse_line(self, line):
        """
        Parses a single line of a GAF

        Returns an `assocparser.ParseResult` (not a bare tuple):

        * if the parent class's `validate_line` gives a truthy result, that
          result is returned unchanged;
        * a header line gives a result wrapping a single
          `{"header": True, "line": <stripped line>}` dict;
        * if `to_association` produces no associations, its result is
          returned unchanged;
        * a line rejected by a GO rule, or by ID / ontology class /
          reference / with-from / taxon validation, gives a result with an
          empty association list and `True` as the third `ParseResult`
          argument (the line is skipped);
        * otherwise the result wraps the one validated association, and
          `vals[6]` is passed as the fourth `ParseResult` argument.

        Side effects: header lines may set `self.version` (and trigger
        `make_internal_cell_component_closure`), and problems found along
        the way are recorded on `self.report`.

        Note: most applications will only need to call this directly if they require fine-grained control of parsing. For most purposes,
        :method:`parse_file` can be used over the whole file

        Arguments
        ---------
        line : str
            A single tab-separated line from a GAF file

        """

        # Returns assocparser.ParseResult
        parsed = super().validate_line(line)
        if parsed:
            return parsed

        if self.is_header(line):
            # Save off version info here
            if self.version is None:
                # We are still looking
                parsed = parser_version_regex.findall(line)
                if len(parsed) == 1:
                    _, version, _ = parsed[0]
                    if version == "2.2":
                        logger.info("Detected GAF version 2.2")
                        self.version = version
                    else:
                        logger.info("Detected GAF version {}, so using 2.1".format(version))
                        self.version = self.default_version
                        # Compute the cell component subclass closure
                        self.make_internal_cell_component_closure()

            return assocparser.ParseResult(line, [{ "header": True, "line": line.strip() }], False)

        # At this point, we should have gone through all the header, and a version number should be established
        if self.version is None:
            logger.warning("No version number found for this file so we will assume GAF version: {}".format(self.default_version))
            self.version = self.default_version
            self.make_internal_cell_component_closure()

        vals = [el.strip() for el in line.split("\t")]

        # GAF v1 is defined as 15 cols, GAF v2 as 17.
        # We treat everything as GAF2 by adding two blank columns.
        # NOTE(review): no padding happens in this method; presumably
        # to_association takes care of it -- confirm.
        # TODO: check header metadata to see if columns corresponds to declared dataformat version

        parsed = to_association(list(vals), report=self.report, qualifier_parser=self.qualifier_parser(), bio_entities=self.bio_entities)
        if parsed.associations == []:
            return parsed

        assoc = parsed.associations[0]

        # Qualifier is index 3
        # If we are 2.1, and qualifier has no relation
        # Also must have an ontology
        # Then upgrade
        # For https://github.com/geneontology/go-site/issues/1558
        if self.gaf_version() == "2.1" and (vals[3] == "" or vals[3] == "NOT") and self.config.ontology:
            assoc = self.upgrade_empty_qualifier(assoc)

        ## Run GO Rules, save split values into individual variables
        go_rule_results = qc.test_go_rules(assoc, self.config, group=self.group)
        for rule, result in go_rule_results.all_results.items():
            if result.result_type == qc.ResultType.WARNING:
                self.report.warning(line, assocparser.Report.VIOLATES_GO_RULE, "",
                                    msg="{id}: {message}".format(id=rule.id, message=result.message), rule=int(rule.id.split(":")[1]))

            if result.result_type == qc.ResultType.ERROR:
                self.report.error(line, assocparser.Report.VIOLATES_GO_RULE, "",
                                    msg="{id}: {message}".format(id=rule.id, message=result.message), rule=int(rule.id.split(":")[1]))
                # Skip the annotation
                return assocparser.ParseResult(line, [], True)

            if result.result_type == qc.ResultType.PASS:
                self.report.message(assocparser.Report.INFO, line, Report.RULE_PASS, "",
                                    msg="Passing Rule", rule=int(rule.id.split(":")[1]))

        # The rules may hand back a modified annotation; continue with that one.
        assoc = go_rule_results.annotation  # type: association.GoAssociation
        split_line = assocparser.SplitLine(line=line, values=vals, taxon=str(assoc.object.taxon))

        if self.config.group_idspace is not None and assoc.provided_by not in self.config.group_idspace:
            self.report.warning(line, Report.INVALID_ID, assoc.provided_by,
                "GORULE:0000027: assigned_by is not present in groups reference", taxon=str(assoc.object.taxon), rule=27)

        db = assoc.subject.id.namespace
        if self.config.entity_idspaces is not None and db not in self.config.entity_idspaces:
            # Are we a synonym?
            upgrade = self.config.entity_idspaces.reverse(db)
            if upgrade is not None:
                # If we found a synonym
                self.report.warning(line, Report.INVALID_ID_DBXREF, db, "GORULE:0000027: {} is a synonym for the correct ID {}, and has been updated".format(db, upgrade), taxon=str(assoc.object.taxon), rule=27)
                assoc.subject.id.namespace = upgrade

        ## --
        ## db + db_object_id. CARD=1
        ## --assigned_by
        if not self._validate_id(str(assoc.subject.id), split_line, allowed_ids=self.config.entity_idspaces):
            return assocparser.ParseResult(line, [], True)

        # Using a given gpi file to validate the gene object
        # if self.gpi is not None:
        #     entity = self.gpi.get(str(assoc.subject.id), None)
        #     if entity is not None:
        #         assoc.subject.label = entity["symbol"]
        #         assoc.subject.fullname = entity["name"]
        #         assoc.subject.synonyms = entity["synonyms"].split("|")
        #         assoc.subject.type = entity["type"]

        if not self._validate_id(str(assoc.object.id), split_line, context=ANNOTATION):
            # Use the module logger rather than print(): a parser must not
            # write diagnostics to stdout, where callers may stream results.
            logger.warning("skipping because {} not validated!".format(assoc.object.id))
            return assocparser.ParseResult(line, [], True)

        valid_goid = self._validate_ontology_class_id(str(assoc.object.id), split_line)
        if valid_goid is None:
            return assocparser.ParseResult(line, [], True)
        # The validator returns the ID to use from here on, which may
        # differ from the one on the line.
        assoc.object.id = association.Curie.from_str(valid_goid)

        references = self.validate_curie_ids(assoc.evidence.has_supporting_reference, split_line)
        if references is None:
            # Reporting occurs in above function call
            return assocparser.ParseResult(line, [], True)

        # With/From
        for wf in assoc.evidence.with_support_from:
            validated = self.validate_curie_ids(wf.elements, split_line)
            if validated is None:
                return assocparser.ParseResult(line, [], True)
        with_support_from = self._unroll_withfrom_and_replair_obsoletes(split_line, 'gaf')
        if with_support_from is None:
            return assocparser.ParseResult(line, [], True)
        assoc.evidence.with_support_from = with_support_from
        # validation (return value is not used; the line is kept either way)
        self._validate_symbol(assoc.subject.label, split_line)

        ## --
        ## taxon CARD={1,2}
        ## --
        ## if a second value is specified, this is the interacting taxon
        ## We do not use the second value
        valid_taxon = self._validate_taxon(str(assoc.object.taxon), split_line)
        valid_interacting = self._validate_taxon(str(assoc.interacting_taxon), split_line) if assoc.interacting_taxon else True
        # Report both taxa before rejecting, so a line with two bad taxa
        # produces two error entries.
        if not valid_taxon:
            self.report.error(line, assocparser.Report.INVALID_TAXON, str(assoc.object.taxon), "Taxon ID is invalid", rule=27)
        if not valid_interacting:
            self.report.error(line, assocparser.Report.INVALID_TAXON, str(assoc.interacting_taxon), "Taxon ID is invalid", rule=27)
        if (not valid_taxon) or (not valid_interacting):
            return assocparser.ParseResult(line, [], True)

        return assocparser.ParseResult(line, [assoc], False, vals[6])
예제 #3
0
    def parse_line(self, line):
        """Parse one line of a GPAD file into an `assocparser.ParseResult`.

        Outcomes, in the order they are checked:

        * a truthy result from the parent class's `validate_line` is
          returned as-is;
        * a header line gives a result wrapping a single
          `{"header": True, "line": <stripped line>}` dict; while no
          version is known yet, the header is also inspected to set
          `self.version`;
        * if `to_association` produces no associations, its result is
          returned as-is;
        * a line rejected by a GO rule or by any ID / ontology class /
          evidence / taxon / reference / with-from validation gives a
          result with no associations and `True` as third argument;
        * otherwise the result wraps the single validated association.

        Rule outcomes and validation problems are recorded on
        `self.report`.

        Note: most applications only need to call this directly when they
        require fine-grained control of parsing. For most purposes,
        :method:`parse_file` can be used over the whole file.

        Arguments
        ---------
        line : str
            A single tab-separated line from a GPAD file

        """

        def rejected():
            # Result used for every "skip this annotation" exit below.
            return assocparser.ParseResult(line, [], True)

        precheck = super().validate_line(line)
        if precheck:
            return precheck

        if self.is_header(line):
            if self.version is None:
                # Still looking for the version declaration.
                version_matches = parser_version_regex.findall(line)
                if len(version_matches) == 1:
                    _filetype, declared_version, _ = version_matches[0]
                    if declared_version == "2.0":
                        logger.info("Detected GPAD version 2.0")
                        self.version = declared_version
                    else:
                        logger.info("Detected GPAD version {}, so defaulting to 1.2".format(declared_version))
                        self.version = self.default_version

            header_record = {"header": True, "line": line.strip()}
            return assocparser.ParseResult(line, [header_record], False)

        # Past the header block: a version must be settled before parsing data.
        if self.version is None:
            logger.warning("No version number found for this file so we will assume GPAD version: {}".format(self.default_version))
            self.version = self.default_version

        columns = [cell.strip() for cell in line.split("\t")]

        conversion = to_association(list(columns),
                                    report=self.report,
                                    version=self.gpad_version(),
                                    bio_entities=self.bio_entities)
        if conversion.associations == []:
            return conversion

        rule_outcomes = qc.test_go_rules(conversion.associations[0], self.config)
        for rule, outcome in rule_outcomes.all_results.items():
            outcome_type = outcome.result_type
            if outcome_type == qc.ResultType.WARNING:
                text = "{id}: {message}".format(id=rule.id, message=outcome.message)
                self.report.warning(line, assocparser.Report.VIOLATES_GO_RULE, "",
                                    msg=text, rule=int(rule.id.split(":")[1]))
            elif outcome_type == qc.ResultType.ERROR:
                text = "{id}: {message}".format(id=rule.id, message=outcome.message)
                self.report.error(line, assocparser.Report.VIOLATES_GO_RULE, "",
                                  msg=text, rule=int(rule.id.split(":")[1]))
                # A rule error means the annotation is dropped.
                return rejected()
            elif outcome_type == qc.ResultType.PASS:
                self.report.message(assocparser.Report.INFO, line, Report.RULE_PASS, "",
                                    msg="Passing Rule", rule=int(rule.id.split(":")[1]))

        # Continue with the annotation handed back by the rule run.
        assoc = rule_outcomes.annotation  # type: association.GoAssociation

        split_line = assocparser.SplitLine(line=line, values=columns, taxon="")

        subject_ok = self._validate_id(str(assoc.subject.id), split_line, context=ENTITY)
        if not subject_ok:
            return rejected()

        object_ok = self._validate_id(str(assoc.object.id), split_line, context=ANNOTATION)
        if not object_ok:
            return rejected()

        checked_class_id = self._validate_ontology_class_id(str(assoc.object.id), split_line)
        if checked_class_id is None:
            return rejected()
        assoc.object.id = association.Curie.from_str(checked_class_id)

        evidence_ok = self._validate_id(str(assoc.evidence.type), split_line)
        if not evidence_ok:
            return rejected()

        if assoc.interacting_taxon and not self._validate_taxon(str(assoc.interacting_taxon), split_line):
            self.report.error(line,
                              assocparser.Report.INVALID_TAXON,
                              str(assoc.interacting_taxon),
                              "Taxon ID is invalid",
                              rule=27)
            return rejected()

        #TODO: ecomap is currently one-way only
        #ecomap = self.config.ecomap
        #if ecomap != None:
        #    if ecomap.ecoclass_to_coderef(evidence) == (None,None):
        #        self.report.error(line, Report.UNKNOWN_EVIDENCE_CLASS, evidence,
        #                          msg="Expecting a known ECO class ID")

        # Reference column
        if self.validate_curie_ids(assoc.evidence.has_supporting_reference, split_line) is None:
            return rejected()

        # With/From: stop at the first group whose IDs fail validation.
        if any(self.validate_curie_ids(group.elements, split_line) is None
               for group in assoc.evidence.with_support_from):
            return rejected()

        return assocparser.ParseResult(line, [assoc], False)
예제 #4
0
    def parse_line(self, line):
        """Parses a single line of a GPI.

        Return a tuple `(processed_line, entities)`:

        * a header line gives `(line, [{"header": True, "line": ...}])`;
          while no version is known yet, the header is also inspected to
          set `self.version`;
        * a line with the wrong number of columns, or whose object ID
          fails validation, gives `(line, [])`;
        * otherwise `entities` holds exactly one dict describing the
          entity (keys: id, label, full_name, synonyms, type, parents,
          encoded_by, contained_complex_members, xrefs, taxon,
          properties).

        GPI 1.2 lines are padded to 10 columns and converted with
        `line_as_2_0`; GPI 2.0 lines are padded to 11 columns. Lines with
        fewer than 7 columns, or that do not end up with exactly 11
        columns, are reported as `WRONG_NUMBER_OF_COLUMNS` on
        `self.report` and skipped.

        Note: most applications will only need to call this directly if they require fine-grained control of parsing. For most purposes,
        :method:`parse_file` can be used over the whole file

        Arguments
        ---------
        line : str
            A single tab-separated line from a GPI file

        """

        if self.is_header(line):
            if self.version is None:
                parsed = parser_version_regex.findall(line)
                if len(parsed) == 1:
                    _, version, _ = parsed[0]
                    if version == "2.0":
                        logger.info("Detected GPI version 2.0")
                        self.version = version
                    else:
                        logger.info("Detected version {}, so using 1.2".format(
                            version))
                        self.version = self.default_version

            return (line, [{"header": True, "line": line.strip()}])

        if self.version is None:
            logger.warning(
                "No version number found for this file so we will assum GPI version: {}"
                .format(self.default_version))
            self.version = self.default_version

        vals = line.split("\t")

        if len(vals) < 7:
            self.report.error(line, assocparser.Report.WRONG_NUMBER_OF_COLUMNS,
                              "")
            return line, []

        # If we are 1.2, then we can upconvert into a 2.0 "line", and validate from there
        if self.gpi_version() == "1.2":
            # Trailing optional columns may be missing: pad up to 10.
            if len(vals) < 10:
                vals += [""] * (10 - len(vals))
            # Convert a 1.2 set of values to a 2.0 set of values
            vals = self.line_as_2_0(vals)
        elif len(vals) < 11:
            # We are gpi 2.0: pad missing trailing columns up to 11.
            vals += [""] * (11 - len(vals))

        vals = [el.strip() for el in vals]

        # The unpacking below needs exactly 11 values. A line with extra
        # columns used to crash here with ValueError; report and skip it
        # the same way a too-short line is handled.
        if len(vals) != 11:
            self.report.error(line, assocparser.Report.WRONG_NUMBER_OF_COLUMNS,
                              "")
            return line, []

        # End Housekeeping
        #=================================================================

        [
            object_id, db_object_symbol, db_object_name, synonyms,
            entity_types, taxon, encoded_by, parents,
            contained_complex_members, xrefs, properties
        ] = vals

        split_line = assocparser.SplitLine(line=line, values=vals, taxon=taxon)

        ## --
        ## db + db_object_id. CARD=1
        ## --
        if not self._validate_id(object_id, split_line):
            return line, []

        fullnames = self.list_field(db_object_name)

        ## --
        ## db_object_synonym CARD=0..*
        ## --
        synonyms = self.list_field(synonyms)

        types = self.list_field(entity_types)

        # For the three ID lists below the validation result is ignored:
        # a bad ID does not drop the entity.
        encoded_by = self.list_field(encoded_by)
        for encoded in encoded_by:
            self._validate_id(encoded, split_line)

        parents = [self._normalize_id(x) for x in self.list_field(parents)]
        for p in parents:
            self._validate_id(p, split_line)

        contained_complex_members = self.list_field(contained_complex_members)
        for member in contained_complex_members:
            self._validate_id(member, split_line)

        xref_ids = self.list_field(xrefs)

        obj = {
            'id': object_id,
            'label': db_object_symbol,
            'full_name': fullnames,
            'synonyms': synonyms,
            'type': types,
            'parents': parents,
            'encoded_by': encoded_by,
            'contained_complex_members': contained_complex_members,
            'xrefs': xref_ids,
            'taxon': {
                'id': self._taxon_id(taxon, split_line)
            },
            'properties': properties
        }
        return line, [obj]