예제 #1
0
def to_association(gaf_line: List[str], report=None, group="unknown", dataset="unknown", qualifier_parser=assocparser.Qualifier2_1(), bio_entities=None) -> assocparser.ParseResult:
    """
    Build a ``GoAssociation`` from the already-split columns of one GAF line.

    The columns are validated one after another. The first problem found is
    recorded on ``report`` and the line is returned as skipped (an empty
    association list and a ``True`` third argument to ``ParseResult``).

    Arguments
    ---------
    gaf_line : List[str]
        Column values of a single GAF line. 15 or 16 columns are padded with
        empty strings up to 17; more than 17 columns are truncated to 17 with
        a warning. The list passed in is not modified.
    report : Report, optional
        Report that errors and warnings are written to. A new
        ``Report(group=group, dataset=dataset)`` is created when omitted.
    group : str
        Group name, only used when a new report has to be created.
    dataset : str
        Dataset name, only used when a new report has to be created.
    qualifier_parser
        Validator used on the qualifier column (column 4). Note that the
        default instance is created once, when the function is defined, and is
        shared by every call that does not pass its own parser.
    bio_entities : optional
        Lookup from subject curie to a known entity. When it holds an entity
        for the subject that differs from the one built from the line, the
        stored entity is used as the subject instead.

    Returns
    -------
    assocparser.ParseResult
        Holds exactly one association on success and none when the line was
        skipped; ``report`` is always attached to the result.
    """
    report = Report(group=group, dataset=dataset) if report is None else report
    bio_entities = collections.BioEntities(dict()) if bio_entities is None else bio_entities
    source_line = "\t".join(gaf_line)

    if source_line == "":
        report.error(source_line, "Blank Line", "EMPTY", "Blank lines are not allowed", rule=1)
        return assocparser.ParseResult(source_line, [], True, report=report)

    if len(gaf_line) > 17:
        # If we see more than 17 columns, we will just cut off the columns after column 17
        report.warning(source_line, assocparser.Report.WRONG_NUMBER_OF_COLUMNS, "",
            msg="There were more than 17 columns in this line. Proceeding by cutting off extra columns after column 17.",
            rule=1)
        gaf_line = gaf_line[:17]

    if 17 > len(gaf_line) >= 15:
        # Pad into a new list: an in-place `+=` would silently grow the caller's list.
        gaf_line = gaf_line + [""] * (17 - len(gaf_line))

    if len(gaf_line) != 17:
        report.error(source_line, assocparser.Report.WRONG_NUMBER_OF_COLUMNS, "",
            msg="There were {columns} columns found in this line, and there should be 15 (for GAF v1) or 17 (for GAF v2)".format(columns=len(gaf_line)), rule=1)
        return assocparser.ParseResult(source_line, [], True, report=report)

    ## check for missing columns
    ## We use indices here because the columns have not been split into individual variables yet
    DB_INDEX = 0
    DB_OBJECT_INDEX = 1
    TAXON_INDEX = 12
    REFERENCE_INDEX = 5
    if gaf_line[DB_INDEX] == "":
        report.error(source_line, Report.INVALID_IDSPACE, "EMPTY", "col1 is empty", taxon=gaf_line[TAXON_INDEX], rule=1)
        return assocparser.ParseResult(source_line, [], True, report=report)
    if gaf_line[DB_OBJECT_INDEX] == "":
        report.error(source_line, Report.INVALID_ID, "EMPTY", "col2 is empty", taxon=gaf_line[TAXON_INDEX], rule=1)
        return assocparser.ParseResult(source_line, [], True, report=report)
    if gaf_line[REFERENCE_INDEX] == "":
        report.error(source_line, Report.INVALID_ID, "EMPTY", "reference column 6 is empty", taxon=gaf_line[TAXON_INDEX], rule=1)
        return assocparser.ParseResult(source_line, [], True, report=report)

    parsed_taxons_result = gaf_line_validators["taxon"].validate(gaf_line[TAXON_INDEX])  # type: assocparser.ValidateResult
    if not parsed_taxons_result.valid:
        report.error(source_line, Report.INVALID_TAXON, parsed_taxons_result.original, parsed_taxons_result.message, taxon=parsed_taxons_result.original, rule=1)
        return assocparser.ParseResult(source_line, [], True, report=report)

    # First parsed taxon belongs to the subject; an optional second one is the interacting taxon.
    taxon = parsed_taxons_result.parsed[0]

    date = assocparser.parse_date(gaf_line[13], report, source_line)
    if date is None:
        return assocparser.ParseResult(source_line, [], True, report=report)

    interacting_taxon = parsed_taxons_result.parsed[1] if len(parsed_taxons_result.parsed) == 2 else None
    subject_curie = association.Curie(gaf_line[0], gaf_line[1])
    subject = association.Subject(subject_curie, gaf_line[2], [gaf_line[9]], gaf_line[10].split("|"), [association.map_gp_type_label_to_curie(gaf_line[11])], taxon)
    gpi_entity = bio_entities.get(subject_curie)
    if gpi_entity is not None and subject != gpi_entity:
        # Prefer the externally supplied entity over what the line itself says.
        subject = gpi_entity

    # column 4 is qualifiers -> index 3
    # For allowed, see http://geneontology.org/docs/go-annotations/#annotation-qualifiers
    # The validate call below only checks that the qualifier column is valid; it is not
    # used to *parse* the qualifiers into the GoAssociation object. The qualifiers stored
    # on the GoAssociation come from `_parse_qualifier` further down, which reports
    # negation separately from the qualifier list.
    # This is confusing, and we can fix later on by consolidating qualifier and relation in GoAssociation.
    parsed_qualifiers = qualifier_parser.validate(gaf_line[3])
    if not parsed_qualifiers.valid:
        report.error(source_line, Report.INVALID_QUALIFIER, parsed_qualifiers.original, parsed_qualifiers.message, taxon=gaf_line[TAXON_INDEX], rule=1)
        return assocparser.ParseResult(source_line, [], True, report=report)

    aspect = gaf_line[8]
    negated, relation_label, qualifiers = assocparser._parse_qualifier(gaf_line[3], aspect)
    # Note: Relation label is grabbed from qualifiers, if any exist in _parse_qualifier
    qualifiers = [association.Curie.from_str(curie_util.contract_uri(relations.lookup_label(q), strict=False)[0]) for q in qualifiers]

    # Check the parsed GO id itself. The Term wrapper is never an Error, so testing the
    # Term (as was done before) could not detect a malformed id.
    goid = association.Curie.from_str(gaf_line[4])
    if isinstance(goid, association.Error):
        report.error(source_line, Report.INVALID_SYMBOL, gaf_line[4], "Problem parsing GO Term", taxon=gaf_line[TAXON_INDEX], rule=1)
        return assocparser.ParseResult(source_line, [], True, report=report)
    go_term = association.Term(goid, taxon)

    # References
    references = [association.Curie.from_str(e) for e in gaf_line[5].split("|") if e]
    for r in references:
        if isinstance(r, association.Error):
            report.error(source_line, Report.INVALID_SYMBOL, gaf_line[5], "Problem parsing references", taxon=gaf_line[TAXON_INDEX], rule=1)
            return assocparser.ParseResult(source_line, [], True, report=report)

    # The trailing None guarantees gorefs[0] exists even when no GO_REF reference is present.
    gorefs = [ref for ref in references if ref.namespace == "GO_REF"] + [None]
    eco_curie = ecomap.coderef_to_ecoclass(gaf_line[6], reference=gorefs[0])
    if eco_curie is None:
        report.error(source_line, Report.UNKNOWN_EVIDENCE_CLASS, gaf_line[6], msg="Expecting a known ECO GAF code, e.g ISS", rule=1)
        return assocparser.ParseResult(source_line, [], True, report=report)

    withfroms = association.ConjunctiveSet.str_to_conjunctions(gaf_line[7])
    if isinstance(withfroms, association.Error):
        report.error(source_line, Report.INVALID_SYMBOL, gaf_line[7], "Problem parsing with/from", taxon=gaf_line[TAXON_INDEX], rule=1)
        return assocparser.ParseResult(source_line, [], True, report=report)

    evidence_type = association.Curie.from_str(eco_curie)
    if isinstance(evidence_type, association.Error):
        report.error(source_line, Report.INVALID_SYMBOL, gaf_line[6], "Problem parsing evidence type", taxon=gaf_line[TAXON_INDEX], rule=1)
        # Skip the line like every other parse failure instead of building Evidence from an Error.
        return assocparser.ParseResult(source_line, [], True, report=report)

    evidence = association.Evidence(evidence_type, references, withfroms)
    reference_errors = [e for e in evidence.has_supporting_reference if isinstance(e, association.Error)]
    if reference_errors:
        report.error(source_line, Report.INVALID_SYMBOL, gaf_line[5], reference_errors[0].info, taxon=str(taxon), rule=1)
        return assocparser.ParseResult(source_line, [], True, report=report)

    # Column 17 (gene product form) becomes a single rdfs:subClassOf subject extension.
    subject_extensions = []
    if gaf_line[16]:
        subject_filler = association.Curie.from_str(gaf_line[16])
        if isinstance(subject_filler, association.Error):
            report.error(source_line, assocparser.Report.INVALID_ID, gaf_line[16], subject_filler.info, taxon=str(taxon), rule=1)
            return assocparser.ParseResult(source_line, [], True, report=report)
        # filler is not an Error, so keep moving
        subject_extensions.append(association.ExtensionUnit(association.Curie.from_str("rdfs:subClassOf"), subject_filler))

    # Column 16 (annotation extensions) becomes the object extensions.
    conjunctions = []
    if gaf_line[15]:
        conjunctions = association.ConjunctiveSet.str_to_conjunctions(
            gaf_line[15],
            conjunct_element_builder=lambda el: association.ExtensionUnit.from_str(el))

        if isinstance(conjunctions, association.Error):
            report.error(source_line, Report.EXTENSION_SYNTAX_ERROR, conjunctions.info, "extensions should be relation(curie) and relation should have corresponding URI", taxon=str(taxon), rule=1)
            return assocparser.ParseResult(source_line, [], True, report=report)

    relation_uri = relations.lookup_label(relation_label)
    if relation_uri is None:
        report.error(source_line, assocparser.Report.INVALID_QUALIFIER, relation_label, "Could not find CURIE for relation `{}`".format(relation_label), taxon=str(taxon), rule=1)
        return assocparser.ParseResult(source_line, [], True, report=report)

    # We don't have to check that this is well formed because we're grabbing it from the known relations URI map.
    relation_curie = association.Curie.from_str(curie_util.contract_uri(relation_uri)[0])

    a = association.GoAssociation(
        # Re-joined from the normalised (padded/truncated) columns, so this can differ
        # from `source_line`, which is the line exactly as it was received.
        source_line="\t".join(gaf_line),
        subject=subject,
        relation=relation_curie,
        object=go_term,
        negated=negated,
        qualifiers=qualifiers,
        aspect=aspect,
        interacting_taxon=interacting_taxon,
        evidence=evidence,
        subject_extensions=subject_extensions,
        object_extensions=conjunctions,
        provided_by=gaf_line[14],
        date=date,
        properties={})

    return assocparser.ParseResult(source_line, [a], False, report=report)
예제 #2
0
    def parse_line(self, line):
        """
        Parses a single line of a GAF

        Returns an ``assocparser.ParseResult``. For a data line that passes
        every check it holds exactly one association dict; for a header line
        it holds a single ``{"header": True, "line": ...}`` dict; for a
        rejected line the association list is empty and the third argument is
        ``True``. Problems are recorded on ``self.report``.

        Note: most applications will only need to call this directly if they require fine-grained control of parsing. For most purposes,
        :method:`parse_file` can be used over the whole file

        Arguments
        ---------
        line : str
            A single tab-separated line from a GAF file

        """

        # Returns assocparser.ParseResult
        parsed = super().validate_line(line)
        if parsed:
            return parsed

        if self.is_header(line):
            return assocparser.ParseResult(line, [{
                "header": True,
                "line": line.strip()
            }], False)

        vals = [el.strip() for el in line.split("\t")]

        # GAF v1 is defined as 15 cols, GAF v2 as 17.
        # We treat everything as GAF2 by adding two blank columns.
        # TODO: check header metadata to see if columns corresponds to declared dataformat version
        if 17 > len(vals) >= 15:
            vals += [""] * (17 - len(vals))

        if len(vals) > 17:
            # If we see more than 17 columns, we will just cut off the columns after column 17
            self.report.warning(
                line,
                assocparser.Report.WRONG_NUMBER_OF_COLUMNS,
                "",
                msg=
                "There were more than 17 columns in this line. Proceeding by cutting off extra columns after column 17.",
                rule=1)
            vals = vals[:17]

        if len(vals) != 17:
            self.report.error(
                line,
                assocparser.Report.WRONG_NUMBER_OF_COLUMNS,
                "",
                msg=
                "There were {columns} columns found in this line, and there should be 15 (for GAF v1) or 17 (for GAF v2)"
                .format(columns=len(vals)),
                rule=1)
            return assocparser.ParseResult(line, [], True)

        [
            db, db_object_id, db_object_symbol, qualifier, goid, reference,
            evidence, withfrom, aspect, db_object_name, db_object_synonym,
            db_object_type, taxon, date, assigned_by, annotation_xp,
            gene_product_isoform
        ] = vals

        split_line = assocparser.SplitLine(line=line, values=vals, taxon=taxon)

        ## check for missing columns
        if db == "":
            self.report.error(line,
                              Report.INVALID_IDSPACE,
                              "EMPTY",
                              "col1 is empty",
                              taxon=taxon,
                              rule=1)
            return assocparser.ParseResult(line, [], True)
        if db_object_id == "":
            self.report.error(line,
                              Report.INVALID_ID,
                              "EMPTY",
                              "col2 is empty",
                              taxon=taxon,
                              rule=1)
            return assocparser.ParseResult(line, [], True)
        if taxon == "":
            self.report.error(line,
                              Report.INVALID_TAXON,
                              "EMPTY",
                              "taxon column is empty",
                              taxon=taxon,
                              rule=1)
            return assocparser.ParseResult(line, [], True)
        if reference == "":
            self.report.error(line,
                              Report.INVALID_ID,
                              "EMPTY",
                              "reference column 6 is empty",
                              taxon=taxon,
                              rule=1)
            return assocparser.ParseResult(line, [], True)

        # An unknown assigned_by only produces a warning; the line is still processed.
        if self.config.group_idspace is not None and assigned_by not in self.config.group_idspace:
            self.report.warning(
                line,
                Report.INVALID_ID,
                assigned_by,
                "GORULE:0000027: assigned_by is not present in groups reference",
                taxon=taxon,
                rule=27)

        if self.config.entity_idspaces is not None and db not in self.config.entity_idspaces:
            # Are we a synonym?
            upgrade = self.config.entity_idspaces.reverse(db)
            if upgrade is not None:
                # If we found a synonym
                self.report.warning(
                    line,
                    Report.INVALID_ID_DBXREF,
                    db,
                    "GORULE:0000027: {} is a synonym for the correct ID {}, and has been updated"
                    .format(db, upgrade),
                    taxon=taxon,
                    rule=27)
                db = upgrade

        ## --
        ## db + db_object_id. CARD=1
        ## --
        id = self._pair_to_id(db, db_object_id)
        if not self._validate_id(
                id, split_line, allowed_ids=self.config.entity_idspaces):

            return assocparser.ParseResult(line, [], True)

        # Using a given gpi file to validate the gene object
        if self.gpi is not None:
            entity = self.gpi.get(id, None)
            if entity is not None:
                # Values from the GPI entry replace the ones found on the line.
                db_object_symbol = entity["symbol"]
                db_object_name = entity["name"]
                db_object_synonym = entity["synonyms"]
                db_object_type = entity["type"]

        if not self._validate_id(goid, split_line, context=ANNOTATION):
            # NOTE(review): stdout debug output in library code; consider routing through self.report or a logger.
            print("skipping because {} not validated!".format(goid))
            return assocparser.ParseResult(line, [], True)

        valid_goid = self._validate_ontology_class_id(goid, split_line)
        if valid_goid is None:
            return assocparser.ParseResult(line, [], True)
        goid = valid_goid

        date = self._normalize_gaf_date(date, split_line)
        if date is None:
            return assocparser.ParseResult(line, [], True)

        # Keep the normalized date in vals so the regenerated line and the GO rules see it.
        vals[13] = date

        ecomap = self.config.ecomap
        if ecomap is not None:
            if ecomap.coderef_to_ecoclass(evidence, reference) is None:
                self.report.error(
                    line,
                    assocparser.Report.UNKNOWN_EVIDENCE_CLASS,
                    evidence,
                    msg="Expecting a known ECO GAF code, e.g ISS",
                    rule=1)
                return assocparser.ParseResult(line, [], True)

        # Throw out the line if it uses GO_REF:0000033, see https://github.com/geneontology/go-site/issues/563#event-1519351033
        if "GO_REF:0000033" in reference.split("|"):
            self.report.error(
                line,
                assocparser.Report.INVALID_ID,
                reference,
                msg=
                "Disallowing GO_REF:0000033 in reference field as of 03/13/2018",
                rule=30)
            return assocparser.ParseResult(line, [], True)

        references = self.validate_pipe_separated_ids(reference, split_line)
        if references is None:
            # Reporting occurs in above function call
            return assocparser.ParseResult(line, [], True)

        references = self.normalize_refs(references, split_line)

        # With/From
        withfroms = self.validate_pipe_separated_ids(withfrom,
                                                     split_line,
                                                     empty_allowed=True,
                                                     extra_delims=",")
        if withfroms is None:
            # Reporting occurs in above function call
            return assocparser.ParseResult(line, [], True)

        # validation
        self._validate_symbol(db_object_symbol, split_line)

        # Example use case: mapping from UniProtKB to MOD ID
        if self.config.entity_map is not None:
            id = self.map_id(id, self.config.entity_map)
            toks = id.split(":")
            db = toks[0]
            # Re-join the local part into a string. Storing the `toks[1:]` list in vals
            # made the "\t".join(vals) below raise TypeError whenever an entity map was set.
            db_object_id = ":".join(toks[1:])
            vals[1] = db_object_id
            # NOTE(review): vals[0] still holds the pre-mapping db prefix - confirm whether
            # the regenerated line is meant to carry the mapped prefix as well.

        if goid.startswith("GO:") and aspect.upper() not in ["C", "F", "P"]:
            self.report.error(line,
                              assocparser.Report.INVALID_ASPECT,
                              aspect,
                              rule=28)
            return assocparser.ParseResult(line, [], True)

        go_rule_results = qc.test_go_rules(vals, self.config)
        for rule_id, result in go_rule_results.items():
            if result.result_type == qc.ResultType.WARNING:
                self.report.warning(line,
                                    assocparser.Report.VIOLATES_GO_RULE,
                                    goid,
                                    msg="{id}: {message}".format(
                                        id=rule_id, message=result.message),
                                    rule=int(rule_id.split(":")[1]))

            if result.result_type == qc.ResultType.ERROR:
                self.report.error(line,
                                  assocparser.Report.VIOLATES_GO_RULE,
                                  goid,
                                  msg="{id}: {message}".format(
                                      id=rule_id, message=result.message),
                                  rule=int(rule_id.split(":")[1]))
                # Skip the annotation
                return assocparser.ParseResult(line, [], True)

        ## --
        ## end of line re-processing
        ## --
        # regenerate line post-mapping
        line = "\t".join(vals)

        ## --
        ## taxon CARD={1,2}
        ## --
        ## if a second value is specified, this is the interacting taxon
        taxons = taxon.split("|")
        normalized_taxon = self._taxon_id(taxons[0], split_line)
        if normalized_taxon is None:
            self.report.error(line,
                              assocparser.Report.INVALID_TAXON,
                              taxon,
                              msg="Taxon ID is invalid")
            return assocparser.ParseResult(line, [], True)

        self._validate_taxon(normalized_taxon, split_line)

        interacting_taxon = None
        if len(taxons) == 2:
            interacting_taxon = self._taxon_id(taxons[1], split_line)
            if interacting_taxon is None:
                self.report.error(line,
                                  assocparser.Report.INVALID_TAXON,
                                  taxon,
                                  msg="Taxon ID is invalid")
                return assocparser.ParseResult(line, [], True)

        ## --
        ## db_object_synonym CARD=0..*
        ## --
        synonyms = db_object_synonym.split("|")
        if db_object_synonym == "":
            synonyms = []

        ## --
        ## parse annotation extension
        ## See appendix in http://doi.org/10.1186/1471-2105-15-155
        ## --
        object_or_exprs = self._parse_full_extension_expression(
            annotation_xp, line=split_line)

        ## --
        ## qualifier
        ## --
        negated, relation, other_qualifiers = self._parse_qualifier(
            qualifier, aspect)

        ## --
        ## goid
        ## --
        # TODO We shouldn't overload builtin keywords/functions
        object = {'id': goid, 'taxon': normalized_taxon}

        # construct subject dict
        subject = {
            'id': id,
            'label': db_object_symbol,
            'type': db_object_type,
            'fullname': db_object_name,
            'synonyms': synonyms,
            'taxon': {
                'id': normalized_taxon
            }
        }

        ## --
        ## gene_product_isoform
        ## --
        ## This is mapped to a more generic concept of subject_extensions
        subject_extns = []
        if gene_product_isoform is not None and gene_product_isoform != '':
            subject_extns.append({
                'property': 'isoform',
                'filler': gene_product_isoform
            })

        object_extensions = {}
        if object_or_exprs is not None and len(object_or_exprs) > 0:
            object_extensions['union_of'] = object_or_exprs

        ## --
        ## evidence
        ## reference
        ## withfrom
        ## --
        evidence_obj = {
            'type': evidence,
            'has_supporting_reference': references,
            'with_support_from': withfroms
        }

        ## Construct main return dict
        assoc = {
            'source_line': line,
            'subject': subject,
            'object': object,
            'negated': negated,
            'qualifiers': other_qualifiers,
            'aspect': aspect,
            'relation': {
                'id': relation
            },
            'interacting_taxon': interacting_taxon,
            'evidence': evidence_obj,
            'provided_by': assigned_by,
            'date': date,
            'subject_extensions': subject_extns,
            'object_extensions': object_extensions
        }

        return assocparser.ParseResult(line, [assoc], False, evidence.upper())
예제 #3
0
    def parse_line(self, line):
        """
        Parses a single line of a GAF

        Returns an ``assocparser.ParseResult``. For a valid data line it holds
        one association dict per ``|``-separated disjunct of the annotation
        extension column (a single association when that column is empty).
        Header lines and rejected lines yield an empty association list;
        rejected lines additionally pass ``True`` as the third argument.

        Note: most applications will only need to call this directly if they require fine-grained control of parsing. For most purposes,
        :method:`parse_file` can be used over the whole file

        Arguments
        ---------
        line : str
            A single tab-separated line from a GAF file

        """

        # Returns assocparser.ParseResult
        parsed = super().validate_line(line)
        if parsed:
            return parsed

        if self.is_header(line):
            return assocparser.ParseResult(line, [], False)

        vals = [el.strip() for el in line.split("\t")]

        # GAF v1 is defined as 15 cols, GAF v2 as 17.
        # We treat everything as GAF2 by adding two blank columns.
        # TODO: check header metadata to see if columns corresponds to declared dataformat version
        if 17 > len(vals) >= 15:
            vals += [""] * (17 - len(vals))

        if len(vals) != 17:
            self.report.error(
                line,
                assocparser.Report.WRONG_NUMBER_OF_COLUMNS,
                "",
                msg=
                "There were {columns} columns found in this line, and there should be 15 (for GAF v1) or 17 (for GAF v2)"
                .format(columns=len(vals)))
            return assocparser.ParseResult(line, [], True)

        [
            db, db_object_id, db_object_symbol, qualifier, goid, reference,
            evidence, withfrom, aspect, db_object_name, db_object_synonym,
            db_object_type, taxon, date, assigned_by, annotation_xp,
            gene_product_isoform
        ] = vals

        ## --
        ## db + db_object_id. CARD=1
        ## --
        id = self._pair_to_id(db, db_object_id)
        if not self._validate_id(id, line, ENTITY):
            # NOTE(review): stdout debug output in library code; consider routing through self.report or a logger.
            print("skipping cause {} not validated!".format(id))
            return assocparser.ParseResult(line, [], True)

        if not self._validate_id(goid, line, ANNOTATION):
            print("skipping cause {} not validated!".format(goid))
            return assocparser.ParseResult(line, [], True)

        date = self._normalize_gaf_date(date, line)

        ecomap = self.config.ecomap
        if ecomap is not None:
            if ecomap.coderef_to_ecoclass(evidence, reference) is None:
                self.report.error(
                    line,
                    assocparser.Report.UNKNOWN_EVIDENCE_CLASS,
                    evidence,
                    msg="Expecting a known ECO GAF code, e.g ISS")
                return assocparser.ParseResult(line, [], True)

        # validation
        self._validate_symbol(db_object_symbol, line)

        # Example use case: mapping from UniProtKB to MOD ID
        if self.config.entity_map is not None:
            id = self.map_id(id, self.config.entity_map)
            toks = id.split(":")
            db = toks[0]
            # Re-join the local part into a string. Storing the `toks[1:]` list in vals
            # made the "\t".join(vals) below raise TypeError whenever an entity map was set.
            db_object_id = ":".join(toks[1:])
            vals[1] = db_object_id
            # NOTE(review): vals[0] still holds the pre-mapping db prefix - confirm whether
            # the regenerated line is meant to carry the mapped prefix as well.

        ## --
        ## end of line re-processing
        ## --
        # regenerate line post-mapping
        line = "\t".join(vals)

        ## --
        ## taxon CARD={1,2}
        ## --
        ## if a second value is specified, this is the interacting taxon
        ## (it is normalized here but not used any further in this method)
        taxa = [self._taxon_id(x) for x in taxon.split("|")]
        taxon = taxa[0]
        self._validate_taxon(taxon, line)

        ## --
        ## db_object_synonym CARD=0..*
        ## --
        synonyms = db_object_synonym.split("|")
        if db_object_synonym == "":
            synonyms = []

        ## --
        ## process associations
        ## --
        ## note that any disjunctions in the annotation extension
        ## will result in the generation of multiple associations
        assocs = []
        xp_ors = annotation_xp.split("|")
        for xp_or in xp_ors:

            # gather conjunctive expressions in extensions field
            xp_ands = xp_or.split(",")
            extns = []
            for xp_and in xp_ands:
                if xp_and != "":
                    expr = self._parse_relationship_expression(xp_and,
                                                               line=line)
                    if expr is not None:
                        extns.append(expr)

            ## --
            ## qualifier
            ## --
            negated, relation, other_qualifiers = self._parse_qualifier(
                qualifier, aspect)

            ## --
            ## goid
            ## --
            # TODO We shouldn't overload builtin keywords/functions
            object = {'id': goid, 'taxon': taxon}

            # construct subject dict
            subject = {
                'id': id,
                'label': db_object_symbol,
                'type': db_object_type,
                'fullname': db_object_name,
                'synonyms': synonyms,
                'taxon': {
                    'id': taxon
                }
            }

            ## --
            ## gene_product_isoform
            ## --
            ## This is mapped to a more generic concept of subject_extensions
            subject_extns = []
            if gene_product_isoform is not None and gene_product_isoform != '':
                subject_extns.append({
                    'property': 'isoform',
                    'filler': gene_product_isoform
                })

            ## --
            ## evidence
            ## reference
            ## withfrom
            ## --
            evidence_obj = {
                'type': evidence,
                'has_supporting_reference': self._split_pipe(reference)
            }
            evidence_obj['with_support_from'] = self._split_pipe(withfrom)

            ## Construct main return dict
            assoc = {
                'source_line': line,
                'subject': subject,
                'object': object,
                'negated': negated,
                'qualifiers': other_qualifiers,
                'aspect': aspect,
                'relation': {
                    'id': relation
                },
                'evidence': evidence_obj,
                'provided_by': assigned_by,
                'date': date,
            }
            # Extension keys are only present when there is something to report.
            if len(subject_extns) > 0:
                assoc['subject_extensions'] = subject_extns
            if len(extns) > 0:
                assoc['object_extensions'] = extns

            self._validate_assoc(assoc, line)

            assocs.append(assoc)

        return assocparser.ParseResult(line, assocs, False, evidence.upper())
예제 #4
0
    def parse_line(self, line):
        """
        Parses a single line of a GAF

        Return a tuple `(processed_line, associations)`. Typically
        there will be a single association, but in some cases there
        may be none (invalid line) or multiple (disjunctive clause in
        annotation extensions)

        Note: most applications will only need to call this directly if they require fine-grained control of parsing. For most purposes,
        :method:`parse_file` can be used over the whole file

        Arguments
        ---------
        line : str
            A single tab-seperated line from a GAF file

        """

        # Returns assocparser.ParseResult
        parsed = super().validate_line(line)
        if parsed:
            return parsed

        if self.is_header(line):
            # Save off version info here
            if self.version is None:
                # We are still looking
                parsed = parser_version_regex.findall(line)
                if len(parsed) == 1:
                    filetype, version, _ = parsed[0]
                    if version == "2.2":
                        logger.info("Detected GAF version 2.2")
                        self.version = version
                    else:
                        logger.info("Detected GAF version {}, so using 2.1".format(version))
                        self.version = self.default_version
                        # Compute the cell component subclass closure
                        self.make_internal_cell_component_closure()

            return assocparser.ParseResult(line, [{ "header": True, "line": line.strip() }], False)

        # At this point, we should have gone through all the header, and a version number should be established
        if self.version is None:
            logger.warning("No version number found for this file so we will assume GAF version: {}".format(self.default_version))
            self.version = self.default_version
            self.make_internal_cell_component_closure()

        vals = [el.strip() for el in line.split("\t")]

        # GAF v1 is defined as 15 cols, GAF v2 as 17.
        # We treat everything as GAF2 by adding two blank columns.
        # TODO: check header metadata to see if columns corresponds to declared dataformat version

        parsed = to_association(list(vals), report=self.report, qualifier_parser=self.qualifier_parser(), bio_entities=self.bio_entities)
        if parsed.associations == []:
            return parsed

        assoc = parsed.associations[0]

        # Qualifier is index 3
        # If we are 2.1, and qualifier has no relation
        # Also must have an ontology
        # Then upgrade
        # For https://github.com/geneontology/go-site/issues/1558
        if self.gaf_version() == "2.1" and (vals[3] == "" or vals[3] == "NOT") and self.config.ontology:
            assoc = self.upgrade_empty_qualifier(assoc)

        ## Run GO Rules, save split values into individual variables
        # print("Config is {}".format(self.config.__dict__.keys()))
        go_rule_results = qc.test_go_rules(assoc, self.config, group=self.group)
        for rule, result in go_rule_results.all_results.items():
            if result.result_type == qc.ResultType.WARNING:
                self.report.warning(line, assocparser.Report.VIOLATES_GO_RULE, "",
                                    msg="{id}: {message}".format(id=rule.id, message=result.message), rule=int(rule.id.split(":")[1]))

            if result.result_type == qc.ResultType.ERROR:
                self.report.error(line, assocparser.Report.VIOLATES_GO_RULE, "",
                                    msg="{id}: {message}".format(id=rule.id, message=result.message), rule=int(rule.id.split(":")[1]))
                # Skip the annotation
                return assocparser.ParseResult(line, [], True)

            if result.result_type == qc.ResultType.PASS:
                self.report.message(assocparser.Report.INFO, line, Report.RULE_PASS, "",
                                    msg="Passing Rule", rule=int(rule.id.split(":")[1]))

        assoc = go_rule_results.annotation  # type: association.GoAssociation
        split_line = assocparser.SplitLine(line=line, values=vals, taxon=str(assoc.object.taxon))

        if self.config.group_idspace is not None and assoc.provided_by not in self.config.group_idspace:
            self.report.warning(line, Report.INVALID_ID, assoc.provided_by,
                "GORULE:0000027: assigned_by is not present in groups reference", taxon=str(assoc.object.taxon), rule=27)

        db = assoc.subject.id.namespace
        if self.config.entity_idspaces is not None and db not in self.config.entity_idspaces:
            # Are we a synonym?
            upgrade = self.config.entity_idspaces.reverse(db)
            if upgrade is not None:
                # If we found a synonym
                self.report.warning(line, Report.INVALID_ID_DBXREF, db, "GORULE:0000027: {} is a synonym for the correct ID {}, and has been updated".format(db, upgrade), taxon=str(assoc.object.taxon), rule=27)
                assoc.subject.id.namespace = upgrade

        ## --
        ## db + db_object_id. CARD=1
        ## --assigned_by
        if not self._validate_id(str(assoc.subject.id), split_line, allowed_ids=self.config.entity_idspaces):
            return assocparser.ParseResult(line, [], True)

        # Using a given gpi file to validate the gene object
        # if self.gpi is not None:
        #     entity = self.gpi.get(str(assoc.subject.id), None)
        #     if entity is not None:
        #         assoc.subject.label = entity["symbol"]
        #         assoc.subject.fullname = entity["name"]
        #         assoc.subject.synonyms = entity["synonyms"].split("|")
        #         assoc.subject.type = entity["type"]

        if not self._validate_id(str(assoc.object.id), split_line, context=ANNOTATION):
            print("skipping because {} not validated!".format(assoc.object.id))
            return assocparser.ParseResult(line, [], True)

        valid_goid = self._validate_ontology_class_id(str(assoc.object.id), split_line)
        if valid_goid is None:
            return assocparser.ParseResult(line, [], True)
        assoc.object.id = association.Curie.from_str(valid_goid)

        references = self.validate_curie_ids(assoc.evidence.has_supporting_reference, split_line)
        if references is None:
            # Reporting occurs in above function call
            return assocparser.ParseResult(line, [], True)

        # With/From
        for wf in assoc.evidence.with_support_from:
            validated = self.validate_curie_ids(wf.elements, split_line)
            if validated is None:
                return assocparser.ParseResult(line, [], True)
        with_support_from = self._unroll_withfrom_and_replair_obsoletes(split_line, 'gaf')
        if with_support_from is None:
            return assocparser.ParseResult(line, [], True)
        assoc.evidence.with_support_from = with_support_from
        # validation
        self._validate_symbol(assoc.subject.label, split_line)


        ## --
        ## taxon CARD={1,2}
        ## --
        ## if a second value is specified, this is the interacting taxon
        ## We do not use the second value
        valid_taxon = self._validate_taxon(str(assoc.object.taxon), split_line)
        valid_interacting = self._validate_taxon(str(assoc.interacting_taxon), split_line) if assoc.interacting_taxon else True
        if not valid_taxon:
            self.report.error(line, assocparser.Report.INVALID_TAXON, str(assoc.object.taxon), "Taxon ID is invalid", rule=27)
        if not valid_interacting:
            self.report.error(line, assocparser.Report.INVALID_TAXON, str(assoc.interacting_taxon), "Taxon ID is invalid", rule=27)
        if (not valid_taxon) or (not valid_interacting):
            return assocparser.ParseResult(line, [], True)

        return assocparser.ParseResult(line, [assoc], False, vals[6])
예제 #5
0
def from_2_0(gpad_line: List[str],
             report=None,
             group="unknown",
             dataset="unknown",
             bio_entities=None):
    """Convert the columns of a single GPAD 2.0 line into a ``GoAssociation``.

    Arguments
    ---------
    gpad_line : List[str]
        Column values of one line. Lines with 10 or 11 columns are padded
        with empty strings up to 12; columns beyond the twelfth are dropped
        with a warning. The caller's list is never modified.
    report : optional
        Collector for errors and warnings. When omitted, a new ``Report``
        built from `group` and `dataset` is used.
    group, dataset : str
        Only used to label the ``Report`` created when `report` is omitted.
    bio_entities : optional
        Lookup queried with ``.get(subject_curie)``. When it yields an entity,
        that entity replaces the placeholder subject and supplies the taxon.
        The lookup is skipped when this argument is omitted.

    Returns
    -------
    assocparser.ParseResult
        Built with ``[association]`` and ``False`` on success, or with ``[]``
        and ``True`` when the line was rejected (the reason is recorded on
        the report).
    """
    # The signature advertises ``report=None``, so make that default usable
    report = Report(group=group, dataset=dataset) if report is None else report
    source_line = "\t".join(gpad_line)

    def rejected():
        # Result for a line that could not be turned into an association
        return assocparser.ParseResult(source_line, [], True, report=report)

    if source_line == "":
        report.error(source_line,
                     "Blank Line",
                     "EMPTY",
                     "Blank lines are not allowed",
                     rule=1)
        return rejected()

    if len(gpad_line) > 12:
        report.warning(
            source_line,
            assocparser.Report.WRONG_NUMBER_OF_COLUMNS,
            "",
            msg=
            "There were more than 12 columns in this line. Proceeding by cutting off extra columns.",
            rule=1)

        gpad_line = gpad_line[:12]

    if 12 > len(gpad_line) >= 10:
        # Pad into a new list so the caller's list is left untouched
        gpad_line = gpad_line + [""] * (12 - len(gpad_line))

    if len(gpad_line) != 12:
        report.error(
            source_line,
            assocparser.Report.WRONG_NUMBER_OF_COLUMNS,
            "",
            msg=
            "There were {columns} columns found in this line, and there should be between 10 and 12"
            .format(columns=len(gpad_line)),
            rule=1)
        return rejected()

    ## check for missing columns
    ## We use indices here because we run GO RULES before we split the vals into individual variables
    SUBJECT_CURIE = 0
    NEGATION_INDEX = 1
    RELATION = 2
    ONTOLOGY_CLASS_INDEX = 3
    REFERENCE_INDEX = 4
    EVIDENCE_INDEX = 5
    WITH_FROM_INDEX = 6
    INTERACTING_TAXON_INDEX = 7
    DATE_INDEX = 8
    ASSIGNED_BY_INDEX = 9
    EXTENSION_INDEX = 10
    PROPERTIES_INDEX = 11
    required = [
        SUBJECT_CURIE, RELATION, ONTOLOGY_CLASS_INDEX, REFERENCE_INDEX,
        EVIDENCE_INDEX, DATE_INDEX, ASSIGNED_BY_INDEX
    ]
    for req in required:
        if gpad_line[req] == "":
            report.error(source_line,
                         Report.INVALID_ID,
                         "EMPTY",
                         "Column {} is empty".format(req + 1),
                         rule=1)
            return rejected()

    # Placeholder taxon, replaced below if the subject is a known entity
    taxon = association.Curie("NCBITaxon", "0")

    def symbol_error(value, message):
        # Record an INVALID_SYMBOL error for `value` (tagged with the taxon
        # known at the time of the call) and produce the rejection result
        report.error(source_line,
                     Report.INVALID_SYMBOL,
                     value,
                     message,
                     taxon=str(taxon),
                     rule=1)
        return rejected()

    subject_curie = association.Curie.from_str(gpad_line[SUBJECT_CURIE])
    if subject_curie.is_error():
        return symbol_error(gpad_line[SUBJECT_CURIE],
                            "Problem parsing DB Object")

    subject = association.Subject(subject_curie, "", "", [], "", taxon)
    # ``bio_entities`` defaults to None; only consult it when one was supplied
    entity = bio_entities.get(subject_curie) if bio_entities is not None else None
    if entity is not None:
        # If we found a subject entity, then set `subject` to the found entity
        subject = entity
        taxon = subject.taxon

    negated = gpad_line[NEGATION_INDEX] == "NOT"

    relation = association.Curie.from_str(gpad_line[RELATION])
    if relation.is_error():
        return symbol_error(gpad_line[RELATION], "Problem parsing Relation")

    go_term = association.Curie.from_str(gpad_line[ONTOLOGY_CLASS_INDEX])
    if go_term.is_error():
        return symbol_error(gpad_line[ONTOLOGY_CLASS_INDEX],
                            "Problem parsing GO Term")

    term = association.Term(go_term, taxon)

    evidence_type = association.Curie.from_str(gpad_line[EVIDENCE_INDEX])
    if evidence_type.is_error():
        return symbol_error(gpad_line[EVIDENCE_INDEX],
                            "Problem parsing Evidence ECO Curie")

    # References are pipe separated; empty fragments are ignored
    references = [
        association.Curie.from_str(e)
        for e in gpad_line[REFERENCE_INDEX].split("|") if e
    ]
    if any(r.is_error() for r in references):
        return symbol_error(gpad_line[REFERENCE_INDEX],
                            "Problem parsing references")

    withfroms = association.ConjunctiveSet.str_to_conjunctions(
        gpad_line[WITH_FROM_INDEX])  # Returns a list of ConjuctiveSets or Error
    if isinstance(withfroms, association.Error):
        return symbol_error(gpad_line[WITH_FROM_INDEX],
                            "Problem parsing With/From column")

    evidence = association.Evidence(evidence_type, references, withfroms)

    interacting_taxon = None
    if gpad_line[INTERACTING_TAXON_INDEX] != "":
        interacting_taxon = association.Curie.from_str(
            gpad_line[INTERACTING_TAXON_INDEX])
        if interacting_taxon.is_error():
            return symbol_error(gpad_line[INTERACTING_TAXON_INDEX],
                                "Problem parsing Interacting Taxon")

    # parse_iso_date is handed the report and yields None for a date it rejects
    date = assocparser.parse_iso_date(gpad_line[DATE_INDEX], report,
                                      source_line)
    if date is None:
        return rejected()

    conjunctions = []
    # The elements of the extension units are Curie(Curie)
    if gpad_line[EXTENSION_INDEX]:
        conjunctions = association.ConjunctiveSet.str_to_conjunctions(
            gpad_line[EXTENSION_INDEX],
            conjunct_element_builder=association.ExtensionUnit.from_curie_str)

        if isinstance(conjunctions, association.Error):
            report.error(source_line,
                         Report.EXTENSION_SYNTAX_ERROR,
                         conjunctions.info,
                         "extensions should be relation(curie)",
                         taxon=str(taxon),
                         rule=1)
            return rejected()

    properties_list = association.parse_annotation_properties(
        gpad_line[PROPERTIES_INDEX])

    a = association.GoAssociation(source_line=source_line,
                                  subject=subject,
                                  relation=relation,
                                  object=term,
                                  negated=negated,
                                  qualifiers=[relation],
                                  aspect=None,
                                  interacting_taxon=interacting_taxon,
                                  evidence=evidence,
                                  subject_extensions=[],
                                  object_extensions=conjunctions,
                                  provided_by=gpad_line[ASSIGNED_BY_INDEX],
                                  date=date,
                                  properties=properties_list)

    return assocparser.ParseResult(source_line, [a], False, report=report)
예제 #6
0
    def parse_line(self, line):
        """Parses a single line of a GPAD.

        Returns an `assocparser.ParseResult` holding the original line and a
        list of associations. The list contains a single association dict
        when the line parses, and is empty for header lines and for lines
        that were rejected (the reason is logged on `self.report`).

        Note: most applications will only need to call this directly if they require fine-grained control of parsing. For most purposes,
        :method:`parse_file` can be used over the whole file

        Arguments
        ---------
        line : str
            A single tab-separated line from a GPAD file

        """
        parsed = super().validate_line(line)
        if parsed:
            return parsed

        if self.is_header(line):
            return assocparser.ParseResult(line, [], False)

        vals = [el.strip() for el in line.split("\t")]
        if not 10 <= len(vals) <= 12:
            self.report.error(
                line,
                assocparser.Report.WRONG_NUMBER_OF_COLUMNS,
                "",
                msg=
                "There were {columns} columns found in this line, and there should be between 10 and 12"
                .format(columns=len(vals)))
            return assocparser.ParseResult(line, [], True)

        # Columns 11 and 12 are optional; pad so the unpacking below always works
        vals += [""] * (12 - len(vals))

        (db, db_object_id, qualifier, goid, reference, evidence, withfrom,
         interacting_taxon_id, date, assigned_by, annotation_xp,
         annotation_properties) = vals

        split_line = assocparser.SplitLine(line=line, values=vals, taxon="")

        subject_id = self._pair_to_id(db, db_object_id)
        if not self._validate_id(subject_id, split_line, context=ENTITY):
            return assocparser.ParseResult(line, [], True)

        if not self._validate_id(goid, split_line, context=ANNOTATION):
            return assocparser.ParseResult(line, [], True)

        # The validator hands back the ID to use from here on, or None to reject
        valid_goid = self._validate_ontology_class_id(goid, split_line)
        if valid_goid is None:
            return assocparser.ParseResult(line, [], True)
        goid = valid_goid

        date = self._normalize_gaf_date(date, split_line)

        if reference == "":
            # Reference is the fifth GPAD column (index 4 in `vals`)
            self.report.error(line, Report.INVALID_ID, "EMPTY",
                              "reference column 5 is empty")
            return assocparser.ParseResult(line, [], True)

        # NOTE(review): the outcome of this check is not acted upon here, so a
        # failing evidence ID does not reject the line -- confirm this is intended
        self._validate_id(evidence, split_line)

        interacting_taxon = None
        if interacting_taxon_id != "":
            interacting_taxon = self._taxon_id(interacting_taxon_id,
                                               split_line)
            if interacting_taxon is None:
                self.report.error(line,
                                  assocparser.Report.INVALID_TAXON,
                                  interacting_taxon_id,
                                  msg="Taxon ID is invalid")
                return assocparser.ParseResult(line, [], True)

        #TODO: ecomap is currently one-way only
        #ecomap = self.config.ecomap
        #if ecomap != None:
        #    if ecomap.ecoclass_to_coderef(evidence) == (None,None):
        #        self.report.error(line, Report.UNKNOWN_EVIDENCE_CLASS, evidence,
        #                          msg="Expecting a known ECO class ID")

        ## --
        ## qualifier
        ## --
        negated, relation, other_qualifiers = self._parse_qualifier(
            qualifier, None)

        # Reference Column
        references = self.validate_pipe_separated_ids(reference, split_line)
        if references is None:
            # Reporting occurs in above function call
            return assocparser.ParseResult(line, [], True)

        # With/From
        withfroms = self.validate_pipe_separated_ids(withfrom,
                                                     split_line,
                                                     empty_allowed=True,
                                                     extra_delims=",")
        if withfroms is None:
            # Reporting occurs in above function call
            return assocparser.ParseResult(line, [], True)

        ## --
        ## parse annotation extension
        ## See appendix in http://doi.org/10.1186/1471-2105-15-155
        ## --
        object_or_exprs = self._parse_full_extension_expression(
            annotation_xp, line=split_line)

        assoc = {
            'source_line': line,
            'subject': {
                'id': subject_id
            },
            'object': {
                'id': goid
            },
            'negated': negated,
            'relation': {
                'id': relation
            },
            'interacting_taxon': interacting_taxon,
            'evidence': {
                'type': evidence,
                'with_support_from': withfroms,
                'has_supporting_reference': references
            },
            'provided_by': assigned_by,
            'date': date,
        }
        # Optional keys are only added when there is something to put in them
        if len(other_qualifiers) > 0:
            assoc['qualifiers'] = other_qualifiers
        if object_or_exprs is not None and len(object_or_exprs) > 0:
            assoc['object']['extensions'] = {'union_of': object_or_exprs}

        return assocparser.ParseResult(line, [assoc], False)
예제 #7
0
    def parse_line(self, line):
        """Parse one line of a GPAD file into an `assocparser.ParseResult`.

        Header lines are returned as a single ``{"header": ..., "line": ...}``
        record and are also inspected for a format version declaration until
        a version has been settled. Any other line is converted with
        `to_association`, run through the GO rules and then validated field
        by field; the result carries one association when everything passes
        and an empty list when the line is rejected.

        Note: most applications will only need to call this directly if they require fine-grained control of parsing. For most purposes,
        :method:`parse_file` can be used over the whole file

        Arguments
        ---------
        line : str
            A single tab-separated line from a GPAD file

        """
        early_result = super().validate_line(line)
        if early_result:
            return early_result

        if self.is_header(line):
            if self.version is None:
                # Version not settled yet: does this header line declare one?
                version_matches = parser_version_regex.findall(line)
                if len(version_matches) == 1:
                    _, declared_version, _ = version_matches[0]
                    if declared_version != "2.0":
                        logger.info(
                            "Detected GPAD version {}, so defaulting to 1.2".
                            format(declared_version))
                        self.version = self.default_version
                    else:
                        logger.info("Detected GPAD version 2.0")
                        self.version = declared_version

            header_record = {"header": True, "line": line.strip()}
            return assocparser.ParseResult(line, [header_record], False)

        # The header is behind us; fall back to the default if no version was declared
        if self.version is None:
            logger.warning(
                "No version number found for this file so we will assume GPAD version: {}"
                .format(self.default_version))
            self.version = self.default_version

        def skipped():
            # Result used whenever the line is dropped
            return assocparser.ParseResult(line, [], True)

        columns = [cell.strip() for cell in line.split("\t")]

        conversion = to_association(list(columns),
                                    report=self.report,
                                    version=self.gpad_version(),
                                    bio_entities=self.bio_entities)
        if conversion.associations == []:
            return conversion

        candidate = conversion.associations[0]

        rule_outcomes = qc.test_go_rules(candidate, self.config)
        for rule, outcome in rule_outcomes.all_results.items():
            outcome_type = outcome.result_type
            if outcome_type == qc.ResultType.WARNING:
                self.report.warning(line,
                                    assocparser.Report.VIOLATES_GO_RULE,
                                    "",
                                    msg="{id}: {message}".format(
                                        id=rule.id, message=outcome.message),
                                    rule=int(rule.id.split(":")[1]))
            elif outcome_type == qc.ResultType.ERROR:
                self.report.error(line,
                                  assocparser.Report.VIOLATES_GO_RULE,
                                  "",
                                  msg="{id}: {message}".format(
                                      id=rule.id, message=outcome.message),
                                  rule=int(rule.id.split(":")[1]))
                # A rule error drops the annotation
                return skipped()
            elif outcome_type == qc.ResultType.PASS:
                self.report.message(assocparser.Report.INFO,
                                    line,
                                    Report.RULE_PASS,
                                    "",
                                    msg="Passing Rule",
                                    rule=int(rule.id.split(":")[1]))

        # Continue with the annotation handed back by the rules run
        assoc = rule_outcomes.annotation  # type: association.GoAssociation

        split_line = assocparser.SplitLine(line=line, values=columns, taxon="")

        subject_ok = self._validate_id(str(assoc.subject.id),
                                       split_line,
                                       context=ENTITY)
        if not subject_ok:
            return skipped()

        term_ok = self._validate_id(str(assoc.object.id),
                                    split_line,
                                    context=ANNOTATION)
        if not term_ok:
            return skipped()

        # The ontology check yields the ID string to store, or None to reject
        checked_goid = self._validate_ontology_class_id(
            str(assoc.object.id), split_line)
        if checked_goid is None:
            return skipped()
        assoc.object.id = association.Curie.from_str(checked_goid)

        evidence_ok = self._validate_id(str(assoc.evidence.type), split_line)
        if not evidence_ok:
            return skipped()

        if assoc.interacting_taxon and not self._validate_taxon(
                str(assoc.interacting_taxon), split_line):
            self.report.error(line,
                              assocparser.Report.INVALID_TAXON,
                              str(assoc.interacting_taxon),
                              "Taxon ID is invalid",
                              rule=27)
            return skipped()

        # TODO: ecomap is currently one-way only, so the evidence class is
        # not checked against it here

        # Reference column
        checked_references = self.validate_curie_ids(
            assoc.evidence.has_supporting_reference, split_line)
        if checked_references is None:
            return skipped()

        # With/From: stop at the first conjunction that fails validation
        if any(
                self.validate_curie_ids(conjunction.elements, split_line) is None
                for conjunction in assoc.evidence.with_support_from):
            return skipped()

        return assocparser.ParseResult(line, [assoc], False)
예제 #8
0
def from_1_2(gpad_line: List[str],
             report=None,
             group="unknown",
             dataset="unknown",
             bio_entities=None):
    """Convert the columns of a single GPAD 1.2 line into a ``GoAssociation``.

    Arguments
    ---------
    gpad_line : List[str]
        Column values of one line. Lines with 10 or 11 columns are padded
        with empty strings up to 12; columns beyond the twelfth are dropped
        with a warning. The caller's list is never modified.
    report : optional
        Collector for errors and warnings. When omitted, a new ``Report``
        built from `group` and `dataset` is used.
    group, dataset : str
        Only used to label the ``Report`` created when `report` is omitted.
    bio_entities : optional
        Lookup queried with ``.get(subject_curie)``. When it yields an entity,
        that entity replaces the placeholder subject and supplies the taxon.
        The lookup is skipped when this argument is omitted.

    Returns
    -------
    assocparser.ParseResult
        Built with ``[association]`` and ``False`` on success, or with ``[]``
        and ``True`` when the line was rejected (the reason is recorded on
        the report).
    """
    # The signature advertises ``report=None``, so make that default usable
    report = Report(group=group, dataset=dataset) if report is None else report
    source_line = "\t".join(gpad_line)

    def rejected():
        # Result for a line that could not be turned into an association
        return assocparser.ParseResult(source_line, [], True, report=report)

    if source_line == "":
        report.error(source_line,
                     "Blank Line",
                     "EMPTY",
                     "Blank lines are not allowed",
                     rule=1)
        return rejected()

    if len(gpad_line) > 12:
        report.warning(
            source_line,
            assocparser.Report.WRONG_NUMBER_OF_COLUMNS,
            "",
            msg=
            "There were more than 12 columns in this line. Proceeding by cutting off extra columns.",
            rule=1)

        gpad_line = gpad_line[:12]

    if 12 > len(gpad_line) >= 10:
        # Pad into a new list so the caller's list is left untouched
        gpad_line = gpad_line + [""] * (12 - len(gpad_line))

    if len(gpad_line) != 12:
        report.error(
            source_line,
            assocparser.Report.WRONG_NUMBER_OF_COLUMNS,
            "",
            msg=
            "There were {columns} columns found in this line, and there should be between 10 and 12"
            .format(columns=len(gpad_line)),
            rule=1)
        return rejected()

    ## check for missing columns
    ## We use indices here because we run GO RULES before we split the vals into individual variables
    DB_INDEX = 0
    DB_OBJECT_INDEX = 1
    QUALIFIER = 2
    ONTOLOGY_CLASS_INDEX = 3
    REFERENCE_INDEX = 4
    EVIDENCE_INDEX = 5
    WITH_FROM_INDEX = 6
    INTERACTING_TAXON_INDEX = 7
    DATE_INDEX = 8
    ASSIGNED_BY_INDEX = 9
    EXTENSION_INDEX = 10
    PROPERTIES_INDEX = 11

    # (column index, error type, message). Every empty required column
    # rejects the line -- the evidence check used to fall through without
    # returning, and the qualifier check used to be filed as a taxon problem.
    required_columns = [
        (DB_INDEX, Report.INVALID_IDSPACE, "col1 is empty"),
        (DB_OBJECT_INDEX, Report.INVALID_ID, "col2 is empty"),
        (QUALIFIER, Report.INVALID_QUALIFIER, "qualifier column is empty"),
        (REFERENCE_INDEX, Report.INVALID_ID, "reference column is empty"),
        (EVIDENCE_INDEX, Report.INVALID_ID, "Evidence column is empty"),
    ]
    for index, error_type, message in required_columns:
        if gpad_line[index] == "":
            report.error(source_line, error_type, "EMPTY", message, rule=1)
            return rejected()

    # Placeholder taxon, replaced below if the subject is a known entity
    taxon = association.Curie("NCBITaxon", "0")

    def symbol_error(value, message):
        # Record an INVALID_SYMBOL error for `value` (tagged with the taxon
        # known at the time of the call) and produce the rejection result
        report.error(source_line,
                     Report.INVALID_SYMBOL,
                     value,
                     message,
                     taxon=str(taxon),
                     rule=1)
        return rejected()

    subject_curie = association.Curie(gpad_line[DB_INDEX],
                                      gpad_line[DB_OBJECT_INDEX])
    subject = association.Subject(subject_curie, "", [""], [], [], taxon)

    # ``bio_entities`` defaults to None; only consult it when one was supplied
    entity = bio_entities.get(subject_curie) if bio_entities is not None else None
    if entity is not None:
        subject = entity
        taxon = subject.taxon

    go_term = association.Curie.from_str(gpad_line[ONTOLOGY_CLASS_INDEX])
    if go_term.is_error():
        return symbol_error(gpad_line[ONTOLOGY_CLASS_INDEX],
                            "Problem parsing GO Term")

    term = association.Term(go_term, taxon)

    evidence_type = association.Curie.from_str(gpad_line[EVIDENCE_INDEX])
    if evidence_type.is_error():
        return symbol_error(gpad_line[EVIDENCE_INDEX],
                            "Problem parsing Evidence ECO Curie")

    # References are pipe separated; empty fragments are ignored
    references = [
        association.Curie.from_str(e)
        for e in gpad_line[REFERENCE_INDEX].split("|") if e
    ]
    if any(r.is_error() for r in references):
        return symbol_error(gpad_line[REFERENCE_INDEX],
                            "Problem parsing references")

    withfroms = association.ConjunctiveSet.str_to_conjunctions(
        gpad_line[WITH_FROM_INDEX])  # Returns a list of ConjuctiveSets or Error
    if isinstance(withfroms, association.Error):
        return symbol_error(gpad_line[WITH_FROM_INDEX],
                            "Problem parsing With/From column")

    evidence = association.Evidence(evidence_type, references, withfroms)

    # The column is non-empty (checked above), but it may hold nothing but NOT
    raw_qs = gpad_line[QUALIFIER].split("|")
    negated = "NOT" in raw_qs

    looked_up_qualifiers = [
        relations.lookup_label(q) for q in raw_qs if q != "NOT"
    ]
    if None in looked_up_qualifiers:
        report.error(source_line,
                     Report.INVALID_QUALIFIER,
                     raw_qs,
                     "Could not find a URI for qualifier",
                     taxon=str(taxon),
                     rule=1)
        return rejected()

    qualifiers = [
        association.Curie.from_str(curie_util.contract_uri(q)[0])
        for q in looked_up_qualifiers
    ]
    if not qualifiers:
        # The first qualifier becomes the relation of the association, so a
        # column without any non-NOT entry cannot be converted; reject it
        # instead of failing on ``qualifiers[0]`` further down
        report.error(source_line,
                     Report.INVALID_QUALIFIER,
                     gpad_line[QUALIFIER],
                     "No relation found in qualifier column",
                     taxon=str(taxon),
                     rule=1)
        return rejected()

    # parse_date is handed the report and yields None for a date it rejects
    date = assocparser.parse_date(gpad_line[DATE_INDEX], report, source_line)
    if date is None:
        return rejected()

    interacting_taxon = None
    if gpad_line[INTERACTING_TAXON_INDEX]:
        taxon_result = gpad_line_validators["taxon"].validate(
            gpad_line[INTERACTING_TAXON_INDEX])
        if not taxon_result.valid:
            report.error(source_line,
                         Report.INVALID_TAXON,
                         taxon_result.original,
                         taxon_result.message,
                         taxon=str(taxon_result.original),
                         rule=1)
            return rejected()
        interacting_taxon = taxon_result.parsed[0]

    conjunctions = []
    if gpad_line[EXTENSION_INDEX]:
        conjunctions = association.ConjunctiveSet.str_to_conjunctions(
            gpad_line[EXTENSION_INDEX],
            conjunct_element_builder=association.ExtensionUnit.from_str)

        if isinstance(conjunctions, association.Error):
            report.error(source_line,
                         Report.EXTENSION_SYNTAX_ERROR,
                         conjunctions.info,
                         "extensions should be relation(curie)",
                         taxon=str(taxon),
                         rule=1)
            return rejected()

    properties_list = association.parse_annotation_properties(
        gpad_line[PROPERTIES_INDEX])

    a = association.GoAssociation(source_line=source_line,
                                  subject=subject,
                                  relation=qualifiers[0],
                                  object=term,
                                  negated=negated,
                                  qualifiers=qualifiers,
                                  aspect=None,
                                  interacting_taxon=interacting_taxon,
                                  evidence=evidence,
                                  subject_extensions=[],
                                  object_extensions=conjunctions,
                                  provided_by=gpad_line[ASSIGNED_BY_INDEX],
                                  date=date,
                                  properties=properties_list)

    return assocparser.ParseResult(source_line, [a], False, report=report)
예제 #9
0
def to_association(gaf_line: List[str],
                   report=None,
                   group="unknown",
                   dataset="unknown") -> assocparser.ParseResult:
    """Build a ``GoAssociation`` from the columns of one GAF line.

    The columns are joined with tabs to form the source line attached to
    every report entry. Lines with more than 17 columns are truncated (with
    a warning), lines with 15 or 16 columns are padded with empty strings,
    and any other column count is rejected. Columns 1, 2, 6 (reference) and
    13 (taxon) must be non-empty.

    Arguments
    ---------
    gaf_line : List[str]
        The column values of a single GAF line. The list is not modified.
    report : Report, optional
        Collector for warnings and errors. When omitted, a new Report is
        created from `group` and `dataset`.
    group : str
        Passed to the new Report when `report` is None.
    dataset : str
        Passed to the new Report when `report` is None.

    Returns
    -------
    assocparser.ParseResult
        On success, a result holding the single parsed association. After
        any reported error, a result with an empty association list and its
        third positional argument set to True.
    """
    report = Report(group=group, dataset=dataset) if report is None else report
    source_line = "\t".join(gaf_line)

    if source_line == "":
        report.error(source_line,
                     "Blank Line",
                     "EMPTY",
                     "Blank lines are not allowed",
                     rule=1)
        return assocparser.ParseResult(source_line, [], True, report=report)

    if len(gaf_line) > 17:
        # If we see more than 17 columns, we will just cut off the columns after column 17
        report.warning(
            source_line,
            assocparser.Report.WRONG_NUMBER_OF_COLUMNS,
            "",
            msg=
            "There were more than 17 columns in this line. Proceeding by cutting off extra columns after column 17.",
            rule=1)
        gaf_line = gaf_line[:17]

    if 17 > len(gaf_line) >= 15:
        # Pad into a new list (not `+=`) so the caller's list is left untouched.
        gaf_line = gaf_line + [""] * (17 - len(gaf_line))

    if len(gaf_line) != 17:
        report.error(
            source_line,
            assocparser.Report.WRONG_NUMBER_OF_COLUMNS,
            "",
            msg=
            "There were {columns} columns found in this line, and there should be 15 (for GAF v1) or 17 (for GAF v2)"
            .format(columns=len(gaf_line)),
            rule=1)
        return assocparser.ParseResult(source_line, [], True, report=report)

    ## check for missing columns
    ## We use indices here because we run GO RULES before we split the vals into individual variables
    DB_INDEX = 0
    DB_OBJECT_INDEX = 1
    TAXON_INDEX = 12
    REFERENCE_INDEX = 5
    if gaf_line[DB_INDEX] == "":
        report.error(source_line,
                     Report.INVALID_IDSPACE,
                     "EMPTY",
                     "col1 is empty",
                     taxon=gaf_line[TAXON_INDEX],
                     rule=1)
        return assocparser.ParseResult(source_line, [], True, report=report)
    if gaf_line[DB_OBJECT_INDEX] == "":
        report.error(source_line,
                     Report.INVALID_ID,
                     "EMPTY",
                     "col2 is empty",
                     taxon=gaf_line[TAXON_INDEX],
                     rule=1)
        return assocparser.ParseResult(source_line, [], True, report=report)
    if gaf_line[TAXON_INDEX] == "":
        report.error(source_line,
                     Report.INVALID_TAXON,
                     "EMPTY",
                     "taxon column is empty",
                     taxon=gaf_line[TAXON_INDEX],
                     rule=1)
        return assocparser.ParseResult(source_line, [], True, report=report)
    if gaf_line[REFERENCE_INDEX] == "":
        report.error(source_line,
                     Report.INVALID_ID,
                     "EMPTY",
                     "reference column 6 is empty",
                     taxon=gaf_line[TAXON_INDEX],
                     rule=1)
        return assocparser.ParseResult(source_line, [], True, report=report)

    # The taxon column holds one taxon, or two separated by "|"; the second
    # one (only when exactly two are present) is the interacting taxon.
    taxon = gaf_line[TAXON_INDEX].split("|")
    taxon_curie = taxon[0].replace("taxon", "NCBITaxon")
    interacting_taxon = taxon[1].replace(
        "taxon", "NCBITaxon") if len(taxon) == 2 else None
    subject_curie = "{db}:{id}".format(db=gaf_line[0], id=gaf_line[1])
    subject = association.Subject(subject_curie, gaf_line[2], gaf_line[9],
                                  gaf_line[10].split("|"), gaf_line[11],
                                  taxon_curie)
    aspect = gaf_line[8]
    negated, relation, qualifiers = assocparser._parse_qualifier(
        gaf_line[3], aspect)

    # For allowed, see http://geneontology.org/docs/go-annotations/#annotation-qualifiers
    for q in qualifiers:

        if q not in allowed_qualifiers:
            report.error(
                source_line,
                Report.INVALID_QUALIFIER,
                q,
                "Qualifiers must be `contributes_to`, `colocalizes_with`, or `NOT`",
                taxon=gaf_line[TAXON_INDEX],
                rule=1)
            return assocparser.ParseResult(source_line, [],
                                           True,
                                           report=report)

    object = association.Term(gaf_line[4], taxon_curie)
    evidence = association.Evidence(ecomap.coderef_to_ecoclass(gaf_line[6]),
                                    [e for e in gaf_line[5].split("|") if e],
                                    [e for e in gaf_line[7].split("|") if e])
    subject_extensions = [
        association.ExtensionUnit("rdfs:subClassOf", gaf_line[16])
    ] if gaf_line[16] else []

    # Column 16: "|" separates alternative conjunctions, "," separates the
    # relation(term) units inside one conjunction.
    conjunctions = []
    if gaf_line[15]:
        for conjuncts in gaf_line[15].split("|"):
            extension_units = []
            for u in conjuncts.split(","):
                parsed = relation_tuple.findall(u)
                if len(parsed) == 1:
                    rel, term = parsed[0]
                    extension_units.append(association.ExtensionUnit(
                        rel, term))
                else:
                    # Otherwise, something went bad with the regex, and it's a bad parse
                    # Report the raw taxon column (a string), like every
                    # other error in this function, not the split list.
                    report.error(source_line,
                                 Report.EXTENSION_SYNTAX_ERROR,
                                 u,
                                 "extensions should be relation(curie)",
                                 taxon=gaf_line[TAXON_INDEX],
                                 rule=1)
                    return assocparser.ParseResult(source_line, [],
                                                   True,
                                                   report=report)

            conjunction = association.ExtensionConjunctions(extension_units)
            conjunctions.append(conjunction)
    object_extensions = association.ExtensionExpression(conjunctions)
    looked_up_rel = relations.lookup_label(relation)
    if looked_up_rel is None:
        report.error(
            source_line,
            assocparser.Report.INVALID_QUALIFIER,
            relation,
            "Qualifer must be \"colocalizes_with\", \"contributes_to\", or \"NOT\"",
            taxon=gaf_line[TAXON_INDEX],
            rule=1)
        return assocparser.ParseResult(source_line, [], True, report=report)

    a = association.GoAssociation(
        source_line="\t".join(gaf_line),
        subject=subject,
        relation=curie_util.contract_uri(looked_up_rel)[0],
        object=object,
        negated=negated,
        qualifiers=qualifiers,
        aspect=aspect,
        interacting_taxon=interacting_taxon,
        evidence=evidence,
        subject_extensions=subject_extensions,
        object_extensions=object_extensions,
        provided_by=gaf_line[14],
        date=gaf_line[13],
        properties={})

    return assocparser.ParseResult(source_line, [a], False, report=report)
예제 #10
0
    def parse_line(self, line):
        """
        Parses a single line of a HPOA file

        Returns an `assocparser.ParseResult` holding the (possibly
        regenerated) line and a list of associations. The list contains a
        single association dict for a valid line and is empty for header
        lines and for lines that fail validation. If the inherited
        `validate_line` returns a truthy result, that result is returned
        unchanged.

        Note: most applications will only need to call this directly if they require fine-grained control of parsing. For most purposes,
        :method:`parse_file` can be used over the whole file

        Arguments
        ---------
        line : str
            A single tab-separated line from a HPOA file

        """
        config = self.config

        parsed = super().validate_line(line)
        if parsed:
            return parsed

        if self.is_header(line):
            return assocparser.ParseResult(line, [], False)

        # http://human-phenotype-ontology.github.io/documentation.html#annot
        vals = line.split("\t")
        if len(vals) != 14:
            self.report.error(
                line,
                assocparser.Report.WRONG_NUMBER_OF_COLUMNS,
                "",
                msg=
                "There were {columns} columns found in this line, and there should be 14"
                .format(columns=len(vals)))
            return assocparser.ParseResult(line, [], True)

        [
            db, db_object_id, db_object_symbol, qualifier, hpoid, reference,
            evidence, onset, frequency, withfrom, aspect, db_object_synonym,
            date, assigned_by
        ] = vals

        # hardcode this, as HPOA is currently human-only
        taxon = 'NCBITaxon:9606'

        # hardcode this, as HPOA is currently disease-only
        db_object_type = 'disease'

        ## --
        ## db + db_object_id. CARD=1
        ## --
        subject_id = self._pair_to_id(db, db_object_id)
        if not self._validate_id(subject_id, line, ENTITY):
            return assocparser.ParseResult(line, [], True)

        if not self._validate_id(hpoid, line, ANNOTATION):
            return assocparser.ParseResult(line, [], True)

        #TODO: HPOA has different date styles, so the date is not normalized

        # Example use case: mapping from OMIM to Orphanet
        if config.entity_map is not None:
            subject_id = self.map_id(subject_id, config.entity_map)
            # Split on the first ":" only, so the local part stays a single
            # string; a list here would make the "\t".join below raise
            # TypeError. Both columns are rewritten so the regenerated line
            # matches the mapped identifier.
            db, _, db_object_id = subject_id.partition(":")
            vals[0] = db
            vals[1] = db_object_id

        ## --
        ## end of line re-processing
        ## --
        # regenerate line post-mapping
        line = "\t".join(vals)

        ## --
        ## db_object_synonym CARD=0..*
        ## --
        synonyms = db_object_synonym.split("|") if db_object_synonym != "" else []

        ## --
        ## qualifier
        ## --
        ## we generate both qualifier and relation field
        qualifiers = qualifier.split("|") if qualifier != '' else []
        negated = 'NOT' in qualifiers
        other_qualifiers = [q for q in qualifiers if q != 'NOT']

        # The relation is the first non-NOT qualifier when there is one;
        # otherwise it is derived from the aspect column (None if the aspect
        # is not one of the four known codes).
        if len(other_qualifiers) > 0:
            relation = other_qualifiers[0]
        else:
            relation = {
                'O': 'has_phenotype',
                'I': 'has_inheritance',
                'M': 'mortality',
                'C': 'has_onset',
            }.get(aspect)

        ## --
        ## hpoid
        ## --
        object = {'id': hpoid, 'taxon': taxon}

        # construct subject dict
        subject = {
            'id': subject_id,
            'label': db_object_symbol,
            'type': db_object_type,
            'synonyms': synonyms,
            'taxon': {
                'id': taxon
            }
        }

        ## --
        ## evidence
        ## reference
        ## withfrom
        ## --
        evidence = {
            'type': evidence,
            'has_supporting_reference': reference.split("; ")
        }
        evidence['with_support_from'] = self._split_pipe(withfrom)

        ## Construct main return dict
        assoc = {
            'source_line': line,
            'subject': subject,
            'object': object,
            'negated': negated,
            'qualifiers': qualifiers,
            'relation': {
                'id': relation
            },
            'evidence': evidence,
            'provided_by': assigned_by,
            'date': date,
        }

        self._validate_assoc(assoc, line)

        return assocparser.ParseResult(line, [assoc], False)
예제 #11
0
    def parse_line(self, line):
        """Parses a single line of a GPAD.

        Returns an `assocparser.ParseResult` holding the line and a list of
        association dicts. One association is produced per "|"-separated
        alternative in the annotation extension column (a single one when
        that column is empty); the list is empty for header lines and for
        lines that fail validation. If the inherited `validate_line` returns
        a truthy result, that result is returned unchanged.

        Note: most applications will only need to call this directly if they require fine-grained control of parsing. For most purposes,
        :method:`parse_file` can be used over the whole file

        Arguments
        ---------
        line : str
            A single tab-separated line from a GPAD file

        """

        parsed = super().validate_line(line)
        if parsed:
            return parsed

        if self.is_header(line):
            return assocparser.ParseResult(line, [], False)

        vals = line.split("\t")
        if len(vals) != 12:
            self.report.error(
                line,
                Report.WRONG_NUMBER_OF_COLUMNS,
                "",
                msg=
                "There were {columns} columns found in this line, and there should be 12"
                .format(columns=len(vals)))
            # Was misspelled `ParseReslt`, which raised AttributeError instead
            # of returning the skipped-line result.
            return assocparser.ParseResult(line, [], True)

        [
            db,
            db_object_id,
            qualifier,
            goid,
            reference,
            evidence,
            withfrom,
            interacting_taxon_id,  # TODO
            date,
            assigned_by,
            annotation_xp,
            annotation_properties
        ] = vals

        subject_id = self._pair_to_id(db, db_object_id)
        if not self._validate_id(subject_id, line, ENTITY):
            return assocparser.ParseResult(line, [], True)

        if not self._validate_id(goid, line, ANNOTATION):
            return assocparser.ParseResult(line, [], True)

        date = self._normalize_gaf_date(date, line)

        # The return value is ignored: an evidence id that fails validation
        # does not cause the line to be rejected here.
        self._validate_id(evidence, line, None)
        #TODO: ecomap is currently one-way only, so the evidence class is not
        # checked against it.

        ## --
        ## qualifier
        ## --
        negated, relation, other_qualifiers = self._parse_qualifier(
            qualifier, None)

        assocs = []
        # "|" separates alternative extension expressions, "," separates the
        # class expressions that make up one alternative.
        for xp_or in annotation_xp.split("|"):
            extns = []
            for xp_and in xp_or.split(","):
                if xp_and != "":
                    expr = self._parse_class_expression(xp_and, line=line)
                    if expr is not None:
                        extns.append(expr)
            assoc = {
                'source_line': line,
                'subject': {
                    'id': subject_id
                },
                'object': {
                    'id': goid,
                    'extensions': extns
                },
                'negated': negated,
                'relation': {
                    'id': relation
                },
                'evidence': {
                    'type': evidence,
                    'with_support_from': self._split_pipe(withfrom),
                    'has_supporting_reference': self._split_pipe(reference)
                },
                'provided_by': assigned_by,
                'date': date,
            }
            if len(other_qualifiers) > 0:
                assoc['qualifiers'] = other_qualifiers

            self._validate_assoc(assoc, line)

            assocs.append(assoc)
        return assocparser.ParseResult(line, assocs, False)
예제 #12
0
    def parse_line(self, line):
        """Parses a single line of a GPAD.

        Returns an `assocparser.ParseResult`. A header line yields a single
        dict flagged with "header": True; a valid annotation line yields a
        single association dict; a line that fails validation or violates a
        GO rule at error level yields an empty list. If the inherited
        `validate_line` returns a truthy result, or `to_association` produces
        no association, that result is returned unchanged.

        Note: most applications will only need to call this directly if they require fine-grained control of parsing. For most purposes,
        :method:`parse_file` can be used over the whole file

        Arguments
        ---------
        line : str
            A single tab-separated line from a GPAD file

        """
        parsed = super().validate_line(line)
        if parsed:
            return parsed

        if self.is_header(line):
            return assocparser.ParseResult(line, [{
                "header": True,
                "line": line.strip()
            }], False)

        vals = [el.strip() for el in line.split("\t")]

        # A copy is passed so `vals` is unaffected by anything the parser
        # does to its argument.
        parsed = to_association(list(vals), report=self.report)
        if not parsed.associations:
            return parsed

        assoc = parsed.associations[0]

        go_rule_results = qc.test_go_rules(assoc, self.config)
        for rule, result in go_rule_results.all_results.items():
            if result.result_type == qc.ResultType.WARNING:
                self.report.warning(line,
                                    assocparser.Report.VIOLATES_GO_RULE,
                                    "",
                                    msg="{id}: {message}".format(
                                        id=rule.id, message=result.message),
                                    rule=int(rule.id.split(":")[1]))

            if result.result_type == qc.ResultType.ERROR:
                self.report.error(line,
                                  assocparser.Report.VIOLATES_GO_RULE,
                                  "",
                                  msg="{id}: {message}".format(
                                      id=rule.id, message=result.message),
                                  rule=int(rule.id.split(":")[1]))
                # Skip the annotation
                return assocparser.ParseResult(line, [], True)

            if result.result_type == qc.ResultType.PASS:
                self.report.message(assocparser.Report.INFO,
                                    line,
                                    Report.RULE_PASS,
                                    "",
                                    msg="Passing Rule",
                                    rule=int(rule.id.split(":")[1]))

        # Continue with the annotation returned by the rule run, re-split
        # into its twelve GPAD columns.
        vals = list(go_rule_results.annotation.to_gpad_tsv())
        [
            db, db_object_id, qualifier, goid, reference, evidence, withfrom,
            interacting_taxon_id, date, assigned_by, annotation_xp,
            annotation_properties
        ] = vals

        split_line = assocparser.SplitLine(line=line, values=vals, taxon="")

        id = self._pair_to_id(db, db_object_id)
        if not self._validate_id(id, split_line, context=ENTITY):
            return assocparser.ParseResult(line, [], True)

        if not self._validate_id(goid, split_line, context=ANNOTATION):
            return assocparser.ParseResult(line, [], True)

        valid_goid = self._validate_ontology_class_id(goid, split_line)
        if valid_goid is None:
            return assocparser.ParseResult(line, [], True)
        goid = valid_goid

        date = self._normalize_gaf_date(date, split_line)

        if reference == "":
            self.report.error(line, Report.INVALID_ID, "EMPTY",
                              "reference column 6 is empty")
            return assocparser.ParseResult(line, [], True)

        # The return value is ignored: an evidence id that fails validation
        # does not cause the line to be rejected here.
        self._validate_id(evidence, split_line)

        interacting_taxon = None
        if interacting_taxon_id != "":
            interacting_taxon = self._taxon_id(interacting_taxon_id,
                                               split_line)
            if interacting_taxon is None:
                self.report.error(line,
                                  assocparser.Report.INVALID_TAXON,
                                  interacting_taxon_id,
                                  msg="Taxon ID is invalid")
                return assocparser.ParseResult(line, [], True)

        #TODO: ecomap is currently one-way only, so the evidence class is not
        # checked against it.

        ## --
        ## qualifier
        ## --
        negated, relation, other_qualifiers = self._parse_qualifier(
            qualifier, None)

        # Reference Column
        references = self.validate_pipe_separated_ids(reference, split_line)
        if references is None:
            # Reporting occurs in above function call
            return assocparser.ParseResult(line, [], True)

        # With/From
        withfroms = self.validate_pipe_separated_ids(withfrom,
                                                     split_line,
                                                     empty_allowed=True,
                                                     extra_delims=",")
        if withfroms is None:
            # Reporting occurs in above function call
            return assocparser.ParseResult(line, [], True)

        ## --
        ## parse annotation extension
        ## See appending in http://doi.org/10.1186/1471-2105-15-155
        ## --
        object_or_exprs = self._parse_full_extension_expression(
            annotation_xp, line=split_line)

        # Default the subject's label and full name to its id; they are
        # replaced only when the GPI index has an entry for this id.
        subject_symbol = id
        subject_fullname = id
        subject_synonyms = []
        if self.gpi is not None:
            gp = self.gpi.get(id)
            # Truthiness test rather than `gp is not {}`: that identity check
            # is always True, so a missing id used to raise KeyError below.
            if gp:
                subject_symbol = gp["symbol"]
                subject_fullname = gp["name"]
                subject_synonyms = gp["synonyms"].split("|")

        assoc = {
            'source_line': line,
            'subject': {
                'id': id,
                'label': subject_symbol,
                'fullname': subject_fullname,
                'synonyms': subject_synonyms,
                'taxon': {
                    'id': interacting_taxon
                },
            },
            'object': {
                'id': goid
            },
            'negated': negated,
            'relation': {
                'id': relation
            },
            'interacting_taxon': interacting_taxon,
            'evidence': {
                'type': evidence,
                'with_support_from': withfroms,
                'has_supporting_reference': references
            },
            'subject_extensions': [],
            'object_extensions': {},
            'aspect': self.compute_aspect(goid),
            'provided_by': assigned_by,
            'date': date,
        }
        if len(other_qualifiers) > 0:
            assoc['qualifiers'] = other_qualifiers
        if object_or_exprs is not None and len(object_or_exprs) > 0:
            assoc['object_extensions'] = {'union_of': object_or_exprs}

        return assocparser.ParseResult(line, [assoc], False)
예제 #13
0
def to_association(gpad_line: List[str],
                   report=None,
                   group="unknown",
                   dataset="unknown") -> assocparser.ParseResult:
    """Build a ``GoAssociation`` from the columns of one GPAD line.

    The columns are joined with tabs to form the source line attached to
    every report entry. Lines with more than 12 columns are truncated (with
    a warning), lines with 10 or 11 columns are padded with empty strings,
    and any other column count is rejected. The DB, DB object id, qualifier,
    reference and evidence columns must be non-empty.

    Arguments
    ---------
    gpad_line : List[str]
        The column values of a single GPAD line. The list is not modified.
    report : Report, optional
        Collector for warnings and errors. When omitted, a new Report is
        created from `group` and `dataset`.
    group : str
        Passed to the new Report when `report` is None.
    dataset : str
        Passed to the new Report when `report` is None.

    Returns
    -------
    assocparser.ParseResult
        On success, a result holding the single parsed association. After
        any reported error, a result with an empty association list and its
        third positional argument set to True.
    """

    report = Report(group=group, dataset=dataset) if report is None else report

    source_line = "\t".join(gpad_line)

    if len(gpad_line) > 12:
        report.warning(
            source_line,
            assocparser.Report.WRONG_NUMBER_OF_COLUMNS,
            "",
            msg=
            "There were more than 12 columns in this line. Proceeding by cutting off extra columns.",
            rule=1)

        gpad_line = gpad_line[:12]

    if 12 > len(gpad_line) >= 10:
        # Pad into a new list (not `+=`) so the caller's list is left untouched.
        gpad_line = gpad_line + [""] * (12 - len(gpad_line))

    if len(gpad_line) != 12:
        report.error(
            source_line,
            assocparser.Report.WRONG_NUMBER_OF_COLUMNS,
            "",
            msg=
            "There were {columns} columns found in this line, and there should be between 10 and 12"
            .format(columns=len(gpad_line)))
        return assocparser.ParseResult(source_line, [], True, report=report)

    ## check for missing columns
    ## We use indices here because we run GO RULES before we split the vals into individual variables
    DB_INDEX = 0
    DB_OBJECT_INDEX = 1
    QUALIFIER = 2
    REFERENCE_INDEX = 4
    EVIDENCE_INDEX = 5
    EXTENSION_INDEX = 10
    PROPERTIES_INDEX = 11
    if gpad_line[DB_INDEX] == "":
        report.error(source_line,
                     Report.INVALID_IDSPACE,
                     "EMPTY",
                     "col1 is empty",
                     rule=1)
        return assocparser.ParseResult(source_line, [], True, report=report)
    if gpad_line[DB_OBJECT_INDEX] == "":
        report.error(source_line,
                     Report.INVALID_ID,
                     "EMPTY",
                     "col2 is empty",
                     rule=1)
        return assocparser.ParseResult(source_line, [], True, report=report)
    if gpad_line[QUALIFIER] == "":
        # Filed as a qualifier problem; it was previously misfiled under
        # INVALID_TAXON.
        report.error(source_line,
                     Report.INVALID_QUALIFIER,
                     "EMPTY",
                     "qualifier column is empty",
                     rule=1)
        return assocparser.ParseResult(source_line, [], True, report=report)
    if gpad_line[REFERENCE_INDEX] == "":
        report.error(source_line,
                     Report.INVALID_ID,
                     "EMPTY",
                     "reference column is empty",
                     rule=1)
        return assocparser.ParseResult(source_line, [], True, report=report)
    if gpad_line[EVIDENCE_INDEX] == "":
        report.error(source_line,
                     Report.INVALID_ID,
                     "EMPTY",
                     "Evidence column is empty",
                     rule=1)
        # Stop here like every other missing-column check instead of going
        # on to build an association with an empty evidence type.
        return assocparser.ParseResult(source_line, [], True, report=report)

    # GPAD carries no subject taxon, so reports use an empty string.
    taxon = ""
    subject_curie = "{db}:{id}".format(db=gpad_line[0], id=gpad_line[1])
    subject = association.Subject(subject_curie, "", "", [], "", "")
    object = association.Term(gpad_line[3], "")
    evidence = association.Evidence(gpad_line[5],
                                    [e for e in gpad_line[4].split("|") if e],
                                    [e for e in gpad_line[6].split("|") if e])

    raw_qs = gpad_line[2].split("|")
    negated = "NOT" in raw_qs
    looked_up_qualifiers = [
        relations.lookup_label(q) for q in raw_qs if q != "NOT"
    ]
    if None in looked_up_qualifiers:
        report.error(source_line,
                     Report.INVALID_QUALIFIER,
                     raw_qs,
                     "Could not find a URI for qualifier",
                     taxon=taxon,
                     rule=1)
        return assocparser.ParseResult(source_line, [], True, report=report)

    qualifiers = [curie_util.contract_uri(q)[0] for q in looked_up_qualifiers]

    # Annotation extensions live in column 11 (index 10); index 11 is the
    # annotation properties column parsed further below. "|" separates
    # alternative conjunctions, "," separates the relation(term) units
    # inside one conjunction.
    conjunctions = []
    if gpad_line[EXTENSION_INDEX]:
        for conjuncts in gpad_line[EXTENSION_INDEX].split("|"):
            extension_units = []
            for u in conjuncts.split(","):
                parsed = relation_tuple.findall(u)
                if len(parsed) == 1:
                    rel, term = parsed[0]
                    extension_units.append(association.ExtensionUnit(
                        rel, term))
                else:
                    # Otherwise, something went bad with the regex, and it's a bad parse
                    report.error(source_line,
                                 Report.EXTENSION_SYNTAX_ERROR,
                                 u,
                                 "extensions should be relation(curie)",
                                 taxon=taxon,
                                 rule=1)
                    return assocparser.ParseResult(source_line, [],
                                                   True,
                                                   report=report)

            conjunction = association.ExtensionConjunctions(extension_units)
            conjunctions.append(conjunction)
    object_extensions = association.ExtensionExpression(conjunctions)

    # Properties are "|"-separated key=value pairs. Splitting on the first
    # "=" only keeps values that themselves contain "=", and an entry with
    # no "=" maps to an empty value instead of raising IndexError.
    properties = {}
    for prop in gpad_line[PROPERTIES_INDEX].split("|"):
        if not prop:
            continue
        key, _, value = prop.partition("=")
        properties[key] = value

    a = association.GoAssociation(
        source_line="\t".join(gpad_line),
        subject=subject,
        relation="",
        object=object,
        negated=negated,
        qualifiers=qualifiers,
        aspect=None,
        interacting_taxon=gpad_line[7],
        evidence=evidence,
        subject_extensions=[],
        object_extensions=object_extensions,
        provided_by=gpad_line[9],
        date=gpad_line[8],
        properties=properties)

    return assocparser.ParseResult(source_line, [a], False, report=report)
예제 #14
0
def to_association(
    gaf_line: List[str],
    report=None,
    group="unknown",
    dataset="unknown",
    qualifier_parser=Qualifier2_1()) -> assocparser.ParseResult:
    """
    Build a GoAssociation from the column values of one GAF line.

    Every failure is recorded on `report` and answered with a skipped
    `ParseResult` (empty association list, third argument True); on success
    the `ParseResult` carries exactly one `association.GoAssociation`.

    Arguments
    ---------
    gaf_line : List[str]
        Column values of a single GAF line. 15 or 16 columns are padded
        with empty strings to 17; more than 17 are truncated with a warning.
        The caller's list is never modified.
    report : Report, optional
        Report that errors and warnings are written to. When None, a new
        `Report(group=group, dataset=dataset)` is created.
    group : str
        Only used to construct the default report.
    dataset : str
        Only used to construct the default report.
    qualifier_parser
        Object whose `validate(qualifier_column)` returns a result exposing
        `valid`, `original` and `message`.

    Returns
    -------
    assocparser.ParseResult
    """
    report = Report(group=group, dataset=dataset) if report is None else report
    # The reported source line is always the line as received, before any
    # truncation or padding.
    source_line = "\t".join(gaf_line)

    if source_line == "":
        report.error(source_line,
                     "Blank Line",
                     "EMPTY",
                     "Blank lines are not allowed",
                     rule=1)
        return assocparser.ParseResult(source_line, [], True, report=report)

    if len(gaf_line) > 17:
        # If we see more than 17 columns, we will just cut off the columns after column 17
        report.warning(
            source_line,
            assocparser.Report.WRONG_NUMBER_OF_COLUMNS,
            "",
            msg=
            "There were more than 17 columns in this line. Proceeding by cutting off extra columns after column 17.",
            rule=1)
        gaf_line = gaf_line[:17]

    if 17 > len(gaf_line) >= 15:
        # Pad a copy: `+=` on a list extends it in place and would silently
        # modify the list object owned by the caller.
        gaf_line = gaf_line + [""] * (17 - len(gaf_line))

    if len(gaf_line) != 17:
        report.error(
            source_line,
            assocparser.Report.WRONG_NUMBER_OF_COLUMNS,
            "",
            msg=
            "There were {columns} columns found in this line, and there should be 15 (for GAF v1) or 17 (for GAF v2)"
            .format(columns=len(gaf_line)),
            rule=1)
        return assocparser.ParseResult(source_line, [], True, report=report)

    ## check for missing columns
    ## We use indices here because we run GO RULES before we split the vals into individual variables
    DB_INDEX = 0
    DB_OBJECT_INDEX = 1
    TAXON_INDEX = 12
    REFERENCE_INDEX = 5
    if gaf_line[DB_INDEX] == "":
        report.error(source_line,
                     Report.INVALID_IDSPACE,
                     "EMPTY",
                     "col1 is empty",
                     taxon=gaf_line[TAXON_INDEX],
                     rule=1)
        return assocparser.ParseResult(source_line, [], True, report=report)
    if gaf_line[DB_OBJECT_INDEX] == "":
        report.error(source_line,
                     Report.INVALID_ID,
                     "EMPTY",
                     "col2 is empty",
                     taxon=gaf_line[TAXON_INDEX],
                     rule=1)
        return assocparser.ParseResult(source_line, [], True, report=report)
    if gaf_line[TAXON_INDEX] == "":
        report.error(source_line,
                     Report.INVALID_TAXON,
                     "EMPTY",
                     "taxon column is empty",
                     taxon=gaf_line[TAXON_INDEX],
                     rule=1)
        return assocparser.ParseResult(source_line, [], True, report=report)
    if gaf_line[REFERENCE_INDEX] == "":
        report.error(source_line,
                     Report.INVALID_ID,
                     "EMPTY",
                     "reference column 6 is empty",
                     taxon=gaf_line[TAXON_INDEX],
                     rule=1)
        return assocparser.ParseResult(source_line, [], True, report=report)

    # The taxon column may hold two pipe-separated values; the first is the
    # subject's taxon, the second (when present) the interacting taxon.
    taxon = gaf_line[TAXON_INDEX].split("|")
    taxon_curie = taxon[0].replace("taxon", "NCBITaxon")
    date = assocparser._normalize_gaf_date(gaf_line[13], report, taxon_curie,
                                           source_line)
    if date is None:
        return assocparser.ParseResult(source_line, [], True, report=report)

    interacting_taxon = taxon[1].replace(
        "taxon", "NCBITaxon") if len(taxon) == 2 else None
    subject_curie = "{db}:{id}".format(db=gaf_line[0], id=gaf_line[1])
    subject = association.Subject(subject_curie, gaf_line[2], gaf_line[9],
                                  gaf_line[10].split("|"), gaf_line[11],
                                  taxon_curie)
    aspect = gaf_line[8]
    negated, relation, qualifiers = assocparser._parse_qualifier(
        gaf_line[3], aspect)

    # column 4 is qualifiers -> index 3
    # For allowed, see http://geneontology.org/docs/go-annotations/#annotation-qualifiers
    parsed_qualifiers = qualifier_parser.validate(gaf_line[3])
    if not parsed_qualifiers.valid:
        report.error(source_line,
                     Report.INVALID_QUALIFIER,
                     parsed_qualifiers.original,
                     parsed_qualifiers.message,
                     taxon=gaf_line[TAXON_INDEX],
                     rule=1)
        return assocparser.ParseResult(source_line, [], True, report=report)

    object = association.Term(gaf_line[4], taxon_curie)
    evidence = association.Evidence(
        ecomap.coderef_to_ecoclass(gaf_line[6]),
        [e for e in gaf_line[5].split("|") if e],
        association.ConjunctiveSet.str_to_conjunctions(gaf_line[7]))

    # Column 17 (gene product form) becomes a single subject extension.
    subject_extensions = [
        association.ExtensionUnit("rdfs:subClassOf", gaf_line[16])
    ] if gaf_line[16] else []

    conjunctions = []
    if gaf_line[15]:
        conjunctions = association.ConjunctiveSet.str_to_conjunctions(
            gaf_line[15],
            conjunct_element_builder=lambda el: association.ExtensionUnit.
            from_str(el))

        if isinstance(conjunctions, association.Error):
            # Report the raw taxon column (a string), like every other error
            # in this function, rather than the split list held in `taxon`.
            report.error(source_line,
                         Report.EXTENSION_SYNTAX_ERROR,
                         conjunctions.info,
                         "extensions should be relation(curie)",
                         taxon=gaf_line[TAXON_INDEX],
                         rule=1)
            return assocparser.ParseResult(source_line, [],
                                           True,
                                           report=report)

    looked_up_rel = relations.lookup_label(relation)
    if looked_up_rel is None:
        report.error(source_line,
                     assocparser.Report.INVALID_QUALIFIER,
                     relation,
                     "Could not find CURIE for relation `{}`".format(relation),
                     taxon=gaf_line[TAXON_INDEX],
                     rule=1)
        return assocparser.ParseResult(source_line, [], True, report=report)

    a = association.GoAssociation(
        # Unlike the reported `source_line`, this is the normalized
        # (truncated/padded) 17-column line.
        source_line="\t".join(gaf_line),
        subject=subject,
        relation=curie_util.contract_uri(looked_up_rel)[0],
        object=object,
        negated=negated,
        qualifiers=qualifiers,
        aspect=aspect,
        interacting_taxon=interacting_taxon,
        evidence=evidence,
        subject_extensions=subject_extensions,
        object_extensions=conjunctions,
        provided_by=gaf_line[14],
        date=date,
        properties={})

    return assocparser.ParseResult(source_line, [a], False, report=report)
예제 #15
0
    def parse_line(self, line):
        """
        Parses a single line of a GAF

        Returns an `assocparser.ParseResult`. On success it holds a single
        association dict; when the line is rejected (failed validation or a
        GO rule error) the association list is empty and the result is
        flagged as skipped. Header lines yield a result whose only entry is
        a `{"header": True, "line": ...}` dict.

        Note: most applications will only need to call this directly if they require fine-grained control of parsing. For most purposes,
        :method:`parse_file` can be used over the whole file

        Arguments
        ---------
        line : str
            A single tab-separated line from a GAF file

        """

        # A truthy result from validate_line is handed back to the caller as-is.
        parsed = super().validate_line(line)
        if parsed:
            return parsed

        if self.is_header(line):
            # Save off version info here
            if self.version is None:
                # We are still looking
                parsed = assocparser.parser_version_regex.findall(line)
                if len(parsed) == 1:
                    filetype, version, _ = parsed[0]
                    if version == "2.2":
                        logger.info("Detected GAF version 2.2")
                        self.version = version
                    else:
                        logger.info(
                            "Detected GAF version {}, so using 2.1".format(
                                version))
                        self.version = self.default_version

            return assocparser.ParseResult(line, [{
                "header": True,
                "line": line.strip()
            }], False)

        # At this point, we should have gone through all the header, and a version number should be established
        if self.version is None:
            logger.warning(
                "No version number found for this file so we will assum GAF version: {}"
                .format(self.default_version))
            self.version = self.default_version

        vals = [el.strip() for el in line.split("\t")]

        # GAF v1 is defined as 15 cols, GAF v2 as 17.
        # We treat everything as GAF2 by adding two blank columns.
        # TODO: check header metadata to see if columns corresponds to declared dataformat version

        # Pass a copy so that to_association cannot alter `vals`.
        parsed = to_association(list(vals),
                                report=self.report,
                                qualifier_parser=self.qualifier_parser())
        if parsed.associations == []:
            return parsed

        assoc = parsed.associations[0]
        # self.report = parsed.report
        ## Run GO Rules, save split values into individual variables
        go_rule_results = qc.test_go_rules(assoc,
                                           self.config,
                                           group=self.group)
        for rule, result in go_rule_results.all_results.items():
            if result.result_type == qc.ResultType.WARNING:
                self.report.warning(line,
                                    assocparser.Report.VIOLATES_GO_RULE,
                                    "",
                                    msg="{id}: {message}".format(
                                        id=rule.id, message=result.message),
                                    rule=int(rule.id.split(":")[1]))

            if result.result_type == qc.ResultType.ERROR:
                self.report.error(line,
                                  assocparser.Report.VIOLATES_GO_RULE,
                                  "",
                                  msg="{id}: {message}".format(
                                      id=rule.id, message=result.message),
                                  rule=int(rule.id.split(":")[1]))
                # Skip the annotation
                return assocparser.ParseResult(line, [], True)

            if result.result_type == qc.ResultType.PASS:
                self.report.message(assocparser.Report.INFO,
                                    line,
                                    Report.RULE_PASS,
                                    "",
                                    msg="Passing Rule",
                                    rule=int(rule.id.split(":")[1]))

        # Continue with the annotation as returned by the GO rules run,
        # flattened back into the 17 GAF columns.
        vals = list(go_rule_results.annotation.to_gaf_tsv())
        [
            db, db_object_id, db_object_symbol, qualifier, goid, reference,
            evidence, withfrom, aspect, db_object_name, db_object_synonym,
            db_object_type, taxon, date, assigned_by, annotation_xp,
            gene_product_isoform
        ] = vals
        split_line = assocparser.SplitLine(line=line, values=vals, taxon=taxon)

        if self.config.group_idspace is not None and assigned_by not in self.config.group_idspace:
            self.report.warning(
                line,
                Report.INVALID_ID,
                assigned_by,
                "GORULE:0000027: assigned_by is not present in groups reference",
                taxon=taxon,
                rule=27)

        if self.config.entity_idspaces is not None and db not in self.config.entity_idspaces:
            # Are we a synonym?
            upgrade = self.config.entity_idspaces.reverse(db)
            if upgrade is not None:
                # If we found a synonym
                self.report.warning(
                    line,
                    Report.INVALID_ID_DBXREF,
                    db,
                    "GORULE:0000027: {} is a synonym for the correct ID {}, and has been updated"
                    .format(db, upgrade),
                    taxon=taxon,
                    rule=27)
                db = upgrade

        ## --
        ## db + db_object_id. CARD=1
        ## --assigned_by
        id = self._pair_to_id(db, db_object_id)
        if not self._validate_id(
                id, split_line, allowed_ids=self.config.entity_idspaces):

            return assocparser.ParseResult(line, [], True)

        # Using a given gpi file to validate the gene object
        if self.gpi is not None:
            entity = self.gpi.get(id, None)
            if entity is not None:
                db_object_symbol = entity["symbol"]
                db_object_name = entity["name"]
                db_object_synonym = entity["synonyms"]
                db_object_type = entity["type"]

        if not self._validate_id(goid, split_line, context=ANNOTATION):
            logger.warning("skipping because {} not validated!".format(goid))
            return assocparser.ParseResult(line, [], True)

        valid_goid = self._validate_ontology_class_id(goid, split_line)
        if valid_goid is None:
            return assocparser.ParseResult(line, [], True)
        goid = valid_goid

        ecomap = self.config.ecomap
        if ecomap is not None:
            if ecomap.coderef_to_ecoclass(evidence, reference) is None:
                self.report.error(
                    line,
                    assocparser.Report.UNKNOWN_EVIDENCE_CLASS,
                    evidence,
                    msg="Expecting a known ECO GAF code, e.g ISS",
                    rule=1)
                return assocparser.ParseResult(line, [], True)

        references = self.validate_pipe_separated_ids(reference, split_line)
        if references is None:
            # Reporting occurs in above function call
            return assocparser.ParseResult(line, [], True)

        # With/From
        withfroms = self.validate_pipe_separated_ids(withfrom,
                                                     split_line,
                                                     empty_allowed=True,
                                                     extra_delims=",")
        if withfroms is None:
            # Reporting occurs in above function call
            return assocparser.ParseResult(line, [], True)

        # validation
        self._validate_symbol(db_object_symbol, split_line)

        # Example use case: mapping from UniProtKB to MOD ID
        if self.config.entity_map is not None:
            id = self.map_id(id, self.config.entity_map)
            toks = id.split(":")
            db = toks[0]
            # Re-join the local part into a string: storing the list slice
            # itself in `vals` makes the "\t".join below raise TypeError.
            db_object_id = ":".join(toks[1:])
            vals[1] = db_object_id
            # NOTE(review): only column 2 is rewritten; column 1 of the
            # regenerated line keeps the pre-mapping database name.

        ## --
        ## end of line re-processing
        ## --
        # regenerate line post-mapping
        line = "\t".join(vals)

        ## --
        ## taxon CARD={1,2}
        ## --
        ## if a second value is specified, this is the interacting taxon
        ## We do not use the second value
        taxons = taxon.split("|")
        normalized_taxon = self._taxon_id(taxons[0], split_line)
        if normalized_taxon is None:
            self.report.error(line,
                              assocparser.Report.INVALID_TAXON,
                              taxon,
                              msg="Taxon ID is invalid")
            return assocparser.ParseResult(line, [], True)

        self._validate_taxon(normalized_taxon, split_line)

        interacting_taxon = None
        if len(taxons) == 2:
            interacting_taxon = self._taxon_id(taxons[1], split_line)
            if interacting_taxon is None:
                self.report.error(line,
                                  assocparser.Report.INVALID_TAXON,
                                  taxon,
                                  msg="Taxon ID is invalid")
                return assocparser.ParseResult(line, [], True)

        ## --
        ## db_object_synonym CARD=0..*
        ## --
        synonyms = db_object_synonym.split("|")
        if db_object_synonym == "":
            synonyms = []

        ## --
        ## parse annotation extension
        ## See appendix in http://doi.org/10.1186/1471-2105-15-155
        ## --
        object_or_exprs = self._parse_full_extension_expression(
            annotation_xp, line=split_line)

        ## --
        ## qualifier
        ## --
        negated, relation, other_qualifiers = self._parse_qualifier(
            qualifier, aspect)

        ## --
        ## goid
        ## --
        # TODO We shouldn't overload builtin keywords/functions
        object = {'id': goid, 'taxon': normalized_taxon}

        # construct subject dict
        subject = {
            'id': id,
            'label': db_object_symbol,
            'type': db_object_type,
            'fullname': db_object_name,
            'synonyms': synonyms,
            'taxon': {
                'id': normalized_taxon
            }
        }

        ## --
        ## gene_product_isoform
        ## --
        ## This is mapped to a more generic concept of subject_extensions
        subject_extns = []
        if gene_product_isoform is not None and gene_product_isoform != '':
            subject_extns.append({
                'property': 'isoform',
                'filler': gene_product_isoform
            })

        object_extensions = {}
        if object_or_exprs is not None and len(object_or_exprs) > 0:
            object_extensions['union_of'] = object_or_exprs

        ## --
        ## evidence
        ## reference
        ## withfrom
        ## --
        evidence_obj = {
            'type': evidence,
            'has_supporting_reference': references,
            'with_support_from': withfroms
        }

        ## Construct main return dict
        assoc = {
            'source_line': line,
            'subject': subject,
            'object': object,
            'negated': negated,
            'qualifiers': other_qualifiers,  # should be either 0 or 1 item
            'aspect': aspect,
            'relation': {
                'id': relation
            },
            'interacting_taxon': interacting_taxon,
            'evidence': evidence_obj,
            'provided_by': assigned_by,
            'date': date,
            'subject_extensions': subject_extns,
            'object_extensions': object_extensions
        }

        return assocparser.ParseResult(line, [assoc], False, evidence.upper())