Example #1
def gen_tasks(tree: _ElementTree) -> Iterator[Task]:
    for task_node in tree.findall('/body/ul/li'):
        task_dict = dict(node_to_dict(task_node))
        recurrence = None
        if task_dict.get('Recurrence info'):
            assert isinstance(task_dict['Recurrence info'], dict)
            recnode = cast(Dict[str, str], task_dict['Recurrence info'])
            recurrence = Recurrence(
                frequency=cast(Frequency, recnode['Frequency']),
                start=ensure(parse_timestamp_ms(recnode['Start'])),
                end=ensure(parse_timestamp_ms(recnode['End'])),
                hour=int(recnode['Hour of day to fire']),
                every=maybe(recnode.get('Every'), int) or 1,
                weekday_num=maybe(recnode.get('Weekday number'), int),
                day_of_month=maybe(recnode.get('Day number of month'),
                                   parse_day_num),
                day_of_week=maybe(recnode.get('Day of week'),
                                  lambda x: cast(Weekday, x)),
                month=maybe(recnode.get('Month of year'),
                            lambda x: cast(Month, x)),
            )
        simple_fields = cast(Dict[str, str], task_dict)
        task = Task(title=simple_fields['Title'],
                    created=ensure(
                        parse_timestamp_ms(simple_fields['Created time'])),
                    state=cast(State, simple_fields['State']),
                    due=maybe(simple_fields.get('Due date'),
                              parse_timestamp_ms),
                    recurrence=recurrence)
        print(task)
        yield task
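
# A minimal driver sketch for gen_tasks, assuming the export is a single HTML
# file and that the helpers above (node_to_dict, parse_timestamp_ms, ensure,
# maybe) are defined in this module; the file name is hypothetical.
from typing import List

from lxml import etree

def load_tasks(path: str) -> List[Task]:
    # HTMLParser tolerates the loosely structured markup of real exports.
    tree = etree.parse(path, parser=etree.HTMLParser())
    # Materialize the generator so parse errors surface eagerly.
    return list(gen_tasks(tree))

tasks = load_tasks('tasks_export.html')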
Example #2
def get_mentions(
    tree: etree._ElementTree,
) -> ty.Dict[ty.Tuple[str, str], Mention]:
    """Extract the mentions from an ANCOR-TEI document."""
    mentions = tree.xpath(
        (
            './tei:standOff/tei:annotation[@tei:type="coreference"]'
            '/tei:spanGrp[@tei:subtype="mention"]/tei:span'
        ),
        namespaces=NSMAP,
    )
    if not mentions:
        raise ValueError("`tree` has no mention spans")

    features = get_fs(tree)

    texts_lst = tree.findall(f"{TEI}text")
    if not texts_lst:
        raise ValueError(
            "Attempting to extract mentions from a document without a text"
        )

    tokens_id_store = {
        xmlid(elt): elt for text in texts_lst for elt in text.iter(*TOKEN_TAGS)
    }

    res = dict()
    for m_elt in mentions:
        try:
            m = Mention.from_urs(m_elt, tokens_id_store.get, features.get)
        except ValueError as e:
            logger.warning(f"Skipping span {xmlid(m)}: {e}")
            continue
        if m.span_type not in MENTION_TYPES:
            if m.span_type in IGNORED_MENTION_TYPES:
                logger.debug(
                    f"Ignoring span {m.identifier!r} with mention type {m.span_type!r}"
                )
            else:
                logger.warning(
                    f"Span {m.identifier!r} has an invalid mention type ({m.span_type!r})"
                )
            continue
        res[(xmlid(m.targets[0]), xmlid(m.targets[-1]))] = m
    return res
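
# get_mentions leans on module-level constants and helpers that the excerpt
# does not show. A hedged sketch of plausible definitions for a TEI document;
# the exact tag set in TOKEN_TAGS is an assumption.
from lxml import etree

TEI_NS = "http://www.tei-c.org/ns/1.0"
NSMAP = {"tei": TEI_NS}
TEI = f"{{{TEI_NS}}}"  # Clark-notation prefix used by findall()
TOKEN_TAGS = (f"{TEI}w", f"{TEI}pc")  # word and punctuation tokens (assumed)

def xmlid(elt: etree._Element) -> str:
    # TEI identifiers live on the reserved xml:id attribute.
    return elt.attrib["{http://www.w3.org/XML/1998/namespace}id"]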
Example #3
def scan_fields(tree: _ElementTree) -> List[Dict[str, Set[str]]]:
    task_fields: Dict[str, Set[str]] = {}
    location_fields: Dict[str, Set[str]] = {}
    recurrence_fields: Dict[str, Set[str]] = {}
    for task_node in tree.findall('/body/ul/li'):
        task_dict = dict(node_to_dict(task_node))
        for key, val in task_dict.items():
            if key not in ['Recurrence info', 'Location']:
                task_fields.setdefault(key, set()).add(cast(str, val))
        if task_dict.get('Location'):
            location_dict = cast(Dict[str, str], task_dict['Location'])
            for key, val in location_dict.items():
                location_fields.setdefault(key, set()).add(val)
        if task_dict.get('Recurrence info'):
            recurrence_dict = cast(Dict[str, str],
                                   task_dict['Recurrence info'])
            for key, val in recurrence_dict.items():
                recurrence_fields.setdefault(key, set()).add(val)
    task_fields = chop(task_fields)
    location_fields = chop(location_fields)
    recurrence_fields = chop(recurrence_fields)

    return [task_fields, recurrence_fields, location_fields]
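
# scan_fields depends on a chop helper that the excerpt does not define. One
# plausible reading, given the signature, is that it trims each field's value
# set down to a printable sample; this sketch is purely a guess.
from typing import Dict, Set

def chop(fields: Dict[str, Set[str]], limit: int = 10) -> Dict[str, Set[str]]:
    # Hypothetical: keep at most `limit` example values per field so the
    # survey of observed values stays readable.
    return {key: set(sorted(values)[:limit]) for key, values in fields.items()}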
Example #4
def test__parse_node(valid_nodl: etree._ElementTree):
    nodes = valid_nodl.findall('node')
    node = nodl._parsing._v1._parsing._parse_node(nodes[1])
    assert node.actions and node.parameters and node.services and node.topics
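
# The test assumes a valid_nodl fixture supplied elsewhere (e.g. in a
# conftest.py). A minimal sketch under that assumption; the file path is
# hypothetical.
import pytest
from lxml import etree

@pytest.fixture
def valid_nodl() -> etree._ElementTree:
    # Presumably loads a known-good NoDL description file.
    return etree.parse('test/valid.nodl.xml')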
Example #5
def spans_from_doc(
    doc: etree._ElementTree,
    min_width: int = 1,
    max_width: int = 26,
    context: ty.Tuple[int, int] = (10, 10),
    length_buckets: ty.Optional[ty.Sequence[int]] = (1, 2, 3, 4, 5, 7, 15, 32, 63),
) -> ty.Iterable[MentionFeaturesDict]:
    """
    Return all the text spans of `#doc`, with their mention type, definiteness
    and anaphoricity (for spans that are not mentions, all of these are `None`).
    """
    w_pos = get_w_pos(doc)
    units = get_mentions(doc)
    nlp = spacy.load("fr_core_news_lg")
    nlp.tokenizer = CustomTokenizer(nlp.vocab)

    for utterance in doc.findall(".//tei:u", namespaces=NSMAP):
        content: ty.List[etree._Element] = list(utterance.iter(*TOKEN_TAGS))
        processed_utterance = nlp([t.text for t in content])
        ent_dict = {(e[0], e[-1]): e.label_ for e in processed_utterance.ents}
        noun_chunks = sorted(processed_utterance.noun_chunks)
        spans = generate_spans_with_context(
            zip(content, ty.cast(ty.Iterable[spacy.tokens.Token], processed_utterance)),
            min_width,
            max_width,
            *context,
        )
        for left_context_t, span_t, right_context_t in spans:
            #  FIXME: Dirty way to split out spacy processing
            left_context, processed_left = (
                zip(*left_context_t) if left_context_t else ([], [])
            )
            right_context, processed_right = (
                zip(*right_context_t) if right_context_t else ([], [])
            )
            span, processed_span = zip(*span_t)
            start_id, end_id = xmlid(span[0]), xmlid(span[-1])
            mention = units.get((start_id, end_id))

            pos = [w.pos_ for w in (*processed_left, *processed_span, *processed_right)]
            lemma = [
                w.lemma_ for w in (*processed_left, *processed_span, *processed_right)
            ]
            morph = [
                morph_from_tag(w.tag_)
                for w in (*processed_left, *processed_span, *processed_right)
            ]
            left_context = [w.text for w in left_context]
            right_context = [w.text for w in right_context]
            if len(left_context) < context[0]:
                left_context.insert(0, "<start>")
                pos.insert(0, "<start>")
                lemma.insert(0, "<start>")
                morph.insert(0, [])
            if len(right_context) < context[1]:
                right_context.append("<end>")
                pos.append("<end>")
                lemma.append("<end>")
                morph.append([])

            # Surface form of the span; a fresh name avoids shadowing the
            # token element list `content` defined above with a different type.
            span_text = [w.text for w in span]

            length = (
                int(np.digitize(len(span_text), bins=length_buckets, right=True))
                if length_buckets is not None
                else len(span_text)
            )
            entity_type = ent_dict.get((processed_span[0], processed_span[-1]))
            chunk_inclusion = span_inclusion(processed_span, noun_chunks)

            # Non-mention spans get `None` for the mention-specific fields.
            yield {
                "content": span_text,
                "left_context": left_context,
                "right_context": right_context,
                "length": length,
                "type": mention.span_type if mention is not None else None,
                "new": mention.features.get("NEW", "_") if mention is not None else None,
                "def": mention.features.get("DEF", "_") if mention is not None else None,
                "id": mention.identifier if mention is not None else None,
                "start": w_pos[xmlid(span[0])],
                "end": w_pos[xmlid(span[-1])],
                "pos": pos,
                "lemma": lemma,
                "morph": morph,
                "entity_type": entity_type,
                "chunk_inclusion": chunk_inclusion,
            }
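
# A hedged driver for spans_from_doc, assuming the input is a TEI XML file on
# disk and that the fr_core_news_lg spaCy model is installed; the path is
# hypothetical.
from lxml import etree

doc = etree.parse('corpus/document.xml')
for features in spans_from_doc(doc, max_width=10):
    if features["type"] is not None:  # keep only spans that are mentions
        print(features["id"], features["type"], " ".join(features["content"]))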