Example No. 1
def pysbd_sentencizer(doc: Doc) -> Doc:
    """Adds sentence boundaries to a Doc.
    Intended to be used as a pipe in a spaCy pipeline.
    Uses https://github.com/nipunsadvilkar/pySBD to get proper sentence
    boundaries and the respective char spans.

    Handles two special cases:
    newlines cannot be end-of-sentence tokens, and
    newlines that separate sentences are attached to the
    beginning of the next sentence.

    @param doc: the spaCy document to be annotated with sentence boundaries
    """
    segmenter = pysbd.Segmenter(language="en", clean=False, char_span=True)
    sents_char_spans: List[TextSpan] = segmenter.segment(doc.text)

    char_spans = [
        doc.char_span(sent_span.start, sent_span.end) for sent_span in sents_char_spans
    ]
    start_token_char_offsets = [span[0].idx for span in char_spans if span is not None]
    for token in doc:
        prev_token = token.nbor(-1) if token.i != 0 else None
        if token.idx in start_token_char_offsets:
            if prev_token and prev_token.text in ABBREVIATIONS:
                token.is_sent_start = False
            else:
                token.is_sent_start = True
        # check whether the previous token contains at least 2 newline chars
        elif prev_token and prev_token.i != 0 and prev_token.text.count("\n") >= 2:
            token.is_sent_start = True
        else:
            token.is_sent_start = False
    return doc
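Note: a minimal sketch of how a component like this might be registered and run. It assumes the pysbd_sentencizer function above (plus its ABBREVIATIONS constant and the pysbd import) is available in scope; spaCy v3-style registration is shown, while in spaCy v2 the bare function could be passed to add_pipe directly.

import spacy
from spacy.language import Language

# register the function above as a named pipeline component (spaCy v3)
Language.component("pysbd_sentencizer", func=pysbd_sentencizer)

nlp = spacy.blank("en")
nlp.add_pipe("pysbd_sentencizer", first=True)

doc = nlp("Dr. Smith arrived late.\n\nThe meeting had already started.")
print([sent.text for sent in doc.sents])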
Example No. 2
def convert_file(
        input_path: Path = typer.Argument(..., exists=True, dir_okay=False),
        output_path: Path = typer.Argument(..., dir_okay=False),
):
    nlp = spacy.blank("en")
    doc_bin = DocBin(attrs=["ENT_IOB", "ENT_TYPE"])

    header = True
    with open(input_path, "r") as in_f:  # the output file is written by DocBin.to_disk below
        for line in tqdm(in_f):
            if header:
                header = False
                continue
            sentence, tokens = pd.read_csv(StringIO(line),
                                           header=None,
                                           usecols=[0, 1]).values[0]
            tokens = eval(tokens)  # the tokens column holds a Python-literal list of token dicts
            eg = line_to_dict(sentence, tokens)

            if eg["answer"] != "accept":
                continue
            tokens = [token["text"] for token in eg["tokens"]]
            words, spaces = get_words_and_spaces(tokens, eg["text"])
            doc = Doc(nlp.vocab, words=words, spaces=spaces)
            doc.ents = [
                doc.char_span(s["start"], s["end"], label=s["label"])
                for s in eg.get("spans", [])
            ]
            doc_bin.add(doc)
        doc_bin.to_disk(output_path)
        print(f"Processed {len(doc_bin)} documents: {output_path}")
Example No. 3
def _mk_spacy_doc(tokens, entities):
    nlp = spacy.blank("en")
    doc = Doc(nlp.vocab, words=tokens, spaces=[True for _ in tokens])
    for ent in entities:
        # char_span returns None if the offsets don't align with token boundaries
        span = doc.char_span(ent["start"], ent["end"], label=ent["entity"])
        doc.ents = list(doc.ents) + [span]
    return doc
Example No. 4
def get_doc_char_span(
    doc: Doc, i: int, j: int, destructive: bool = True, **kwargs
) -> Optional[Span]:
    """Get Span from Doc with char position, similar to doc.char_span.

    Args:
        i: The index of the first character of the span
        j: The index of the first character after the span
        destructive: If True, tokens overlapping [i, j) will be split so that a span can be returned.
        kwargs: passed to Doc.char_span
    """
    span = doc.char_span(i, j, **kwargs)
    if not span and destructive:
        destruct_token(doc, i, j)
        span = doc.char_span(i, j, **kwargs)
    return span
Example No. 5
def get_doc_char_span(
    doc: Doc, i: int, j: int, destructive: bool = True, covering: bool = False, **kwargs
) -> Optional[Span]:
    """Get Span from Doc with char position, similar to doc.char_span.

    Args:
        i: The index of the first character of the span
        j: The index of the first character after the span
        destructive: If True, tokens overlapping [i, j) will be split so that a span can be returned.
        covering: If True, [i, j) will be adjusted to match the existing token boundaries. Takes precedence over `destructive`.
        kwargs: passed to Doc.char_span
    """
    span = doc.char_span(i, j, **kwargs)
    if not span and covering:
        span = _get_covering_span(doc, i, j, **kwargs)
    if not span and destructive:
        destruct_token(doc, i, j)
        span = doc.char_span(i, j, **kwargs)
    return span
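Note: the helpers in Examples 4 and 5 exist because Doc.char_span returns None whenever the character offsets do not line up with token boundaries. A small self-contained illustration of that behaviour, including spaCy v3's alignment_mode option as a non-destructive alternative:

import spacy

nlp = spacy.blank("en")
doc = nlp("spaCy is great")

print(doc.char_span(0, 5))                           # "spaCy" matches a token -> Span
print(doc.char_span(0, 3))                           # "spa" falls inside a token -> None
# spaCy v3 only: snap to surrounding token boundaries instead of returning None
print(doc.char_span(0, 3, alignment_mode="expand"))  # -> spaCy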
Example No. 6
def _mk_spacy_doc(tokens, entities):
    nlp = spacy.blank("en")
    doc = Doc(nlp.vocab, words=tokens, spaces=[True for _ in tokens])
    # This is a checking mechanism: Rasa allows overlapping entities,
    # but spaCy's doc.ents does not.
    taken = []
    warn = False
    for ent in entities:
        if (ent["start"], ent["end"]) not in taken:
            span = doc.char_span(ent["start"], ent["end"], label=ent["entity"])
            doc.ents = list(doc.ents) + [span]
            taken.append((ent["start"], ent["end"]))
        else:
            warn = True
    return doc, warn
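Note: spaCy also ships spacy.util.filter_spans, which keeps the longest non-overlapping spans and could replace the manual `taken` bookkeeping above (though it drops overlaps silently instead of setting a warning flag). A minimal sketch:

import spacy
from spacy.util import filter_spans

nlp = spacy.blank("en")
doc = nlp("New York City is in New York State")
spans = [doc.char_span(0, 13), doc.char_span(0, 8)]  # overlapping candidate entities
doc.ents = filter_spans(spans)                       # keeps only "New York City"
print([ent.text for ent in doc.ents])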
Example No. 7
    def find_spans(self, doc: Doc) -> Iterable[Tuple[int, int, str]]:
        """Runs the parser on the spacy document, and convert the result to labels."""

        text = doc.text

        # The current version of Snips has a bug that makes it crash with some rare
        # Turkish characters, or mentions of "billion years"
        text = (text.replace("’", "'")
                    .replace("”", "\"")
                    .replace("“", "\"")
                    .replace("—", "-"))
        text = text.encode("iso-8859-15", "ignore").decode("iso-8859-15")
        text = re.sub(
            "(\\d+) ([bm]illion(?: (?:\\d+|one|two|three|four|five|six|seven" +
            "|eight|nine|ten))? years?)", "\\g<1>.0 \\g<2>", text)

        results = self.parser.parse(text)
        for result in results:
            span = doc.char_span(result["range"]["start"],
                                 result["range"]["end"])
            if span is None or span.text.lower() == "now" or span.text == "may":
                continue
            label = None
            if (result["entity_kind"] == "snips/number" and span.text.lower()
                    not in {"one", "some", "few", "many", "several"}):
                label = "CARDINAL"
            elif (result["entity_kind"] == "snips/ordinal"
                  and span.text.lower()
                  not in {"first", "second", "the first", "the second"}):
                label = "ORDINAL"
            elif result["entity_kind"] == "snips/temperature":
                label = "QUANTITY"
            elif result["entity_kind"] == "snips/amountOfMoney":
                label = "MONEY"
            elif result["entity_kind"] == "snips/percentage":
                label = "PERCENT"
            elif result["entity_kind"] in {
                    "snips/date", "snips/datePeriod", "snips/datetime"
            }:
                label = "DATE"
            elif result["entity_kind"] in {"snips/time", "snips/timePeriod"}:
                label = "TIME"

            if label:
                yield span.start, span.end, label
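Note: the method yields (start_token, end_token, label) triples rather than setting doc.ents itself; one hedged way a caller might attach them to a document (the helper name here is made up for illustration):

from spacy.tokens import Doc, Span
from spacy.util import filter_spans

def apply_spans(doc: Doc, triples) -> Doc:
    """Attach (start_token, end_token, label) triples to doc.ents."""
    spans = [Span(doc, start, end, label=label) for start, end, label in triples]
    doc.ents = filter_spans(spans)  # discard any overlapping spans
    return doc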
Example No. 8
    def __call__(self, text: str) -> Doc:
        dtokens = self.detailed_tokens(text)
        words = [x.surface for x in dtokens]
        spaces = [x.space for x in dtokens]
        doc = Doc(self.vocab, words=words, spaces=spaces)
        for token, dtoken in zip(doc, dtokens):
            token.tag_ = dtoken.pos
            token.lemma_ = dtoken.lemma if dtoken.lemma != "*" else token.text
            token._.set(self.key_fstring, dtoken.fstring)

        # merge any URL matches into a single token each
        with doc.retokenize() as retokenizer:
            for match in RE_URL.finditer(doc.text):
                span = doc.char_span(*match.span())
                if span:
                    retokenizer.merge(span)
        doc.is_tagged = True
        return doc
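Note: the retokenizer block at the end is a generally useful pattern for merging regex matches into single tokens. A standalone sketch with a toy pattern (the regex here is made up for illustration, standing in for RE_URL):

import re
import spacy

nlp = spacy.blank("en")
doc = nlp("She moved to New York in 2019.")
pattern = re.compile(r"New York")     # toy pattern for illustration
with doc.retokenize() as retokenizer:
    for match in pattern.finditer(doc.text):
        span = doc.char_span(*match.span())
        if span:                      # char_span is None if the match crosses token boundaries
            retokenizer.merge(span)
print([t.text for t in doc])          # ['She', 'moved', 'to', 'New York', 'in', '2019', '.']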
Example No. 9
def convert_file(
        input_path: Path = typer.Argument(..., exists=True, dir_okay=False),
        output_path: Path = typer.Argument(..., dir_okay=False),
):
    nlp = spacy.blank("en")
    doc_bin = DocBin(attrs=["ENT_IOB", "ENT_TYPE"])
    for eg in tqdm(srsly.read_jsonl(input_path)):
        if eg["answer"] != "accept":
            continue
        tokens = [token["text"] for token in eg["tokens"]]
        words, spaces = get_words_and_spaces(tokens, eg["text"])
        doc = Doc(nlp.vocab, words=words, spaces=spaces)
        doc.ents = [
            doc.char_span(s["start"], s["end"], label=s["label"])
            for s in eg.get("spans", [])
        ]
        doc_bin.add(doc)
    doc_bin.to_disk(output_path)
    print(f"Processed {len(doc_bin)} documents: {output_path.name}")
Example No. 10
def main(input_path: Path = typer.Argument(..., exists=True, dir_okay=False)):
    print("Read params.yaml...")
    with open("params.yaml", "r") as fd:
        params = yaml.safe_load(fd)
    dev_size = params["train"]["corpora"]["dev_size"]
    shuffle_seed = params["train"]["corpora"]["shuffle_seed"]
    print(f"...read dev_size={dev_size}, shuffle_seed={shuffle_seed}")

    print("Read annotations...")
    corpus = list(srsly.read_jsonl(input_path))
    print(f"...read {len(corpus)} texts")

    print("Convert into documents...")
    docs = []
    nlp = spacy.blank("en")
    for eg in corpus:
        if eg["answer"] != "accept":
            continue
        tokens = [token["text"] for token in eg["tokens"]]
        words, spaces = get_words_and_spaces(tokens, eg["text"])
        doc = Doc(nlp.vocab, words=words, spaces=spaces)
        doc.ents = [
            doc.char_span(s["start"], s["end"], label=s["label"])
            for s in eg.get("spans", [])
        ]
        docs.append(doc)
    print(f"...converted {len(docs)} documents")

    print("Split into train and dev...")
    train, dev = train_test_split(docs,
                                  test_size=dev_size,
                                  random_state=shuffle_seed,
                                  shuffle=True)
    print(f"...split into {len(train)} train and {len(dev)} dev documents")

    print("Write serialized documents...")
    for split, data in [("train", train), ("dev", dev)]:
        output_path = input_path.with_suffix(f".{split}.spacy")
        doc_bin = DocBin(attrs=["ENT_IOB", "ENT_TYPE"], docs=data)
        doc_bin.to_disk(output_path)
        print(f"...wrote {output_path}")
Example No. 11
def main(json_loc: Path,
         train_file: Path,
         dev_file: Path,
         test_file: Path,
         test_split=0.189,
         train_split=0.709):
    """Creating the corpus from the Prodigy annotations."""
    Doc.set_extension("rel", default={})
    vocab = Vocab()

    docs = {"train": [], "dev": [], "test": []}
    ids = {"train": set(), "dev": set(), "test": set()}
    count_all = {"train": 0, "dev": 0, "test": 0}
    count_pos = {"train": 0, "dev": 0, "test": 0}

    long_rel_count = 0  # how many relations are dropped because their tokens are too far apart
    error_count_rel = 0  # how often the label is something other than ARG0, ARG1, ARG

    with json_loc.open("r", encoding="utf8") as jsonfile:
        length_training_data = len([
            True for line in jsonfile if json.loads(line)["answer"] == "accept"
        ])
        msg.info(f"Number of accepted recipes: {length_training_data}")

    with json_loc.open("r", encoding="utf8") as jsonfile:
        for line in jsonfile:
            example = json.loads(line)  #one recipe
            span_starts = set()

            if example["answer"] == "accept":
                neg = 0
                pos = 0
                try:
                    # Parse the tokens -> example["tokens"] = list of dicts
                    words = [t["text"] for t in example["tokens"]]  # all token texts
                    spaces = [t["ws"] for t in example["tokens"]]  # whether a space follows each token (True/False)
                    doc = Doc(vocab, words=words, spaces=spaces)

                    # Parse the entities
                    spans = example["spans"]  # list of dicts describing the entities
                    entities = []
                    span_end_to_start = {}
                    ents_dict = {}
                    for span in spans:  # every annotated span
                        # "start"/"end" are character offsets of the span in the doc text
                        entity = doc.char_span(
                            span["start"], span["end"], label=span["label"])
                        # map the span's end token index to its start token index
                        span_end_to_start[span["token_end"]] = span["token_start"]
                        entities.append(entity)
                        span_starts.add(span["token_start"])  # remember span start token indices
                        ents_dict[span["token_start"]] = (span["label"], span["token_start"])
                    doc.ents = entities  # assign the collected spans as the doc's entities

                    # Parse the relations
                    rels = {}

                    # create token combinations
                    for x1 in span_starts:

                        #VERBS_TO_OTHER 1a
                        if VERBS_TO_OTHER == True:
                            if ents_dict[x1][0] == "V":  #filter entity type
                                for x2 in span_starts:
                                    if ents_dict[x2][0] in [
                                            "Z", "TOOL", "ATTR", "TEMP",
                                            "DAUER", "ZEITP", "PRÄP"
                                    ]:  #filter entity type

                                        #DIFF_FRONT_BACK 1a
                                        if DIFF_FRONT_BACK == True:

                                            if ((x1 - x2) >= 0 and
                                                (x1 - x2) <= BACK) or (
                                                    (x1 - x2) < 0 and
                                                    (x1 - x2) >= FRONT * -1):
                                                rels[(x1, x2)] = {}

                                            else:
                                                pass
                                        #DIFF_FRONT_BACK 1b
                                        else:
                                            if abs(
                                                    ents_dict[x1][1] -
                                                    ents_dict[x2][1]
                                            ) <= TOKEN_LENGTH:  #filter token distance (match with config?)
                                                rels[(x1, x2)] = {
                                                }  #every possible span combination becomes key for individual dict (1,1), (1,2) ...
                        #VERBS_TO_OTHER 1b
                        else:
                            for x2 in span_starts:
                                #DIFF_FRONT_BACK 2a
                                if DIFF_FRONT_BACK == True:

                                    if ((x1 - x2) >= 0 and
                                        (x1 - x2) <= BACK) or (
                                            (x1 - x2) < 0 and
                                            (x1 - x2) >= FRONT * -1):
                                        rels[(x1, x2)] = {}

                                    else:
                                        pass
                                #DIFF_FRONT_BACK 2b
                                else:
                                    if abs(
                                            ents_dict[x1][1] - ents_dict[x2][1]
                                    ) <= TOKEN_LENGTH:  #filter token distance (match with config?)
                                        rels[(x1, x2)] = {
                                        }  #every possible span combination becomes key for individual dict (1,1), (1,2) ...

                    relations = example["relations"]  # list of relation dicts
                    for relation in relations:
                        # the 'head' and 'child' annotations refer to the end token in the span,
                        # but we want the first token
                        start = span_end_to_start[relation["head"]]  # start token index of the head span
                        end = span_end_to_start[relation["child"]]  # start token index of the child span
                        label = relation["label"]

                        #DETAILED_ARGS 1a
                        if DETAILED_ARGS == True:
                            if label == "ARG0":
                                if ents_dict[end][0] not in ["Z", "TOOL"]:
                                    label = MAP_LABELS_ARG[ents_dict[end][0]]
                                else:
                                    label = MAP_LABELS_ARG0[ents_dict[end][
                                        0]]  #assign new label based on span type
                            elif label == "ARG1":
                                if ents_dict[end][0] not in ["Z", "TOOL"]:
                                    label = MAP_LABELS_ARG[ents_dict[end][0]]
                                else:
                                    label = MAP_LABELS_ARG1[ents_dict[end][0]]
                            elif label == "ARG":
                                if ents_dict[end][0] in ["Z", "TOOL"]:
                                    if ents_dict[end][0] == "Z":
                                        label = "Arg0Z"
                                    elif ents_dict[end][0] == "TOOL":
                                        label = "Arg1Tool"
                                else:
                                    label = MAP_LABELS_ARG[ents_dict[end][0]]
                            else:
                                error_count_rel += 1

                        #DETAILED_ARGS 1b
                        else:
                            label = MAP_LABELS_STANDARD[label]  # map the raw label via the standard label dict

                        # Positive relations are being added
                        try:
                            if label not in rels[(start, end)]:  # label not yet set for this token pair
                                rels[(start, end)][label] = 1.0  # mark as a positive instance
                                pos += 1
                        except KeyError:
                            # the annotated relation is not a valid token combination
                            # (too far apart / not starting from a verb), so it is skipped
                            long_rel_count += 1

                    # The annotation is complete, so fill in zero's where the data is missing
                    for x1 in span_starts:

                        #VERBS_TO_OTHER 2a
                        if VERBS_TO_OTHER == True:
                            if ents_dict[x1][0] == "V":  #filter entity type
                                for x2 in span_starts:
                                    if ents_dict[x2][0] in [
                                            "Z", "TOOL", "ATTR", "TEMP",
                                            "DAUER", "ZEITP", "PRÄP"
                                    ]:  #filter entity type

                                        #DIFF_FRONT_BACK 2a
                                        if DIFF_FRONT_BACK == True:
                                            if ((x1 - x2) >= 0 and
                                                (x1 - x2) <= BACK) or (
                                                    (x1 - x2) < 0 and
                                                    (x1 - x2) >= FRONT * -1):
                                                #DETAILED_ARGS 2a
                                                if DETAILED_ARGS == True:
                                                    merged_labels = list(
                                                        MAP_LABELS_ARG0.values(
                                                        )) + list(
                                                            MAP_LABELS_ARG1.
                                                            values()) + list(
                                                                MAP_LABELS_ARG.
                                                                values())
                                                    for label in merged_labels:
                                                        if label not in rels[(
                                                                x1, x2
                                                        )]:  #if label isn't assigned to span combination
                                                            neg += 1
                                                            rels[(
                                                                x1, x2
                                                            )][label] = 0.0
                                            #DETAILED_ARGS 2b
                                                else:
                                                    for label in MAP_LABELS_STANDARD.values(
                                                    ):  #for every label
                                                        if label not in rels[(
                                                                x1, x2
                                                        )]:  #if label isn't assigned to span combination
                                                            neg += 1
                                                            rels[(
                                                                x1, x2
                                                            )][label] = 0.0

                                        #DIFF_FRONT_BACK 2b
                                        else:
                                            if abs(
                                                    ents_dict[x1][1] -
                                                    ents_dict[x2][1]
                                            ) <= TOKEN_LENGTH:  #filter token distance (match with config?)
                                                #DETAILED_ARGS 3a
                                                if DETAILED_ARGS == True:
                                                    merged_labels = list(
                                                        MAP_LABELS_ARG0.values(
                                                        )) + list(
                                                            MAP_LABELS_ARG1.
                                                            values()) + list(
                                                                MAP_LABELS_ARG.
                                                                values())
                                                    for label in merged_labels:
                                                        if label not in rels[(
                                                                x1, x2
                                                        )]:  #if label isn't assigned to span combination
                                                            neg += 1
                                                            rels[(
                                                                x1, x2
                                                            )][label] = 0.0
                                                #DETAILED_ARGS 3b
                                                else:
                                                    for label in MAP_LABELS_STANDARD.values(
                                                    ):  #for every label
                                                        if label not in rels[(
                                                                x1, x2
                                                        )]:  #if label isn't assigned to span combination
                                                            neg += 1
                                                            rels[(
                                                                x1, x2
                                                            )][label] = 0.0  #span combination with label as key gets 0 as value
                        #VERBS_TO_OTHER 2b
                        else:
                            for x2 in span_starts:
                                #DIFF_FRONT_BACK 3a
                                if DIFF_FRONT_BACK == True:
                                    if ((x1 - x2) >= 0 and
                                        (x1 - x2) <= BACK) or (
                                            (x1 - x2) < 0 and
                                            (x1 - x2) >= FRONT * -1):
                                        #DETAILED_ARGS 4a
                                        if DETAILED_ARGS == True:
                                            merged_labels = list(
                                                MAP_LABELS_ARG0.values()
                                            ) + list(MAP_LABELS_ARG1.values(
                                            )) + list(MAP_LABELS_ARG.values())
                                            for label in merged_labels:
                                                if label not in rels[(
                                                        x1, x2
                                                )]:  #if label isn't assigned to span combination
                                                    neg += 1
                                                    rels[(x1, x2)][label] = 0.0
                                    #DETAILED_ARGS 4b
                                        else:
                                            for label in MAP_LABELS_STANDARD.values(
                                            ):  #for every label
                                                if label not in rels[(
                                                        x1, x2
                                                )]:  #if label isn't assigned to span combination
                                                    neg += 1
                                                    rels[(x1, x2)][label] = 0.0

                                #DIFF_FRONT_BACK 3b
                                else:
                                    if abs(
                                            ents_dict[x1][1] - ents_dict[x2][1]
                                    ) <= TOKEN_LENGTH:  #filter token distance (match with config?)
                                        #DETAILED_ARGS 5a
                                        if DETAILED_ARGS == True:
                                            merged_labels = list(
                                                MAP_LABELS_ARG0.values()
                                            ) + list(MAP_LABELS_ARG1.values(
                                            )) + list(MAP_LABELS_ARG.values())
                                            for label in merged_labels:
                                                if label not in rels[(
                                                        x1, x2
                                                )]:  #if label isn't assigned to span combination
                                                    neg += 1
                                                    rels[(x1, x2)][label] = 0.0
                                        #DETAILED_ARGS 5b
                                        else:
                                            for label in MAP_LABELS_STANDARD.values(
                                            ):  #for every label
                                                if label not in rels[(
                                                        x1, x2
                                                )]:  #if label isn't assigned to span combination
                                                    neg += 1
                                                    rels[(x1, x2)][label] = 0.0

                    #print(rels)
                    doc._.rel = rels  # rels = {(1,1): {Arg0 : 1, Arg1 : 0, Arg : 0}, (1,2): {Arg0 : 0, ...}}

                    # only keeping documents with at least 1 positive case (if doc isn't annotated relations = empty list)
                    if pos > 0:

                        recipe_id = example["_input_hash"]

                        if len(docs["train"]) < round(
                                train_split * length_training_data):
                            ids["train"].add(recipe_id)
                            docs["train"].append(doc)
                            count_pos["train"] += pos
                            count_all["train"] += pos + neg
                        elif len(docs["test"]) < round(
                                test_split * length_training_data):
                            ids["test"].add(recipe_id)
                            docs["test"].append(doc)
                            count_pos["test"] += pos
                            count_all["test"] += pos + neg
                        else:
                            ids["dev"].add(recipe_id)
                            docs["dev"].append(doc)
                            count_pos["dev"] += pos
                            count_all["dev"] += pos + neg

                except KeyError as e:
                    msg.fail(
                        f"Skipping doc because of key error: {e} in {example['_input_hash']}"
                    )

    msg.info(
        f"{long_rel_count} relations have been cut because tokens are too far apart."
    )

    docbin = DocBin(docs=docs["train"], store_user_data=True)
    docbin.to_disk(train_file)
    msg.info(
        f"{len(docs['train'])} training recipes from {len(ids['train'])} unique recipes, "
        f"{count_pos['train']}/{count_all['train']} pos instances.")

    docbin = DocBin(docs=docs["dev"], store_user_data=True)
    docbin.to_disk(dev_file)
    msg.info(
        f"{len(docs['dev'])} dev recipes from {len(ids['dev'])} unique recipes, "
        f"{count_pos['dev']}/{count_all['dev']} pos instances.")

    docbin = DocBin(docs=docs["test"], store_user_data=True)
    docbin.to_disk(test_file)
    msg.info(
        f"{len(docs['test'])} test recipes from {len(ids['test'])} unique recipes, "
        f"{count_pos['test']}/{count_all['test']} pos instances.")
Example No. 12
    def __call__(self, text):
        """Convert a Stanza Doc to a spaCy Doc.

        text (unicode): The text to process.
        RETURNS (spacy.tokens.Doc): The spaCy Doc object.
        """
        if not text:
            return Doc(self.vocab)
        elif text.isspace():
            return Doc(self.vocab, words=[text], spaces=[False])

        snlp_doc = self.snlp(text)
        text = snlp_doc.text
        snlp_tokens, snlp_heads = self.get_tokens_with_heads(snlp_doc)
        words = []
        spaces = []
        pos = []
        tags = []
        morphs = []
        deps = []
        heads = []
        lemmas = []
        offset = 0
        token_texts = [t.text for t in snlp_tokens]
        is_aligned = True
        try:
            words, spaces = self.get_words_and_spaces(token_texts, text)
        except ValueError:
            words = token_texts
            spaces = [True] * len(words)
            is_aligned = False
            warnings.warn(
                "Due to multiword token expansion or an alignment "
                "issue, the original text has been replaced by space-separated "
                "expanded tokens.",
                stacklevel=4,
            )
        offset = 0
        for i, word in enumerate(words):
            if word.isspace() and (i + offset >= len(snlp_tokens)
                                   or word != snlp_tokens[i + offset].text):
                # insert a space token
                pos.append("SPACE")
                tags.append("_SP")
                morphs.append("")
                deps.append("")
                lemmas.append(word)

                # increment any heads left of this position that point beyond
                # this position to the right (already present in heads)
                for j in range(0, len(heads)):
                    if j + heads[j] >= i:
                        heads[j] += 1

                # decrement any heads right of this position that point beyond
                # this position to the left (yet to be added from snlp_heads)
                for j in range(i + offset, len(snlp_heads)):
                    if j + snlp_heads[j] < i + offset:
                        snlp_heads[j] -= 1

                # initial space tokens are attached to the following token,
                # otherwise attach to the preceding token
                if i == 0:
                    heads.append(1)
                else:
                    heads.append(-1)

                offset -= 1
            else:
                token = snlp_tokens[i + offset]
                assert word == token.text

                pos.append(token.upos or "")
                tags.append(token.xpos or token.feats or "")
                morphs.append(token.feats or "")
                deps.append(token.deprel or "")
                heads.append(snlp_heads[i + offset])
                lemmas.append(token.lemma or "")

        doc = Doc(
            self.vocab,
            words=words,
            spaces=spaces,
            pos=pos,
            tags=tags,
            morphs=morphs,
            lemmas=lemmas,
            deps=deps,
            heads=[head + i for i, head in enumerate(heads)],
        )
        ents = []
        for ent in snlp_doc.entities:
            ent_span = doc.char_span(ent.start_char, ent.end_char, ent.type)
            ents.append(ent_span)
        if not is_aligned or not all(ents):
            warnings.warn(
                f"Can't set named entities because of multi-word token "
                f"expansion or because the character offsets don't map to "
                f"valid tokens produced by the Stanza tokenizer:\n"
                f"Words: {words}\n"
                f"Entities: {[(e.text, e.type, e.start_char, e.end_char) for e in snlp_doc.entities]}",
                stacklevel=4,
            )
        else:
            doc.ents = ents

        if self.svecs is not None:
            doc.user_token_hooks["vector"] = self.token_vector
            doc.user_token_hooks["has_vector"] = self.token_has_vector
        return doc
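Note: in spaCy v3 the Doc constructor accepts these annotation lists directly, which is what the wrapper above relies on; a small self-contained illustration with made-up values:

import spacy
from spacy.tokens import Doc

nlp = spacy.blank("en")
doc = Doc(
    nlp.vocab,
    words=["Berlin", "is", "nice"],
    spaces=[True, True, False],
    pos=["PROPN", "AUX", "ADJ"],
    lemmas=["Berlin", "be", "nice"],
    heads=[1, 1, 1],                  # absolute token indices, as built above
    deps=["nsubj", "ROOT", "acomp"],
)
print([(t.text, t.pos_, t.dep_, t.head.text) for t in doc])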
Example No. 13
    def match(
        self,
        doc: Doc,
        regex_str: str,
        partial: bool = True,
        predef: bool = False,
    ) -> List[Tuple[int, int]]:
        """Returns all the regex matches within doc.

        Matches on the character level and then maps matches back
        to tokens. If a character cannot be mapped back to a token,
        it is a whitespace character that tokens are split on, which
        happens when a regex match includes leading or trailing
        whitespace. Make sure your regex pattern does not do this.

        To utilize regex flags, use inline flags.

        Args:
            doc: Doc object to search over.
            regex_str: A string to be compiled to regex,
                or the key name of a predefined regex pattern.
            partial: Whether partial matches should be extended
                to existing span boundaries in doc or not, i.e.
                the regex only matches part of a token or span.
                Default is True.
            predef: Whether regex should be interpreted as a key to
                a predefined regex pattern or not. Default is False.
                The included regexes are:
                "dates"
                "times"
                "phones"
                "phones_with_exts"
                "links"
                "emails"
                "ips"
                "ipv6s"
                "prices"
                "hex_colors"
                "credit_cards"
                "btc_addresses"
                "street_addresses"
                "zip_codes"
                "po_boxes"
                "ssn_number".

        Returns:
            A list of span start index and end index pairs as tuples.

        Raises:
            TypeError: If regex_str is not a string.

        Example:
            >>> import spacy
            >>> from spaczz.regex import RegexSearcher
            >>> nlp = spacy.blank("en")
            >>> searcher = RegexSearcher()
            >>> doc = nlp.make_doc("My phone number is (555) 555-5555.")
            >>> searcher.match(doc, "phones", predef=True)
            [(4, 10)]
        """
        if isinstance(regex_str, str):
            compiled_regex = self._config.parse_regex(regex_str, predef)
        else:
            raise TypeError(f"regex_str must be a str, not {type(regex_str)}.")
        matches = []
        chars_to_tokens = map_chars_to_tokens(doc)
        for match in compiled_regex.finditer(doc.text):
            start, end = match.span()
            span = doc.char_span(start, end)
            if span:
                matches.append(span)
            else:
                if partial:
                    start_token = chars_to_tokens.get(start)
                    end_token = chars_to_tokens.get(end)
                    if start_token is not None and end_token is not None:  # 0 is a valid token index
                        span = Span(doc, start_token, end_token + 1)
                        matches.append(span)
        if matches:
            return [(match.start, match.end) for match in matches]
        else:
            return []
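Note: the docstring's remark about inline flags refers to standard regex syntax; a quick standard-library illustration of what that looks like:

import re

# "(?i)" switches on case-insensitive matching inline, equivalent to re.IGNORECASE
pattern = re.compile(r"(?i)phone number")
print(bool(pattern.search("My PHONE NUMBER is (555) 555-5555.")))  # True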
Example No. 14
    def __call__(self, text):
        """Convert a Stanza Doc to a spaCy Doc.

        text (unicode): The text to process.
        RETURNS (spacy.tokens.Doc): The spaCy Doc object.
        """
        if not text:
            return Doc(self.vocab)
        elif text.isspace():
            return Doc(self.vocab, words=[text], spaces=[False])

        snlp_doc = self.snlp(text)
        text = snlp_doc.text
        snlp_tokens, snlp_heads = self.get_tokens_with_heads(snlp_doc)
        words = []
        spaces = []
        pos = []
        tags = []
        deps = []
        heads = []
        lemmas = []
        offset = 0
        token_texts = [t.text for t in snlp_tokens]
        is_aligned = True
        try:
            words, spaces = self.get_words_and_spaces(token_texts, text)
        except ValueError:
            words = token_texts
            spaces = [True] * len(words)
            is_aligned = False
            warnings.warn(
                "Due to multiword token expansion or an alignment "
                "issue, the original text has been replaced by space-separated "
                "expanded tokens.",
                stacklevel=4,
            )
        offset = 0
        for i, word in enumerate(words):
            if word.isspace() and word != snlp_tokens[i + offset].text:
                # insert a space token
                pos.append(self.vocab.strings.add("SPACE"))
                tags.append(self.vocab.strings.add("_SP"))
                deps.append(self.vocab.strings.add(""))
                lemmas.append(self.vocab.strings.add(word))

                # increment any heads left of this position that point beyond
                # this position to the right (already present in heads)
                for j in range(0, len(heads)):
                    if j + heads[j] >= i:
                        heads[j] += 1

                # decrement any heads right of this position that point beyond
                # this position to the left (yet to be added from snlp_heads)
                for j in range(i + offset, len(snlp_heads)):
                    if j + snlp_heads[j] < i + offset:
                        snlp_heads[j] -= 1

                # initial space tokens are attached to the following token,
                # otherwise attach to the preceding token
                if i == 0:
                    heads.append(1)
                else:
                    heads.append(-1)

                offset -= 1
            else:
                token = snlp_tokens[i + offset]
                assert word == token.text

                pos.append(self.vocab.strings.add(token.upos or ""))
                tags.append(
                    self.vocab.strings.add(token.xpos or token.feats or ""))
                deps.append(self.vocab.strings.add(token.deprel or ""))
                heads.append(snlp_heads[i + offset])
                lemmas.append(self.vocab.strings.add(token.lemma or ""))

        attrs = [POS, TAG, DEP, HEAD]
        array = numpy.array(list(zip(pos, tags, deps, heads)), dtype="uint64")
        doc = Doc(self.vocab, words=words,
                  spaces=spaces).from_array(attrs, array)
        ents = []
        for ent in snlp_doc.entities:
            ent_span = doc.char_span(ent.start_char, ent.end_char, ent.type)
            ents.append(ent_span)
        if not is_aligned or not all(ents):
            warnings.warn(
                f"Can't set named entities because of multi-word token "
                f"expansion or because the character offsets don't map to "
                f"valid tokens produced by the Stanza tokenizer:\n"
                f"Words: {words}\n"
                f"Entities: {[(e.text, e.type, e.start_char, e.end_char) for e in snlp_doc.entities]}",
                stacklevel=4,
            )
        else:
            doc.ents = ents
        # Overwrite lemmas separately to prevent them from being overwritten by spaCy
        lemma_array = numpy.array([[lemma] for lemma in lemmas],
                                  dtype="uint64")
        doc.from_array([LEMMA], lemma_array)
        if any(pos) or any(tags):
            doc.is_tagged = True
        if any(deps) or any(heads):
            doc.is_parsed = True
        return doc
Example No. 15
    def __call__(self, text):
        """Convert a Stanza Doc to a spaCy Doc.

        text (unicode): The text to process.
        RETURNS (spacy.tokens.Doc): The spaCy Doc object.
        """
        snlp_doc = self.snlp(text) if text else Document("")
        text = snlp_doc.text
        tokens, heads = self.get_tokens_with_heads(snlp_doc)
        if not len(tokens):
            return Doc(self.vocab)
        words = []
        spaces = []
        pos = []
        tags = []
        deps = []
        lemmas = []
        offset = 0
        is_aligned = self.check_aligned(text, tokens)
        for i, token in enumerate(tokens):
            span = text[offset:]
            if not len(span):
                break
            while len(span) and span[0].isspace():
                # If we encounter leading whitespace, skip one character ahead
                offset += 1
                span = text[offset:]
            words.append(token.text)
            # Make sure all strings are in the vocabulary
            pos.append(self.vocab.strings.add(token.upos or ""))
            tags.append(self.vocab.strings.add(token.xpos or ""))
            deps.append(self.vocab.strings.add(token.deprel or ""))
            lemmas.append(self.vocab.strings.add(token.lemma or ""))
            offset += len(token.text)
            span = text[offset:]
            if i == len(tokens) - 1:
                spaces.append(False)
            elif not is_aligned:
                spaces.append(True)
            else:
                next_token = tokens[i + 1]
                spaces.append(not span.startswith(next_token.text))
        attrs = [POS, TAG, DEP, HEAD]
        array = numpy.array(list(zip(pos, tags, deps, heads)), dtype="uint64")
        doc = Doc(self.vocab, words=words,
                  spaces=spaces).from_array(attrs, array)
        ents = []
        for ent in snlp_doc.entities:
            ent_span = doc.char_span(ent.start_char, ent.end_char, ent.type)
            ents.append(ent_span)
        if not all(ents):
            warnings.warn(
                f"Can't set named entities because the character offsets don't "
                f"map to valid tokens produced by the Stanza tokenizer:\n"
                f"Words: {words}\n"
                f"Entities: {[(e.text, e.type, e.start_char, e.end_char) for e in snlp_doc.entities]}",
                stacklevel=4,
            )
        else:
            doc.ents = ents
        # Overwrite lemmas separately to prevent them from being overwritten by spaCy
        lemma_array = numpy.array([[lemma] for lemma in lemmas],
                                  dtype="uint64")
        doc.from_array([LEMMA], lemma_array)
        if any(pos) and any(tags):
            doc.is_tagged = True
        if any(deps):
            doc.is_parsed = True
        return doc