Example No. 1
    def __call__(
        self,
        value,
        positions=False,
        chars=False,
        keeporiginal=False,
        removestops=True,
        start_pos=0,
        start_char=0,
        mode="",
        **kwargs
    ):
        assert isinstance(value, text_type), "%r is not unicode" % value
        t = CylleneusToken(
            positions, chars, removestops=removestops, mode=mode, **kwargs
        )
        t.text = value
        t.boost = 1.0

        if keeporiginal:
            t.original = value
        if positions:
            t.pos = start_pos + 1
        if chars:
            t.startchar = start_char
            t.endchar = start_char + len(value)
        yield t
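
A minimal usage sketch for this first variant, assuming the method above belongs to a whoosh-style Tokenizer subclass (the wrapper name SimpleCylleneusTokenizer is invented for illustration) and that CylleneusToken behaves like whoosh.analysis.Token:

    # Hypothetical driver: the whole input string is emitted as a single token.
    tokenizer = SimpleCylleneusTokenizer()
    for tok in tokenizer("arma uirumque", positions=True, chars=True):
        print(tok.text, tok.pos, tok.startchar, tok.endchar)
    # expected: "arma uirumque 1 0 13" (pos is start_pos + 1)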
Example No. 2
    def __call__(
        self,
        data,
        positions=True,
        chars=True,
        keeporiginal=True,
        removestops=True,
        tokenize=True,
        start_pos=0,
        start_char=0,
        mode="",
        **kwargs,
    ):
        if kwargs.get("docix", None) == self._docix and self._cache:
            yield from self.cache
        else:
            t = CylleneusToken(positions,
                               chars,
                               removestops=removestops,
                               mode=mode,
                               **kwargs)

            if t.mode == "query":
                t.original = t.text = data
                yield t
            else:
                self._cache = []
                self._docix = kwargs.get("docix", None)

                if not tokenize:
                    t.original = ""
                    for token in data["text"].iter("token"):
                        form = token.get("form")
                        if not form:
                            continue
                        t.original += f"{form}"
                    t.text = t.original
                    t.boost = 1.0
                    if positions:
                        t.pos = start_pos
                    if chars:
                        t.startchar = start_char
                        t.endchar = start_char + len(t.original)
                    yield t
                else:
                    for sentence in data["text"].iter("sentence"):
                        sect_pos = -1
                        curr_line = None
                        for pos, token in enumerate(sentence.iter("word")):
                            if token.get("artificial", False):
                                continue

                            form = token.get("form")
                            if not form:
                                continue
                            t.text = form

                            lemma = token.get("lemma")
                            if not lemma or lemma in (
                                    "???",
                                    ".",
                                    ",",
                                    ";",
                                    "·",
                                    "punc1",
                                    "comma1",
                                    "PERIOD1",
                            ):
                                continue
                            t.lemma = lemma

                            t.morpho = agldt2wn(token.get("postag"))
                            t.morphosyntax = token.get("relation", None)
                            t.boost = 1.0

                            meta = {"meta": data["meta"].lower()}
                            divs = data["meta"].split("-")

                            refs = (token.get("cite").rsplit(
                                ":", maxsplit=1)[1].split("."))
                            for i, div in enumerate(divs):
                                meta[div] = refs[i]
                            meta["sent_id"] = sentence.get("id")
                            meta["sent_pos"] = str(int(token.get("id")))

                            if curr_line and refs[-1] > curr_line:
                                sect_pos = 0
                            else:
                                sect_pos += 1
                            curr_line = refs[-1]

                            meta["sect_pos"] = sect_pos  # ref in line
                            t.meta = meta

                            if keeporiginal:
                                t.original = f"{form}"
                            t.stopped = False
                            if positions:
                                t.pos = start_pos + pos
                            if chars:
                                t.startchar = start_char
                                original_len = len(form)

                                t.endchar = start_char + original_len
                            if self.cached:
                                self._cache.append(copy.copy(t))
                            yield t

                            start_char += len(form)
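
A possible usage sketch for this treebank variant, assuming the method sits on a cached Tokenizer subclass (TreebankTokenizer is an invented name), that data["text"] is a parsed AGLDT-style treebank with <sentence> and <word> elements carrying form/lemma/postag/cite attributes, and that data["meta"] names the citation levels (e.g. "book-line"):

    from xml.etree import ElementTree as etree

    # Hypothetical input file; any AGLDT-style treebank with the attributes above.
    data = {"text": etree.parse("treebank.xml"), "meta": "book-line"}
    tokenizer = TreebankTokenizer()
    for tok in tokenizer(data, mode="index", docix=0):
        print(tok.text, tok.lemma, tok.morpho, tok.meta["sent_id"])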
Example No. 3
    def __call__(self,
                 data,
                 positions=True,
                 chars=True,
                 keeporiginal=True,
                 removestops=True,
                 tokenize=True,
                 start_pos=0,
                 start_char=0,
                 mode="",
                 **kwargs):
        if kwargs.get("docix", None) == self._docix and self._cache:
            yield from self.cache
        else:
            t = CylleneusToken(positions,
                               chars,
                               removestops=removestops,
                               mode=mode,
                               **kwargs)

            if t.mode == "query":
                t.original = data
                t.text = data.translate(jvmap)
                yield t
            else:
                self._cache = []
                self._docix = kwargs.get("docix", None)

                if tokenize:
                    punc = str.maketrans("", "", string.punctuation)

                    tags = data["meta"].split("-")
                    meta = {"meta": data["meta"].lower()}
                    meta.update({tag: "-" for tag in tags})

                    divs = ["div1", "div2", "div3", "div4", "div5"]

                    sect_sent = 0
                    sect_pos = 0
                    sent_id = 0
                    pos = 0

                    for el in data["text"].find("text").find("body").iter():
                        if el.tag in divs:
                            current_div_ix = divs.index(el.tag)
                            meta[tags[current_div_ix]] = el.get("n", "-")
                            sect_sent = 0
                            sect_pos = 0
                        elif el.tag in ["head", "p", "l"]:
                            sent_id += 1
                            sect_sent += 1
                            if el.text:
                                text = el.text
                            else:
                                text = "".join([
                                    subel.text + subel.tail
                                    for subel in el.iter()
                                    if subel.tag != el.tag
                                ])
                            subs = [
                                r"<note>(.*?)</note>",
                                r'<sic corr="(\w+?)">\w+?</sic>',
                                r'<reg orig="\w+?">(\w+?)</reg>',
                            ]
                            for sub in subs:
                                text = re.sub(sub, r"\1", text)

                            tokens = word_tokenizer.word_tokenize(text)
                            for i, token in enumerate(tokens):
                                pos += 1
                                sect_pos += 1

                                t.text = (token.translate(
                                    punc).lower().translate(jvmap))
                                if not t.text or t.text in string.whitespace:
                                    start_char += 1
                                    continue

                                t.boost = 1.0

                                meta["sent_id"] = sent_id
                                meta["sent_pos"] = i
                                meta["sect_sent"] = sect_sent
                                meta["sect_pos"] = sect_pos

                                t.meta = copy.copy(meta)

                                if keeporiginal:
                                    t.original = token
                                t.stopped = False
                                if positions:
                                    t.pos = start_pos + pos

                                is_enclitic = False
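                                # Enclitic splitting: a token ending in one of the module-level
                                # `enclitics` (defined elsewhere; for Latin these are typically
                                # forms such as "que", "ne", "ue", "st") is yielded as two tokens,
                                # the host word and the enclitic, each with its own character span.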
                                for enclitic in enclitics:
                                    if token.endswith(enclitic):
                                        if enclitic == "n":
                                            t.text = (token[:-len(enclitic)] +
                                                      "s")
                                            t.startchar = start_char
                                            t.endchar = (start_char +
                                                         len(token) -
                                                         len(enclitic))
                                            if mode == "index":
                                                self._cache.append(
                                                    copy.deepcopy(t))
                                            yield t
                                            t.text = "ne"
                                            t.startchar = start_char + len(
                                                token[:-len(enclitic)])
                                            t.endchar = (
                                                start_char +
                                                len(token[:-len(enclitic)]) +
                                                len(enclitic))
                                            if mode == "index":
                                                self._cache.append(
                                                    copy.deepcopy(t))
                                            yield t
                                        elif enclitic == "ne":
                                            t.text = token[:-len(enclitic)]
                                            t.startchar = start_char
                                            t.endchar = start_char + (
                                                len(token) - len(enclitic))
                                            if mode == "index":
                                                self._cache.append(
                                                    copy.deepcopy(t))
                                            yield t
                                            t.text = "ne"
                                            t.startchar = start_char + len(
                                                token[:-len(enclitic)])
                                            t.endchar = (
                                                start_char +
                                                len(token[:-len(enclitic)]) +
                                                len(enclitic))
                                            if mode == "index":
                                                self._cache.append(
                                                    copy.deepcopy(t))
                                            yield t
                                        elif enclitic == "st":
                                            if token.endswith("ust"):
                                                t.text = token[:-len(enclitic)]
                                                t.startchar = start_char
                                                t.endchar = (start_char + len(
                                                    token[:-len(enclitic)]) -
                                                             len(enclitic))
                                                if mode == "index":
                                                    self._cache.append(
                                                        copy.deepcopy(t))
                                                yield t
                                                t.text = "est"
                                                t.startchar = start_char + len(
                                                    token[:-len(enclitic)])
                                                t.endchar = (start_char + len(
                                                    token[:-len(enclitic)]) +
                                                             len(enclitic))
                                                if mode == "index":
                                                    self._cache.append(
                                                        copy.deepcopy(t))
                                                yield t
                                            else:
                                                t.text = token[:-len(enclitic)]
                                                t.startchar = start_char
                                                t.endchar = (start_char + len(
                                                    token[:-len(enclitic)]) -
                                                             len(enclitic))
                                                if mode == "index":
                                                    self._cache.append(
                                                        copy.deepcopy(t))
                                                yield t
                                                t.text = "est"
                                                t.startchar = start_char + len(
                                                    token[:-len(enclitic)])
                                                t.endchar = (start_char + len(
                                                    token[:-len(enclitic)]) +
                                                             len(enclitic))
                                                if mode == "index":
                                                    self._cache.append(
                                                        copy.deepcopy(t))
                                                yield t
                                        elif enclitic == "'s":
                                            t.text = token + "s"
                                            t.startchar = start_char
                                            t.endchar = start_char + len(token)
                                            if mode == "index":
                                                self._cache.append(
                                                    copy.deepcopy(t))
                                            yield t
                                            t.text = "es"
                                            t.startchar = (start_char +
                                                           len(token) + 1)
                                            t.endchar = (start_char +
                                                         len(token) +
                                                         len(enclitic))
                                            if mode == "index":
                                                self._cache.append(
                                                    copy.deepcopy(t))
                                            yield t
                                        else:
                                            t.text = token[:-len(enclitic)]
                                            t.startchar = start_char
                                            t.endchar = start_char + len(
                                                token[:-len(enclitic)])
                                            if mode == "index":
                                                self._cache.append(
                                                    copy.deepcopy(t))
                                            yield t
                                            t.text = enclitic
                                            t.startchar = start_char + len(
                                                token[:-len(enclitic)])
                                            t.endchar = (
                                                start_char +
                                                len(token[:-len(enclitic)]) +
                                                len(enclitic))
                                            if mode == "index":
                                                self._cache.append(
                                                    copy.deepcopy(t))
                                            yield t
                                        is_enclitic = True
                                        break

                                if not is_enclitic:
                                    if chars:
                                        t.startchar = start_char
                                        original_len = len(token)
                                        t.endchar = start_char + original_len
                                    if self.cached:
                                        self._cache.append(copy.copy(t))
                                    yield t

                                start_char += len(token)
                else:
                    t.original = ""
                    for el in data["text"].find("text").find("body").iter():
                        if el.tag in ["head", "p", "l"]:
                            if el.text:
                                text = el.text
                            else:
                                text = "".join([
                                    subel.text + subel.tail
                                    for subel in el.iter()
                                    if subel.tag != el.tag
                                ])
                            subs = [
                                r"<note>(.*?)</note>",
                                r'<sic corr="(\w+?)">\w+?</sic>',
                                r'<reg orig="\w+?">(\w+?)</reg>',
                            ]
                            for sub in subs:
                                text = re.sub(sub, r"\1", text)
                            t.original += text
                    t.text = t.original
                    t.boost = 1.0
                    if positions:
                        t.pos = start_pos
                    if chars:
                        t.startchar = start_char
                        t.endchar = start_char + len(t.original)
                    yield t
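
A possible usage sketch for this variant, assuming it belongs to a cached Tokenizer subclass (LatinXMLTokenizer is an invented name), that data["text"] is a parsed XML tree whose <text><body> contains div1..div5 sections and <head>/<p>/<l> elements, that data["meta"] names the citation levels, and that word_tokenizer, jvmap and enclitics are the module-level helpers referenced above:

    from xml.etree import ElementTree as etree

    # Hypothetical input file with poem/line divisions.
    data = {"text": etree.parse("carmina.xml"), "meta": "poem-line"}
    tokenizer = LatinXMLTokenizer()
    for tok in tokenizer(data, mode="index", docix=0):
        print(tok.text, tok.meta["sent_id"], tok.meta["sect_pos"])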
Example No. 4
    def __call__(
        self,
        data,
        positions=True,
        chars=True,
        keeporiginal=True,
        removestops=True,
        tokenize=True,
        start_pos=0,
        start_char=0,
        mode="",
        **kwargs,
    ):
        if kwargs.get("docix", None) == self._docix and self._cache:
            yield from self.cache
        else:
            t = CylleneusToken(positions,
                               chars,
                               removestops=removestops,
                               mode=mode,
                               **kwargs)

            if t.mode == "query":
                t.original = data
                t.text = normalize("NFKC", data)
                yield t
            else:
                self._cache = []
                self._docix = kwargs.get("docix", None)

                if tokenize:
                    titleStmt = data.find('.//teiHeader').find(
                        'fileDesc').find('titleStmt')
                    auth_code = f"tlg{titleStmt.find('tlgAuthor').text}"
                    work_code = f"tlg{titleStmt.find('tlgId').text}"

                    body = data.find('.//text').find('body')

                    divs = AUTHOR_TAB[auth_code]["works"][work_code]["meta"]

                    meta = {"meta": divs}
                    divv = divs.split("-")
                    for k in divv:
                        meta[k] = None

                    sect_sent = 0
                    sect_pos = 0
                    current_refs = None
                    pos = 0
                    for sentence in body.iter("sentence"):
                        refs = sentence.get("location")
                        if refs != current_refs:
                            current_refs = refs
                            sect_pos = 0
                            sect_sent = 0
                        sent_id = sentence.get("id")
                        sect_sent += 1

                        for i, ref in enumerate(refs.split(".")):
                            meta[divv[i]] = ref

                        for sent_pos, word in enumerate(sentence.iter("word")):
                            t.boost = 1.0

                            sect_pos += 1
                            pos += 1

                            lemma = word.find("lemma").get("entry", None)
                            t.lemma = normalize("NFKC", lemma)

                            meta["sent_id"] = sent_id
                            meta["sent_pos"] = word.get("id")
                            meta["sect_pos"] = str(sect_pos)
                            meta["sect_sent"] = str(sect_sent)
                            t.meta = copy.copy(meta)

                            beta = word.get("form").upper()
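                            # The surface form is stored as Beta Code; appending "\n" when the
                            # form ends in "S" presumably makes beta2unicode() treat the sigma
                            # as word-final and emit "ς" rather than medial "σ".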
                            form = normalize(
                                "NFKC",
                                beta2unicode(
                                    beta +
                                    "\n" if beta.endswith("S") else beta))
                            if (t.lemma.istitle()):
                                form = form.title()
                            t.text = form

                            if keeporiginal:
                                t.original = beta
                            t.stopped = False
                            if positions:
                                t.pos = start_pos + pos

                            original_len = len(form)
                            if chars:
                                t.startchar = start_char
                                t.endchar = start_char + original_len
                            start_char += original_len

                            POS = word.find("lemma").get("POS", None)
                            analyses = [
                                analysis.get("morph", None) for analysis in
                                word.find("lemma").iter("analysis")
                            ]
                            morphos = []
                            for analysis in analyses:
                                morphos += diorisis2wn(POS, analysis)
                            t.morpho = " ".join(morphos)

                            if self.cached:
                                self._cache.append(copy.deepcopy(t))
                            yield t
                else:
                    body = data.find('.//text').find('body')

                    tokens = []
                    for sentence in body.iter("sentence"):
                        for word in sentence.iter("word"):
                            form = word.get("form")
                            if not form:
                                continue
                            else:
                                tokens.append(form)
                    t.original = t.text = " ".join(tokens)
                    t.boost = 1.0
                    if positions:
                        t.pos = start_pos
                    if chars:
                        t.startchar = start_char
                        t.endchar = start_char + len(t.original)
                    yield t
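
A possible usage sketch for this Diorisis variant, assuming the method lives on a cached Tokenizer subclass (DiorisisTokenizer is an invented name), that data is the parsed XML tree of a Diorisis corpus file (teiHeader with tlgAuthor/tlgId, <sentence>/<word> elements with <lemma> and <analysis> children), and that AUTHOR_TAB, beta2unicode() and diorisis2wn() are the module-level helpers used above:

    from xml.etree import ElementTree as etree

    # Hypothetical Diorisis file name.
    data = etree.parse("diorisis_work.xml")
    tokenizer = DiorisisTokenizer()
    for tok in tokenizer(data, mode="index", docix=0):
        print(tok.text, tok.lemma, tok.morpho, tok.meta["sent_id"])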
Example No. 5
    def __call__(
        self,
        value,
        positions=True,
        chars=True,
        keeporiginal=True,
        removestops=True,
        tokenize=True,
        start_pos=0,
        start_char=0,
        mode="",
        **kwargs,
    ):
        if kwargs.get("docix") == self._docix and self._cache:
            yield from self.cache
        else:
            t = CylleneusToken(positions,
                               chars,
                               removestops=removestops,
                               mode=mode,
                               **kwargs)

            if t.mode == "query":
                t.original = t.text = value.translate(jvmap)
                yield t
            else:
                if not tokenize:
                    t.original = t.text = value["text"]
                    t.boost = 1.0
                    if positions:
                        t.pos = start_pos
                    if chars:
                        t.startchar = start_char
                        t.endchar = start_char + len(value["text"])
                    yield t
                else:
                    self._cache = []
                    self._docix = kwargs.get("docix", None)

                    word_tokenizer = PunktLatinCharsVars()
                    stopchars = str.maketrans(
                        "",
                        "",
                        string.punctuation.replace("&", "").replace("^", "") +
                        "†“”—\n\ŕ",
                    )

                    divs = {
                        i: div.lower()
                        for i, div in enumerate(value["meta"].split("-"))
                    }

                    lines = iter(value["text"].split("\n"))
                    tpos = start_pos
                    xtitle = ytitle = ztitle = speaker = ""
                    buffer = deque()
                    for line in lines:

                        def parse_phi_line(_line):
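                            # Parses one PHI line of the form "citation<TAB>text": the citation
                            # splits into five dot-separated levels (v.w.x.y.z); title and speaker
                            # headings are recorded, words hyphenated across line breaks are
                            # rejoined via the buffer, and split perfect participles are recombined
                            # with a following copula before (ref, tokens) pairs are yielded.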
                            result = []
                            nonlocal xtitle, ytitle, ztitle, speaker, buffer
                            try:
                                ref, text = _line.split("\t")
                            except ValueError:
                                result.append((None, None))
                            else:
                                v, w, x, y, z = ref.rstrip(".").split(".")
                                offset = 0
                                # d is a number, followed by -, t, a then possibly another number or . for a title
                                # d can be 'opinc' 'sedinc' 'dub', 'inc',
                                # c can be 'Summ'
                                if x == "t":
                                    xtitle = text.translate(stopchars).strip()
                                if y == "t":
                                    if z:
                                        ytitle = text.translate(
                                            stopchars).strip()
                                    else:
                                        speaker = text.translate(
                                            stopchars).strip()
                                    result.append((None, [text]))
                                elif z == "t":
                                    ztitle = text.translate(stopchars).strip()
                                    result.append((None, [text]))
                                elif "        {" in text:
                                    result.append((None, [text]))
                                else:
                                    temp_tokens = word_tokenizer.word_tokenize(
                                        text)
                                    if temp_tokens:
                                        if (temp_tokens[0].replace("j", "i").replace("v", "u")
                                                not in proper_names.proper_names):
                                            temp_tokens[0] = temp_tokens[0].lower()

                                        if (temp_tokens[-1].endswith(".") and
                                                temp_tokens[-1] != ". . ."):
                                            final_word = temp_tokens[-1][:-1]
                                            del temp_tokens[-1]
                                            temp_tokens += [final_word, "."]

                                        if temp_tokens[-1].endswith("-"):
                                            buffer += list(
                                                parse_phi_line(next(lines)))
                                            new_ref, new_tokens = buffer.pop()
                                            merged_word = (
                                                "2&" + temp_tokens[-1][:-1] +
                                                new_tokens[0])
                                            del temp_tokens[-1]
                                            temp_tokens += [merged_word]
                                            del new_tokens[0]
                                            if new_tokens:
                                                if (new_tokens[0]
                                                        in string.punctuation):
                                                    new_token = (
                                                        f"^1{new_tokens[0]}")
                                                    del new_tokens[0]
                                                    new_tokens.insert(
                                                        0, new_token)
                                                buffer.appendleft(
                                                    (new_ref, new_tokens))

                                        for ix, token in enumerate(
                                                temp_tokens):
                                            if temp_tokens[ix] == ". . .":
                                                temp_tokens.insert(
                                                    ix + 1, "&1")
                                            if "&" in token:
                                                ppp = compound.is_ppp(
                                                    re.sub(
                                                        r"[&\d]", "", token))
                                            else:
                                                ppp = compound.is_ppp(token)
                                            if ppp:
                                                if ix == len(temp_tokens) - 1:
                                                    if not buffer:
                                                        try:
                                                            buffer += list(
                                                                parse_phi_line(
                                                                    next(lines)
                                                                ))
                                                        except StopIteration:
                                                            continue
                                                    if "&" in buffer[0][1][0]:
                                                        copula = compound.is_copula(
                                                            buffer[0][1][0]
                                                            [2:])
                                                    else:
                                                        copula = compound.is_copula(
                                                            buffer[0][1][0])
                                                else:
                                                    copula = compound.is_copula(
                                                        temp_tokens[ix + 1])

                                                if copula and ppp[1] == copula[2]:
                                                    tense, mood, number, i = copula
                                                    if buffer:
                                                        token = f"{token} &2{compound.copula[tense][mood][number][i]}"
                                                    else:
                                                        token = f"{token} {compound.copula[tense][mood][number][i]}"
                                                    del temp_tokens[ix]
                                                    if buffer:
                                                        del buffer[0][1][0]
                                                    else:
                                                        del temp_tokens[ix]
                                                    temp_tokens.insert(
                                                        ix, token)
                                                    if ix != len(temp_tokens) - 1:
                                                        if temp_tokens[ix + 1] in string.punctuation:
                                                            new_token = f"^1{temp_tokens[ix + 1]} "
                                                            del temp_tokens[ix + 1]
                                                            temp_tokens.insert(ix + 1, new_token)
                                    if buffer:
                                        for i in range(len(buffer)):
                                            result.append(buffer.pop())
                                    result.append(
                                        ((v, w, x, y, z), temp_tokens))
                            yield from result

                        result = list(parse_phi_line(line))
                        act = scene = None
                        for ref, tokens in reversed(result):
                            enjambed = False
                            if not ref and not tokens:
                                start_char += len(line) + 1
                                continue
                            elif not ref:
                                text = tokens[0].strip().strip("{}")
                                if re.match(
                                        r"[IVXLDMivxldm]+\.[IVXLDMivxldm]+",
                                        text):
                                    act, scene = text.split(".")
                                    act = str(roman_to_arabic(act))
                                    scene = str(roman_to_arabic(scene))
                                start_char += len(line.split("\t")[1]) + 1
                                continue
                            notoken = 0

                            skip = False
                            for line_pos, token in enumerate(tokens):
                                if token == "{" or token == "}":
                                    skip = not skip
                                    start_char += len(token)
                                    continue
                                if skip:
                                    speaker = token.replace("v", "u")
                                    start_char += len(token)
                                    continue

                                offset = 0
                                line_pos -= notoken

                                meta = {}
                                # extra['meta'] = value['meta'].lower()
                                # setattr(t, 'meta', value['meta'].lower())
                                for i in range(len(divs)):
                                    meta[divs[len(divs) - (i + 1)]] = ref[-(
                                        5 - (5 - (i + 1)))].strip("t")
                                    # setattr(t, divs[len(divs) - (i + 1)], ref[-(5 - (5 - (i + 1)))].strip('t'))
                                    if xtitle:
                                        if len(divs) >= 3:
                                            meta[
                                                f"{divs[len(divs) - 3]}_title"] = xtitle
                                            # setattr(t, f"{divs[len(divs)-3]}_title", xtitle)
                                    if ytitle:
                                        if len(divs) >= 2:
                                            meta[
                                                f"{divs[len(divs) - 2]}_title"] = ytitle
                                            # setattr(t, f"{divs[len(divs)-2]}_title", ytitle)
                                    if ztitle:
                                        if len(divs) >= 1:
                                            meta[
                                                f"{divs[len(divs) - 1]}_title"] = ztitle
                                            # setattr(t, f"{divs[len(divs)-1]}_title", ztitle)
                                if act:
                                    meta["act"] = act
                                if scene:
                                    meta["scene"] = scene
                                # if speaker:
                                #     t.speaker = speaker
                                t.boost = 1.0

                                pre = re.search(r"^\^(\d+?)", token)
                                if pre:
                                    start_char -= int(pre.group(1))
                                    token = re.sub(r"^\^\d+?", "", token)
                                pre = re.search(r"^&(\d+?)", token)
                                if pre:
                                    start_char += int(pre.group(1))
                                    token = re.sub(r"^&\d+?", "", token)
                                if keeporiginal:
                                    t.original = token
                                t.stopped = False
                                original_length = len(token)

                                ltoken = token.lstrip(string.punctuation)
                                ldiff = original_length - len(ltoken)
                                if ldiff != 0:
                                    token = ltoken
                                rtoken = token.rstrip(string.punctuation)
                                rdiff = len(token) - len(rtoken)
                                if rdiff != 0:
                                    token = rtoken
                                ntoken = token.translate(stopchars)
                                ndiff = len(token) - len(ntoken)
                                if ndiff:
                                    token = ntoken
                                if not re.match(
                                        r"(?:[\d]&)?[\w]+\s(?:&[\d])?[\w]+",
                                        token):
                                    token = token.replace(" ", "")
                                if not token:
                                    start_char += original_length
                                    notoken += 1
                                    continue
                                else:
                                    if positions:
                                        meta["line_pos"] = line_pos
                                        t.pos = tpos
                                    t.meta = meta

                                    if (token not in exceptions
                                            and token.lower() not in exceptions
                                            and re.sub(r"\d&|&\d", "", token)
                                            not in exceptions):
                                        if token in replacements:  # t.original
                                            for subtoken in replacements[
                                                    token]:
                                                t.text = subtoken.lower()
                                                t.startchar = start_char
                                                t.endchar = (start_char +
                                                             original_length)
                                                if mode == "index":
                                                    if self.cached:
                                                        self._cache.append(
                                                            copy.copy(t))
                                                yield t
                                            start_char += original_length
                                            tpos += 1
                                            continue

                                        if re.match(
                                                r"(?:[\d]&)?[\w]+\s(?:&[\d])?[\w]+",
                                                token,
                                        ):
                                            ppp, copula = token.split(" ")
                                            post = re.match(
                                                r"([\d])&[\w]+", ppp)
                                            if post:
                                                offset += int(post.group(1))
                                                ppp = re.sub(r"[\d]&", "", ppp)
                                                original_length -= 2
                                                enjambed = True
                                            t.text = ppp.lower()
                                            t.startchar = start_char
                                            t.endchar = (start_char +
                                                         len(ppp) + offset)
                                            if mode == "index":
                                                if self.cached:
                                                    self._cache.append(
                                                        copy.copy(t))
                                            yield t
                                            pre = re.search(r"&(\d+?)", copula)
                                            if pre:
                                                start_char += int(pre.group(1))
                                                copula = re.sub(
                                                    r"&\d+?", "", copula)
                                                original_length -= 2
                                                enjambed = True
                                            t.text = copula.lower()
                                            t.startchar = (start_char +
                                                           len(ppp) + 1)
                                            t.endchar = (start_char +
                                                         len(ppp) + 1 +
                                                         len(copula))
                                            if mode == "index":
                                                if self.cached:
                                                    self._cache.append(
                                                        copy.copy(t))
                                            yield t
                                            start_char += original_length
                                            tpos += 1
                                            continue
                                        else:
                                            post = re.match(
                                                r"([\d])&[\w]+", token)
                                            if post:
                                                offset += int(post.group(1))
                                                token = re.sub(
                                                    r"[\d]&", "", token)
                                                original_length -= 2
                                                enjambed = True
                                            else:
                                                offset = 0

                                        is_enclitic = False
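                                        # Enclitic splitting, as in the XML tokenizer above: the
                                        # host word and the enclitic are yielded as separate tokens;
                                        # in "index" mode each sub-token is also copied into the
                                        # analyzer's cache when self.cached is set.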
                                        for enclitic in enclitics:
                                            if token.lower().endswith(
                                                    enclitic):
                                                is_enclitic = True
                                                if enclitic == "ne":
                                                    t.text = (
                                                        token[:-len(enclitic)]
                                                    ).lower()
                                                    t.startchar = start_char
                                                    t.endchar = start_char + (
                                                        len(token) -
                                                        len(enclitic))
                                                    if mode == "index":
                                                        if self.cached:
                                                            self._cache.append(
                                                                copy.copy(t))
                                                    yield t
                                                    t.text = "ne"
                                                    t.startchar = (
                                                        start_char +
                                                        len(token[:-len(
                                                            enclitic)]) +
                                                        offset)
                                                    t.endchar = (
                                                        start_char +
                                                        len(token[:-len(
                                                            enclitic)]) +
                                                        len(enclitic) + offset)
                                                    if mode == "index":
                                                        if self.cached:
                                                            self._cache.append(
                                                                copy.copy(t))
                                                    yield t
                                                elif enclitic == "n":
                                                    t.text = (
                                                        token[:-len(enclitic)]
                                                        + "s").lower()
                                                    t.startchar = start_char
                                                    t.endchar = (
                                                        start_char +
                                                        (len(token) + 1) -
                                                        len(enclitic))
                                                    if mode == "index":
                                                        if self.cached:
                                                            self._cache.append(
                                                                copy.copy(t))
                                                    yield t
                                                    t.text = "ne"
                                                    t.startchar = (
                                                        start_char +
                                                        len(token[:-len(
                                                            enclitic)]) +
                                                        offset)
                                                    t.endchar = (
                                                        start_char +
                                                        len(token[:-len(
                                                            enclitic)]) +
                                                        len(enclitic) + offset)
                                                    if mode == "index":
                                                        if self.cached:
                                                            self._cache.append(
                                                                copy.copy(t))
                                                    yield t
                                                elif enclitic == "st":
                                                    if token.endswith("ust"):
                                                        t.text = (
                                                            token[:-len(
                                                                enclitic) +
                                                                  1]).lower()
                                                        t.startchar = (
                                                            start_char)
                                                        t.endchar = (
                                                            start_char +
                                                            len(token[:-len(
                                                                enclitic) + 1])
                                                            - len(enclitic))
                                                        if mode == "index":
                                                            if self.cached:
                                                                self._cache.append(
                                                                    copy.copy(
                                                                        t))
                                                        yield t
                                                        t.text = "est"
                                                        t.startchar = (
                                                            start_char +
                                                            len(token[:-len(
                                                                enclitic) + 1])
                                                            + offset)
                                                        t.endchar = (
                                                            start_char +
                                                            len(token[:-len(
                                                                enclitic) + 1])
                                                            + len(enclitic) +
                                                            offset)
                                                        if mode == "index":
                                                            if self.cached:
                                                                self._cache.append(
                                                                    copy.copy(
                                                                        t))
                                                        yield t
                                                    else:
                                                        t.text = (token[:-len(
                                                            enclitic)]
                                                                  ).lower()
                                                        t.startchar = (
                                                            start_char)
                                                        t.endchar = (
                                                            start_char +
                                                            len(token[:-len(
                                                                enclitic)]) -
                                                            len(enclitic))
                                                        if mode == "index":
                                                            if self.cached:
                                                                self._cache.append(
                                                                    copy.copy(
                                                                        t))
                                                        yield t
                                                        t.text = "est"
                                                        t.startchar = (
                                                            start_char +
                                                            len(token[:-len(
                                                                enclitic)]) +
                                                            offset)
                                                        t.endchar = (
                                                            start_char +
                                                            len(token[:-len(
                                                                enclitic)]) +
                                                            len(enclitic) +
                                                            offset)
                                                        if mode == "index":
                                                            if self.cached:
                                                                self._cache.append(
                                                                    copy.copy(
                                                                        t))
                                                        yield t
                                                elif enclitic == "'s":
                                                    t.text = (token.lower() +
                                                              "s")
                                                    t.startchar = start_char
                                                    t.endchar = (start_char +
                                                                 len(token))
                                                    if mode == "index":
                                                        if self.cached:
                                                            self._cache.append(
                                                                copy.copy(t))
                                                    yield t
                                                    t.text = "es"
                                                    t.startchar = (start_char +
                                                                   len(token) +
                                                                   1)
                                                    t.endchar = (start_char +
                                                                 len(token) +
                                                                 len(enclitic))
                                                    if mode == "index":
                                                        if self.cached:
                                                            self._cache.append(
                                                                copy.copy(t))
                                                    yield t
                                                else:
                                                    t.text = (
                                                        token[:-len(enclitic)]
                                                    ).lower()
                                                    t.startchar = start_char
                                                    t.endchar = (
                                                        start_char +
                                                        len(token[:-len(
                                                            enclitic)]))
                                                    if mode == "index":
                                                        if self.cached:
                                                            self._cache.append(
                                                                copy.copy(t))
                                                    yield t
                                                    t.text = enclitic
                                                    t.startchar = (
                                                        start_char +
                                                        len(token[:-len(
                                                            enclitic)]) +
                                                        offset)
                                                    t.endchar = (
                                                        start_char +
                                                        len(token[:-len(
                                                            enclitic)]) +
                                                        len(enclitic) + offset)
                                                    if mode == "index":
                                                        if self.cached:
                                                            self._cache.append(
                                                                copy.copy(t))
                                                    yield t
                                                break
                                    else:
                                        is_enclitic = False
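                                        # tokens prefixed with "<digit>&" are enjambed
                                        # forms: add the digit to the character offset,
                                        # strip the two-character marker, and flag the
                                        # token as enjambed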
                                        post = re.match(r"([\d])&[\w]+", token)
                                        if post:
                                            offset += int(post.group(1))
                                            token = re.sub(r"[\d]&", "", token)
                                            original_length -= 2
                                            enjambed = True
                                    if not is_enclitic:
                                        t.text = token
                                        if chars:
                                            t.startchar = start_char + ldiff
                                            t.endchar = (start_char +
                                                         original_length -
                                                         rdiff + offset)
                                        if mode == "index":
                                            if self.cached:
                                                self._cache.append(
                                                    copy.copy(t))
                                        yield t
                                        tpos += 1
                                    if enjambed:
                                        start_char += original_length + offset
                                    else:
                                        start_char += original_length
                            start_char += 1  # \n
Example no. 6
0
    def __call__(self,
                 value,
                 positions=True,
                 chars=True,
                 keeporiginal=True,
                 removestops=True,
                 tokenize=True,
                 start_pos=0,
                 start_char=0,
                 mode="",
                 **kwargs):
        if (kwargs.get("docix", None) == self._docix
                and self._cache is not None):
            yield from self.cache
        else:
            t = CylleneusToken(positions,
                               chars,
                               removestops=removestops,
                               mode=mode,
                               **kwargs)
            if t.mode == "query":
                t.original = t.text = value
                yield t
            else:
                if not tokenize:
                    text = "\n".join([el for el in flatten(value["text"])])
                    t.original = t.text = text
                    t.boost = 1.0
                    if positions:
                        t.pos = start_pos
                    if chars:
                        t.startchar = start_char
                        t.endchar = start_char + len(text)
                    yield t
                else:
                    self._cache = []
                    self._docix = kwargs.get("docix", None)

                    tokenizer = word_tokenize
                    stopchars = string.punctuation

                    doc = value["text"]
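                    # the citation hierarchy (outermost division first), taken from
                    # the TEI header's cRefPattern elements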
                    divs = [
                        cref.get("n") for cref in reversed(
                            doc.findall(
                                ".//{http://www.tei-c.org/ns/1.0}cRefPattern"))
                    ]

                    ss = doc.iter("{http://www.tei-c.org/ns/1.0}s")

                    sect_sent = 0
                    sect_pos = 0
                    for n, s in enumerate(ss):
                        meta = {
                            "meta": "-".join(divs),
                            "sent_id": s.get(
                                "{http://www.w3.org/XML/1998/namespace}id"
                            ),
                            "sect_sent": sect_sent,
                            "alignment": s.get("n"),
                        }

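                        # walk up the enclosing "textpart" divs to recover the
                        # section references for this sentence, resetting the
                        # per-section counters whenever the section changes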
                        el = s
                        j = 0
                        while el is not None:
                            if el.getparent() is not None:
                                if (el.getparent().get("type",
                                                       None) == "textpart"):
                                    j -= 1
                                    if (divs[j] in meta
                                            and el.getparent().get("n") !=
                                            meta[divs[j]]):
                                        sect_sent = 0
                                        sect_pos = 0
                                    meta[divs[j]] = el.getparent().get("n")
                            el = el.getparent()

                        text = stringify(s)

                        sent_pos = 0
                        for i, token in enumerate(tokenizer(text)):
                            if token == " " or token in stopchars:
                                sect_pos += 1
                                sent_pos += 1
                                continue

                            t.boost = 1.0
                            if keeporiginal:
                                t.original = token
                            t.stopped = False

                            meta["sent_pos"] = sent_pos
                            meta["sect_pos"] = sect_pos
                            if positions:
                                t.pos = start_pos + sect_pos

                            length = len(token)

                            token = token.strip()
                            if not token:
                                start_char += length
                                continue

                            t.meta = copy.deepcopy(meta)

                            t.text = token
                            if chars:
                                t.startchar = start_char
                                t.endchar = start_char + length
                            if mode == "index":
                                self._cache.append(copy.deepcopy(t))
                            yield t

                            start_char += length
                            sect_pos += 1
                            sent_pos += 1
                        sect_sent += 1
                        start_char += 1
Example no. 7
0
    def __call__(
        self,
        data,
        positions=True,
        chars=True,
        keeporiginal=True,
        removestops=True,
        tokenize=True,
        start_pos=0,
        start_char=0,
        mode="",
        **kwargs,
    ):
        if kwargs.get("docix", None) == self._docix and self._cache:
            yield from self.cache
        else:
            t = CylleneusToken(
                positions, chars, removestops=removestops, mode=mode, **kwargs
            )

            if t.mode == "query":
                t.original = data
                t.text = data.translate(jvmap)
                yield t
            else:
                self._cache = []
                self._docix = kwargs.get("docix", None)

                if not tokenize:
                    t.original = ""
                    for el in data["text"].find("text").find("body").iter():
                        if el.tag in ["head", "p", "l"]:
                            if not el.text:
                                text = "".join(
                                    [
                                        subel.text + subel.tail
                                        for subel in el.iter()
                                        if subel.tag != el.tag
                                    ]
                                )
                            else:
                                text = el.text
                            subs = [
                                r"<note>(.*?)</note>",
                                r'<sic corr="(\w+?)">\w+?</sic>',
                                r'<reg orig="\w+?">(\w+?)</reg>',
                            ]
                            for sub in subs:
                                text = re.sub(sub, r"\1", text)
                            t.original += text
                    t.text = t.original
                    t.boost = 1.0
                    if positions:
                        t.pos = start_pos
                    if chars:
                        t.startchar = start_char
                        t.endchar = start_char + len(t.original)
                    yield t
                else:
                    punc = str.maketrans("", "", string.punctuation)

                    tags = data["meta"].lower()
                    meta = {"meta": tags}
                    if tags != "-":
                        divs = data["meta"].split("-")
                        meta.update({div: "-" for div in divs})

                    sect_sent = 0
                    sect_pos = 0
                    sent_id = 0
                    pos = 0

                    for el in (
                        data["text"]
                            .find("{http://www.tei-c.org/ns/1.0}text")
                            .find("{http://www.tei-c.org/ns/1.0}body")
                            .findall(".//{http://www.tei-c.org/ns/1.0}*")
                    ):
                        if el.tag == "{http://www.tei-c.org/ns/1.0}milestone":
                            meta[el.get("unit")] = el.get("n", "-")
                            sect_sent = 0
                            sect_pos = 0
                        elif (
                            el.tag == "{http://www.tei-c.org/ns/1.0}div"
                            and el.get("n")
                        ):
                            meta[el.get("type")] = el.get("n", "-")
                            sect_sent = 0
                            sect_pos = 0

                        if not el.text:
                            text = el.tail if el.tail else ""
                        else:
                            text = el.text + (el.tail if el.tail else "")
                        subs = [
                            (r"<supplied>(.*?)</supplied>", r"\1"),
                            (r'<quote type="\w+?">(.+?)</quote>', r"\1"),
                            (r'<hi rend="\w+?">(.+?)</hi>', r"\1"),
                            (r'<g ref="\w+?">(.+?)</g>', r"\1"),
                            (r'<foreign xml:lang="\w+?">(\w+?)</foreign>', r"\1"),
                            (r"<del>.+?</del>", ""),
                        ]
                        for old, new in subs:
                            text = re.sub(old, new, text)

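                        # sentence-tokenize the cleaned text, then word-tokenize
                        # each sentence, keeping per-sentence and per-section
                        # counters for the token metadata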
                        if text:
                            for sentence in sent_tokenizer.tokenize(text):
                                sent_id += 1
                                sect_sent += 1

                                sentence = sentence.strip()
                                replacements = [(r"\n", ""), (r"\s+", " ")]
                                for old, new in replacements:
                                    sentence = re.sub(old, new, sentence)

                                sent_pos = 0
                                tokens = word_tokenizer.word_tokenize(sentence)

                                for token in tokens:
                                    token = (
                                        token.translate(punc)
                                            .lower()
                                            .translate(jvmap)
                                            .strip()
                                    )

                                    if not token or token in string.whitespace:
                                        start_char += 1
                                        continue
                                    else:
                                        pos += 1
                                        sect_pos += 1
                                        sent_pos += 1

                                        t.text = token
                                        t.boost = 1.0

                                        meta["sent_id"] = sent_id
                                        meta["sent_pos"] = sent_pos
                                        meta["sect_sent"] = sect_sent
                                        meta["sect_pos"] = sect_pos

                                        t.meta = copy.copy(meta)

                                        if keeporiginal:
                                            t.original = token
                                        t.stopped = False
                                        if positions:
                                            t.pos = start_pos + pos

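                                        # split off enclitics (e.g. -ne, -st): the host
                                        # word and the enclitic (or its expanded form)
                                        # are yielded as separate tokens, each with its
                                        # own character offsets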
                                        is_enclitic = False
                                        for enclitic in enclitics:
                                            if (
                                                token.endswith(enclitic)
                                                and token
                                                not in latin_exceptions
                                            ):
                                                if enclitic == "ne":
                                                    t.text = token[: -len(enclitic)]
                                                    t.startchar = start_char
                                                    t.endchar = start_char + (
                                                        len(token) - len(enclitic)
                                                    )
                                                    if mode == "index":
                                                        self._cache.append(
                                                            copy.deepcopy(t)
                                                        )
                                                    yield t
                                                    t.text = "ne"
                                                    t.startchar = start_char + len(
                                                        token[: -len(enclitic)]
                                                    )
                                                    t.endchar = (
                                                        start_char
                                                        + len(token[: -len(enclitic)])
                                                        + len(enclitic)
                                                    )
                                                    if mode == "index":
                                                        self._cache.append(
                                                            copy.deepcopy(t)
                                                        )
                                                    yield t
                                                elif enclitic == "n":
                                                    t.text = token[: -len(enclitic)] + "s"
                                                    t.startchar = start_char
                                                    t.endchar = (
                                                        start_char + len(token) - len(enclitic)
                                                    )
                                                    if mode == "index":
                                                        self._cache.append(
                                                            copy.deepcopy(t)
                                                        )
                                                    yield t
                                                    t.text = "ne"
                                                    t.startchar = start_char + len(
                                                        token[: -len(enclitic)]
                                                    )
                                                    t.endchar = (
                                                        start_char
                                                        + len(token[: -len(enclitic)])
                                                        + len(enclitic)
                                                    )
                                                    if mode == "index":
                                                        self._cache.append(
                                                            copy.deepcopy(t)
                                                        )
                                                    yield t
                                                elif enclitic == "st":
                                                    if token.endswith("ust"):
                                                        t.text = token[
                                                                 : -len(enclitic)
                                                                 ]
                                                        t.startchar = (
                                                            start_char
                                                        )
                                                        t.endchar = (
                                                            start_char
                                                            + len(
                                                            token[
                                                            : -len(
                                                                enclitic
                                                            )
                                                            ]
                                                        )
                                                            - len(enclitic)
                                                        )
                                                        if mode == "index":
                                                            self._cache.append(
                                                                copy.deepcopy(
                                                                    t
                                                                )
                                                            )
                                                        yield t
                                                        t.text = "est"
                                                        t.startchar = (
                                                            start_char
                                                            + len(
                                                            token[
                                                            : -len(
                                                                enclitic
                                                            )
                                                            ]
                                                        )
                                                        )
                                                        t.endchar = (
                                                            start_char
                                                            + len(
                                                            token[
                                                            : -len(
                                                                enclitic
                                                            )
                                                            ]
                                                        )
                                                            + len(enclitic)
                                                        )
                                                        if mode == "index":
                                                            self._cache.append(
                                                                copy.deepcopy(
                                                                    t
                                                                )
                                                            )
                                                        yield t
                                                    else:
                                                        t.text = token[
                                                                 : -len(enclitic)
                                                                 ]
                                                        t.startchar = (
                                                            start_char
                                                        )
                                                        t.endchar = (
                                                            start_char
                                                            + len(
                                                            token[
                                                            : -len(
                                                                enclitic
                                                            )
                                                            ]
                                                        )
                                                            - len(enclitic)
                                                        )
                                                        if mode == "index":
                                                            self._cache.append(
                                                                copy.deepcopy(
                                                                    t
                                                                )
                                                            )
                                                        yield t
                                                        t.text = "est"
                                                        t.startchar = (
                                                            start_char
                                                            + len(
                                                            token[
                                                            : -len(
                                                                enclitic
                                                            )
                                                            ]
                                                        )
                                                        )
                                                        t.endchar = (
                                                            start_char
                                                            + len(
                                                            token[
                                                            : -len(
                                                                enclitic
                                                            )
                                                            ]
                                                        )
                                                            + len(enclitic)
                                                        )
                                                        if mode == "index":
                                                            self._cache.append(
                                                                copy.deepcopy(
                                                                    t
                                                                )
                                                            )
                                                        yield t
                                                elif enclitic == "'s":
                                                    t.text = token + "s"
                                                    t.startchar = start_char
                                                    t.endchar = (
                                                        start_char + len(token)
                                                    )
                                                    if mode == "index":
                                                        self._cache.append(
                                                            copy.deepcopy(t)
                                                        )
                                                    yield t
                                                    t.text = "es"
                                                    t.startchar = (
                                                        start_char
                                                        + len(token)
                                                        + 1
                                                    )
                                                    t.endchar = (
                                                        start_char
                                                        + len(token)
                                                        + len(enclitic)
                                                    )
                                                    if mode == "index":
                                                        self._cache.append(
                                                            copy.deepcopy(t)
                                                        )
                                                    yield t
                                                else:
                                                    t.text = token[: -len(enclitic)]
                                                    t.startchar = start_char
                                                    t.endchar = start_char + len(
                                                        token[: -len(enclitic)]
                                                    )
                                                    if mode == "index":
                                                        self._cache.append(
                                                            copy.deepcopy(t)
                                                        )
                                                    yield t
                                                    t.text = enclitic
                                                    t.startchar = start_char + len(
                                                        token[: -len(enclitic)]
                                                    )
                                                    t.endchar = (
                                                        start_char
                                                        + len(token[: -len(enclitic)])
                                                        + len(enclitic)
                                                    )
                                                    if mode == "index":
                                                        self._cache.append(
                                                            copy.deepcopy(t)
                                                        )
                                                    yield t
                                                is_enclitic = True
                                                break

                                        if not is_enclitic:
                                            original_len = len(token)
                                            if chars:
                                                t.startchar = start_char
                                                t.endchar = (
                                                    start_char + original_len
                                                )
                                            if self.cached:
                                                self._cache.append(
                                                    copy.copy(t)
                                                )
                                            yield t

                                        start_char += len(token)
Example no. 8
0
    def __call__(
        self,
        data,
        positions=True,
        chars=True,
        keeporiginal=True,
        removestops=True,
        tokenize=True,
        start_pos=0,
        start_char=0,
        mode="",
        **kwargs,
    ):
        if kwargs.get("docix", None) == self._docix and self._cache:
            yield from self.cache
        else:
            t = CylleneusToken(positions,
                               chars,
                               removestops=removestops,
                               mode=mode,
                               **kwargs)

            if t.mode == "query":
                t.original = data
                t.text = data.translate(jvmap)
                yield t
            else:
                self._cache = []
                self._docix = kwargs.get("docix", None)

                if tokenize:
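                    # treebank XML: iterate sentences and their <word> elements,
                    # skipping artificial tokens and bare punctuation lemmata, and
                    # converting the AGLDT postag with agldt2wn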
                    for sentence in data["text"].iter("sentence"):
                        for pos, token in enumerate(sentence.iter("word")):
                            if token.get("artificial", False):
                                continue
                            form = token.get("form")
                            if form:
                                form = form.replace(" ", " ").replace(" ", " ")
                                form = re.sub(r"\.([^ ]|^$)", r". \1", form)
                            else:
                                continue
                            lemma = token.get("lemma", None)
                            if not lemma or lemma in (
                                    ".",
                                    ",",
                                    "punc1",
                                    "comma1",
                                    "PERIOD1",
                            ):
                                continue
                            t.lemma = lemma.strip("0123456789")
                            t.morpho = agldt2wn(token.get("postag"))
                            t.morphosyntax = token.get("relation", None)
                            t.boost = 1.0

                            meta = {"meta": data["meta"].lower()}
                            divs = data["meta"].split("-")
                            for i, div in enumerate(divs):
                                if len(divs) <= 2 or div != "line":
                                    meta[div] = sentence.get("subdoc").split(
                                        ".")[i]
                            meta["sent_id"] = sentence.get("id")
                            meta["sent_pos"] = token.get("id")
                            t.meta = meta

                            if keeporiginal:
                                t.original = f"{form}"
                            t.stopped = False
                            if positions:
                                t.pos = copy.copy(start_pos + pos)
                            original_len = len(form)

                            if (form.istitle() and pos == 0
                                    and not t.lemma.istitle()):
                                form = form.lower()
                            t.text = form
                            if chars:
                                t.startchar = copy.copy(start_char)
                                t.endchar = copy.copy(start_char +
                                                      original_len)
                            if self.cached:
                                self._cache.append(copy.deepcopy(t))
                            yield t

                            if form in editorial:
                                t.text = editorial[form]
                                if self.cached:
                                    self._cache.append(copy.copy(t))
                                yield t
                            start_char += len(form)
                else:
                    t.original = ""
                    for token in data.iter("token"):
                        form = token.get("form")
                        if not form:
                            continue
                        t.original += f"{form}"
                    t.text = t.original
                    t.boost = 1.0
                    if positions:
                        t.pos = start_pos
                    if chars:
                        t.startchar = start_char
                        t.endchar = start_char + len(t.original)
                    yield t
Example no. 9
0
    def __call__(
        self,
        data,
        positions=True,
        chars=True,
        keeporiginal=True,
        removestops=True,
        tokenize=True,
        start_pos=0,
        start_char=0,
        mode="",
        **kwargs,
    ):
        if kwargs.get("docix", None) == self._docix and self._cache:
            yield from self.cache
        else:
            t = CylleneusToken(
                positions, chars, removestops=removestops, mode=mode, **kwargs
            )

            if t.mode == "query":
                t.original = data
                t.text = data.translate(jvmap)
                yield t
            else:
                if not tokenize:
                    t.original = ""
                    for token in data.iter("token"):
                        form = token.get("form")
                        if not form:
                            continue
                        after = token.get("presentation-after", "")
                        before = token.get("presentation-before", "")
                        t.original += f"{before}{form}{after}"
                    t.text = t.original
                    t.boost = 1.0
                    if positions:
                        t.pos = start_pos
                    if chars:
                        t.startchar = start_char
                        t.endchar = start_char + len(t.original)
                    yield t
                else:
                    self._cache = []
                    self._docix = kwargs.get("docix", None)

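                    # PROIEL-style XML: each <token> carries form, lemma,
                    # morphology, and "presentation" text before/after the form,
                    # which is used to compute the character offsets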
                    for sentence in data["text"].iter("sentence"):
                        for pos, token in enumerate(sentence.iter("token")):
                            form = token.get("form")
                            if not form:
                                continue
                            else:
                                form = form.replace(" ", " ").replace(" ", " ")
                                form = re.sub(r"\.([^ ]|^$)", r". \1", form)
                            t.lemma = token.get("lemma")
                            t.morpho = proiel2wn(
                                token.get("part-of-speech"),
                                token.get("morphology"),
                            )
                            t.morphosyntax = token.get("relation", None)
                            t.boost = 1.0

                            meta = {"meta": data["meta"].lower()}
                            for i, div in enumerate(data["meta"].split("-")):
                                meta[div] = token.get("citation-part").split(
                                    "."
                                )[i]
                            meta["sent_id"] = sentence.get("id")
                            meta["sent_pos"] = token.get("id")
                            t.meta = meta

                            before = token.get("presentation-before", "")
                            after = token.get("presentation-after", "")

                            if keeporiginal:
                                t.original = f"{before}{form}{after}"
                            t.stopped = False
                            if positions:
                                t.pos = start_pos + pos
                            original_len = len(form)

                            if (
                                form.istitle()
                                and pos == 0
                                and not t.lemma.istitle()
                            ):
                                form = form.lower()
                            t.text = form
                            if chars:
                                t.startchar = start_char + len(before)
                                t.endchar = (
                                    start_char + len(before) + original_len
                                )
                            self._cache.append(copy.deepcopy(t))
                            yield t

                            if form in editorial:
                                t.text = editorial[form]
                                self._cache.append(copy.deepcopy(t))
                                yield t
                            start_char += len(before) + len(form) + len(after)
Example no. 10
0
    def __call__(
        self,
        value,
        positions=False,
        chars=False,
        keeporiginal=False,
        removestops=True,
        start_pos=0,
        start_char=0,
        tokenize=True,
        mode="",
        **kwargs
    ):
        """
        :param value: The unicode string to tokenize.
        :param positions: Whether to record token positions in the token.
        :param chars: Whether to record character offsets in the token.
        :param start_pos: The position number of the first token. For example,
            if you set start_pos=2, the tokens will be numbered 2,3,4,...
            instead of 0,1,2,...
        :param start_char: The offset of the first character of the first
            token. For example, if you set start_char=2, the text "aaa bbb"
            will have chars (2,5),(6,9) instead of (0,3),(4,7).
        :param tokenize: if True, the text should be tokenized.
        """

        assert isinstance(value, text_type), "%s is not unicode" % repr(value)

        t = CylleneusToken(
            positions, chars, removestops=removestops, mode=mode, **kwargs
        )
        if not tokenize:
            t.original = t.text = value
            t.boost = 1.0
            if positions:
                t.pos = start_pos
            if chars:
                t.startchar = start_char
                t.endchar = start_char + len(value)
            yield t
        elif not self.gaps:
            # The default: expression matches are used as tokens
            for pos, match in enumerate(self.expression.finditer(value)):
                t.text = match.group(0)
                t.boost = 1.0
                if keeporiginal:
                    t.original = t.text
                t.stopped = False
                if positions:
                    t.pos = start_pos + pos
                if chars:
                    t.startchar = start_char + match.start()
                    t.endchar = start_char + match.end()
                yield t
        else:
            # When gaps=True, iterate through the matches and
            # yield the text between them.
            prevend = 0
            pos = start_pos
            for match in self.expression.finditer(value):
                start = prevend
                end = match.start()
                text = value[start:end]
                if text:
                    t.text = text
                    t.boost = 1.0
                    if keeporiginal:
                        t.original = t.text
                    t.stopped = False
                    if positions:
                        t.pos = pos
                        pos += 1
                    if chars:
                        t.startchar = start_char + start
                        t.endchar = start_char + end

                    yield t

                prevend = match.end()

            # If the last "gap" was before the end of the text,
            # yield the last bit of text as a final token.
            if prevend < len(value):
                t.text = value[prevend:]
                t.boost = 1.0
                if keeporiginal:
                    t.original = t.text
                t.stopped = False
                if positions:
                    t.pos = pos
                if chars:
                    t.startchar = prevend
                    t.endchar = len(value)
                yield t
Example no. 11
0
    def __call__(
        self,
        value,
        positions=False,
        chars=False,
        keeporiginal=False,
        removestops=True,
        start_pos=0,
        start_char=0,
        tokenize=True,
        mode="",
        **kwargs
    ):
        """
        :param value: The unicode string to tokenize.
        :param positions: Whether to record token positions in the token.
        :param chars: Whether to record character offsets in the token.
        :param start_pos: The position number of the first token. For example,
            if you set start_pos=2, the tokens will be numbered 2,3,4,...
            instead of 0,1,2,...
        :param start_char: The offset of the first character of the first
            token. For example, if you set start_char=2, the text "aaa bbb"
            will have chars (2,5),(6,9) instead of (0,3),(4,7).
        :param tokenize: if True, the text should be tokenized.
        """

        assert isinstance(value, text_type), "%r is not unicode" % value

        t = CylleneusToken(
            positions, chars, removestops=removestops, mode=mode, **kwargs
        )
        if not tokenize:
            t.original = t.text = value
            t.boost = 1.0
            if positions:
                t.pos = start_pos
            if chars:
                t.startchar = start_char
                t.endchar = start_char + len(value)
            yield t
        else:
            text = u("")
            charmap = self.charmap
            pos = start_pos
            startchar = currentchar = start_char
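            # map each character through self.charmap; characters that map to an
            # empty value act as token boundaries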
            for char in value:
                tchar = charmap[ord(char)]
                if tchar:
                    text += tchar
                else:
                    if currentchar > startchar:
                        t.text = text
                        t.boost = 1.0
                        if keeporiginal:
                            t.original = t.text
                        if positions:
                            t.pos = pos
                            pos += 1
                        if chars:
                            t.startchar = startchar
                            t.endchar = currentchar
                        yield t
                    startchar = currentchar + 1
                    text = u("")

                currentchar += 1

            if currentchar > startchar:
                t.text = value[startchar:currentchar]
                t.boost = 1.0
                if keeporiginal:
                    t.original = t.text
                if positions:
                    t.pos = pos
                if chars:
                    t.startchar = startchar
                    t.endchar = currentchar
                yield t
Example no. 12
0
    def __call__(self,
                 value: dict,
                 positions=False,
                 chars=False,
                 keeporiginal=True,
                 removestops=True,
                 tokenize=True,
                 start_pos=0,
                 start_char=0,
                 mode="",
                 **kwargs):
        if kwargs.get("docix", None) == self._docix and self._cache:
            yield from self.cache
        else:
            t = CylleneusToken(positions,
                               chars,
                               removestops=removestops,
                               mode=mode,
                               **kwargs)

            if t.mode == "query":
                t.original = value
                t.text = value.translate(jvmap)
                yield t
            else:
                if not tokenize:
                    t.original = t.text = "\n".join(
                        [el for el in value["text"]])
                    t.boost = 1.0
                    if positions:
                        t.pos = start_pos
                    if chars:
                        t.startchar = start_char
                        t.endchar = start_char + len(t.original)
                    yield t
                else:
                    self._cache = []
                    self._docix = kwargs.get("docix", None)

                    punctuation = str.maketrans("", "", string.punctuation)
                    editorial = str.maketrans("", "", "[{(<>)}]")
                    added = re.compile(r"(\s?[<(][\w .]+[>)]\s?)")

                    t.boost = 1.0
                    t.pos = t.startchar = t.endchar = 0

                    sect_sent = 0  # sentence count within passage
                    sent_id = "0001"
                    sect_pos = 0  # word pos within passage
                    sent_pos = 0  # word pos within sentence
                    current_refs = tuple(["0"] * len(value["meta"]))
                    nflag = None
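                    # each BPN line is parsed into form, lemma, morphology, and
                    # citation refs; the form_code distinguishes ordinary forms
                    # ("&", "+"), combined forms ("@"), and enclitic -que ("=")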
                    for pos, line in enumerate(value["text"]):
                        t.pos = pos

                        parsed = parse_bpn(line)

                        if not parsed:
                            continue

                        if int(parsed["sent_id"]) > int(sent_id):
                            sent_pos = 0
                            sent_id = parsed["sent_id"]
                            if (tuple([
                                    alnum(i) for i in parsed["refs"].split(",")
                            ]) > current_refs):
                                sect_sent = 1
                                sect_pos = 0
                            else:
                                sect_sent += 1

                        if keeporiginal:
                            if added.search(parsed["form"]):
                                t.original = added.sub("", parsed["form"])
                            else:
                                t.original = parsed["form"]
                        t.stopped = False

                        if parsed["form_code"] in "&+":
                            if parsed["lemma"] != "#":
                                if parsed["lemma"] == "_SVM":
                                    t.morpho = None
                                    t.lemma = parsed["lemma"]
                                    t.lemma_n = parsed["lemma_n"]
                                    t.original = added.sub("", parsed["form"])
                                    t.text = parsed["form"].translate(
                                        editorial)
                                else:
                                    form = parsed["form"]
                                    t.morpho = parsed["morpho"]

                                    if " " in form:
                                        t.original = added.sub("", form)
                                        text = form.translate(editorial)
                                    else:
                                        t.original = form
                                        text = form
                                    t.lemma = parsed["lemma"]
                                    t.lemma_n = parsed["lemma_n"]
                                    if added.search(parsed["form"]):
                                        t.original = added.sub(
                                            "", parsed["form"])
                                    t.text = text.translate(editorial)
                                    nflag = False
                            else:
                                # could be a Greek form, do we index it?
                                t.morpho = ""
                                t.lemma = ""
                                t.lemma_n = ""
                                t.original = added.sub("", parsed["form"])
                                t.text = parsed["form"].translate(editorial)
                        elif parsed["form_code"] == "@":  # combined forms
                            if parsed["lemma"] != "#":
                                t.lemma = parsed["lemma"]
                                t.lemma_n = parsed["lemma_n"]
                                t.text = parsed["form"].translate(editorial)
                                t.morpho = parsed["morpho"]
                                if nflag:
                                    sect_pos -= 1
                                    sent_pos -= 1
                                else:
                                    nflag = True
                            else:
                                sent_pos += 1
                                sect_pos += 1
                                continue
                        elif parsed["form_code"] == "=":  # que
                            t.text = parsed["form"].translate(editorial)
                            t.lemma = parsed["lemma"]
                            t.lemma_n = parsed["lemma_n"]
                            t.morpho = parsed["morpho"]
                            sent_pos -= 1
                            sect_pos -= 1
                            nflag = False
                        meta = {"meta": value["meta"].lower()}
                        tags = value["meta"].split("-")
                        divs = {i: div.lower() for i, div in enumerate(tags)}
                        refs = tuple([
                            ref.translate(punctuation)
                            for ref in parsed["refs"].strip().split(",")
                        ])
                        for i in range(len(divs)):
                            meta[divs[i]] = refs[i]

                        current_refs = refs

                        t.morphosyntax = parsed["subord"]

                        meta["sect_sent"] = str(sect_sent)
                        meta["sect_pos"] = str(sect_pos)
                        meta["sent_id"] = parsed["sent_id"]
                        meta["sent_pos"] = str(sent_pos)
                        t.meta = meta
                        t.startchar = start_char
                        t.endchar = start_char + len(t.original)

                        if t.text != t.original:
                            tc = copy.deepcopy(t)
                            tc.text = t.original
                            yield tc

                        yield t
                        sent_pos += 1
                        sect_pos += 1
                        start_char += len(t.original) + 1
Example no. 13
0
    def __call__(
        self,
        value,
        positions=True,
        chars=True,
        keeporiginal=True,
        removestops=True,
        tokenize=True,
        start_pos=0,
        start_char=0,
        mode="",
        **kwargs,
    ):
        if kwargs.get("docix", None) == self._docix and self._cache:
            yield from self.cache
        else:
            t = CylleneusToken(positions,
                               chars,
                               removestops=removestops,
                               mode=mode,
                               **kwargs)

            if t.mode == "query":
                t.original = t.text = value.translate(jvmap)
                yield t
            elif t.mode == "index":
                if not tokenize:
                    t.original = t.text = "\n".join(
                        [el for el in flatten(value["text"])])
                    t.boost = 1.0
                    if positions:
                        t.pos = start_pos
                    if chars:
                        t.startchar = start_char
                        t.endchar = start_char + len(t.original)
                    yield t
                else:
                    self._cache = []
                    self._docix = kwargs.get("docix", None)

                    tokenizer = PunktLatinCharsVars()
                    stopchars = str.maketrans("", "",
                                              string.punctuation + "“”—\n")

                    divs = {
                        i: div.lower()
                        for i, div in enumerate(value["meta"].split("-"))
                    }

                    sect_sent = 0
                    prev_sect = 0
                    sect_pos = 0
                    for i, (path, text) in enumerate(
                            nested_dict_iter(value["text"])):
                        sent_id = i
                        if len(path) >= 2 and int(path[-2]) > prev_sect:
                            sect_sent = 0
                            sect_pos = 0
                            prev_sect = int(path[-2])
                        tokens = []

                        temp_tokens = tokenizer.word_tokenize(text)
                        if temp_tokens:
                            if (temp_tokens[0].replace("j",
                                                       "i").replace("v", "u")
                                    not in proper_names.proper_names):
                                # assumed intent: lowercase a sentence-initial
                                # token that is not a proper name
                                temp_tokens[0] = temp_tokens[0].lower()

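                            # If a perfect passive participle is followed, two
                            # tokens later (past the whitespace token), by a
                            # matching form of the copula, merge the two into a
                            # single compound token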
                            for ix, token in enumerate(temp_tokens):
                                ppp = compound.is_ppp(token)
                                if ppp and ix < len(temp_tokens) - 2:
                                    copula = compound.is_copula(
                                        temp_tokens[ix + 2])  # whitespace
                                    if copula and ppp[1] == copula[2]:
                                        tense, mood, number, i = copula
                                        token = f"{token} {compound.copula[tense][mood][number][i]}"
                                        del temp_tokens[ix + 1:ix + 3]
                                        tokens.insert(ix, token)
                                    else:
                                        tokens.append(token)
                                else:
                                    tokens.append(token)

                        pos = 0
                        sent_pos = 0
                        for token in tokens:
                            meta = {"meta": value["meta"].lower()}
                            for i in range(len(divs)):
                                meta[divs[i]] = str(int(path[i]) + 1)

                            t.boost = 1.0
                            if keeporiginal:
                                t.original = token
                            t.stopped = False
                            token = convert_diphthongs(
                                strip_diacritics(token)).translate(jvmap)

                            if (token in (" ", "\n") or token in punctuation
                                    or token in stopchars):
                                pos -= 1
                            else:
                                pos += 2
                            if positions:
                                t.pos = start_pos + pos
                            original_length = len(token)

                            token = token.strip()
                            ltoken = token.lstrip(string.punctuation)
                            ldiff = original_length - len(ltoken)
                            if ldiff != 0:
                                token = ltoken
                            rtoken = token.rstrip(string.punctuation)
                            rdiff = len(token) - len(rtoken)
                            if rdiff != 0:
                                token = rtoken
                            ntoken = token.translate(stopchars)
                            ndiff = len(token) - len(ntoken)
                            if ndiff:
                                token = ntoken
                            if not token:
                                start_char += original_length
                                continue

                            meta["sect_sent"] = sect_sent
                            meta["sect_pos"] = sect_pos
                            meta["sent_id"] = sent_id
                            meta["sent_pos"] = sent_pos
                            t.meta = meta

                            is_enclitic = False
                            if token not in exceptions:
                                if t.original in replacements:
                                    for subtoken in replacements[t.original]:
                                        t.text = subtoken
                                        t.startchar = start_char
                                        t.endchar = (start_char +
                                                     original_length)
                                        if mode == "index":
                                            self._cache.append(
                                                copy.deepcopy(t))
                                        yield t
                                    start_char += original_length
                                    continue

                                if re.match(r"(?:\w+) (?:\w+)", token):
                                    ppp, copula = token.split(" ")
                                    t.text = ppp
                                    t.startchar = start_char
                                    t.endchar = start_char + len(ppp) + 1
                                    if mode == "index":
                                        self._cache.append(copy.deepcopy(t))
                                    yield t
                                    t.text = copula
                                    t.startchar = start_char + len(ppp)
                                    t.endchar = (start_char + len(ppp) +
                                                 len(copula))
                                    if mode == "index":
                                        self._cache.append(copy.deepcopy(t))
                                    yield t
                                    start_char += original_length
                                    continue

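                                # Emit a trailing enclitic as its own token after
                                # the host word, re-expanding contracted forms
                                # (e.g. -n -> ...s + ne, -st -> est)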
                                for enclitic in enclitics:
                                    if token.endswith(enclitic):
                                        if enclitic == "ne":
                                            t.text = token[:-len(enclitic)]
                                            t.startchar = start_char
                                            t.endchar = start_char + (
                                                len(token) - len(enclitic))
                                            if mode == "index":
                                                self._cache.append(
                                                    copy.deepcopy(t))
                                            yield t
                                            t.text = "ne"
                                            t.startchar = start_char + len(
                                                token[:-len(enclitic)])
                                            t.endchar = (
                                                start_char +
                                                len(token[:-len(enclitic)]) +
                                                len(enclitic))
                                            if mode == "index":
                                                self._cache.append(
                                                    copy.deepcopy(t))
                                            yield t
                                        elif enclitic == "n":
                                            t.text = (token[:-len(enclitic)] +
                                                      "s")
                                            t.startchar = start_char
                                            t.endchar = (start_char +
                                                         len(token) -
                                                         len(enclitic))
                                            if mode == "index":
                                                self._cache.append(
                                                    copy.deepcopy(t))
                                            yield t
                                            t.text = "ne"
                                            t.startchar = start_char + len(
                                                token[:-len(enclitic)])
                                            t.endchar = (
                                                start_char +
                                                len(token[:-len(enclitic)]) +
                                                len(enclitic))
                                            if mode == "index":
                                                self._cache.append(
                                                    copy.deepcopy(t))
                                            yield t
                                        elif enclitic == "st":
                                            if token.endswith("ust"):
                                                t.text = token[:-len(enclitic)]
                                                t.startchar = start_char
                                                t.endchar = (start_char + len(
                                                    token[:-len(enclitic)]) -
                                                             len(enclitic))
                                                if mode == "index":
                                                    self._cache.append(
                                                        copy.deepcopy(t))
                                                yield t
                                                t.text = "est"
                                                t.startchar = start_char + len(
                                                    token[:-len(enclitic)])
                                                t.endchar = (start_char + len(
                                                    token[:-len(enclitic)]) +
                                                             len(enclitic))
                                                if mode == "index":
                                                    self._cache.append(
                                                        copy.deepcopy(t))
                                                yield t
                                            else:
                                                t.text = token[:-len(enclitic)]
                                                t.startchar = start_char
                                                t.endchar = (start_char + len(
                                                    token[:-len(enclitic)]) -
                                                             len(enclitic))
                                                if mode == "index":
                                                    self._cache.append(
                                                        copy.deepcopy(t))
                                                yield t
                                                t.text = "est"
                                                t.startchar = start_char + len(
                                                    token[:-len(enclitic)])
                                                t.endchar = (start_char + len(
                                                    token[:-len(enclitic)]) +
                                                             len(enclitic))
                                                if mode == "index":
                                                    self._cache.append(
                                                        copy.deepcopy(t))
                                                yield t
                                        elif enclitic == "'s":
                                            t.text = token + "s"
                                            t.startchar = start_char
                                            t.endchar = start_char + len(token)
                                            if mode == "index":
                                                self._cache.append(
                                                    copy.deepcopy(t))
                                            yield t
                                            t.text = "es"
                                            t.startchar = (start_char +
                                                           len(token) + 1)
                                            t.endchar = (start_char +
                                                         len(token) +
                                                         len(enclitic))
                                            if mode == "index":
                                                self._cache.append(
                                                    copy.deepcopy(t))
                                            yield t
                                        else:
                                            t.text = token[:-len(enclitic)]
                                            t.startchar = start_char
                                            t.endchar = start_char + len(
                                                token[:-len(enclitic)])
                                            if mode == "index":
                                                self._cache.append(
                                                    copy.deepcopy(t))
                                            yield t
                                            t.text = enclitic
                                            t.startchar = start_char + len(
                                                token[:-len(enclitic)])
                                            t.endchar = (
                                                start_char +
                                                len(token[:-len(enclitic)]) +
                                                len(enclitic))
                                            if mode == "index":
                                                self._cache.append(
                                                    copy.deepcopy(t))
                                            yield t
                                        is_enclitic = True
                                        break
                            if not is_enclitic:
                                t.text = token
                                if chars:
                                    t.startchar = start_char + ldiff
                                    t.endchar = (start_char + original_length -
                                                 rdiff)  # - ndiff - rdiff
                                if mode == "index":
                                    self._cache.append(copy.deepcopy(t))
                                yield t
                            start_char += original_length
                            sent_pos += 1
                            sect_pos += 1
                        start_char += 1
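
Much of the example above is devoted to splitting trailing enclitics into two tokens whose character offsets still line up with the surface form. A stripped-down sketch of just that step for the simple case, with an assumed subset of the enclitics list (the tokenizer above also re-expands -n, -st and -'s):

# Sketch: split a trailing enclitic off a Latin token, keeping char offsets
enclitics = ("que", "ve", "ne")  # assumed subset

def split_enclitic(token, start_char):
    for enc in enclitics:
        if token.endswith(enc) and len(token) > len(enc):
            host = token[:-len(enc)]
            yield host, start_char, start_char + len(host)
            yield enc, start_char + len(host), start_char + len(token)
            return
    yield token, start_char, start_char + len(token)

# list(split_enclitic("arma", 0))     -> [("arma", 0, 4)]
# list(split_enclitic("virumque", 5)) -> [("virum", 5, 10), ("que", 10, 13)]
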
Example No. 14
    def __call__(
        self,
        value,
        positions=False,
        chars=False,
        keeporiginal=True,
        removestops=True,
        tokenize=True,
        start_pos=0,
        start_char=0,
        mode="",
        **kwargs
    ):
        if kwargs.get("docix", None) == self._docix and self._cache:
            yield from self.cache
        else:
            t = CylleneusToken(
                positions, chars, removestops=removestops, mode=mode, **kwargs
            )

            if t.mode == "query":
                t.text = t.original = value
                yield t
            else:
                if not tokenize:
                    lines = []
                    for line in value["text"]:
                        line = re.sub(r"\t+", "\t", line.strip())
                        if line and line.startswith("# text_line"):
                            text = line.split("# text_line: ")[1]
                            lines.append(text)
                    t.original = t.text = "\n".join([line for line in lines])
                    t.boost = 1.0
                    if positions:
                        t.pos = start_pos
                    if chars:
                        t.startchar = start_char
                        t.endchar = start_char + len(t.text)
                    yield t
                else:
                    self._cache = []
                    self._docix = kwargs.get("docix", None)

                    t.boost = 1.0
                    t.pos = t.startchar = t.endchar = 0

                    meta = {
                        "text":                 None,  # work title
                        "text_id":              None,
                        "chapter":              None,  # reference
                        "chapter_id":           None,
                        "text_line":            None,  # the text
                        "text_line_id":         None,
                        "text_line_counter":    None,  # line number
                        "text_line_subcounter": None,  # token number
                    }

                    sect_pos = 0
                    sent_pos = 0
                    for line in value["text"]:
                        line = line.strip()
                        if line:
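                            # '#'-prefixed lines carry metadata (e.g. '# text_line: ...');
                            # other lines are tab-separated token rows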
                            if line.startswith("#"):
                                try:
                                    # use a local name to avoid shadowing the
                                    # `value` parameter
                                    label, field = line.split(":", maxsplit=1)
                                except ValueError:
                                    continue
                                label = label.split(" ", maxsplit=1)[1].strip()
                                field = field.strip()
                                meta[label] = (
                                    field
                                    if not field.isnumeric()
                                    else int(field)
                                )

                                if label in [
                                    "text_line_counter",
                                    "text_line_subcounter",
                                ]:
                                    sent_pos = 0
                            else:
                                try:
                                    (
                                        ID,
                                        FORM,
                                        LEMMA,
                                        UPOS,
                                        XPOS,
                                        MORPHO,
                                        _,
                                        _,
                                        _,
                                        _,
                                        LEMMA_ID,
                                        PADA,
                                        SEM,
                                    ) = line.split("\t")
                                except ValueError:
                                    try:
                                        (
                                            ID,
                                            FORM,
                                            LEMMA,
                                            _,
                                            XPOS,
                                            _,
                                            _,
                                            _,
                                            _,
                                            LEMMA_ID,
                                            _,
                                            _,
                                        ) = line.split("\t")
                                    except ValueError:
                                        try:
                                            (
                                                ID,
                                                FORM,
                                                _,
                                                _,
                                                _,
                                                _,
                                                _,
                                                _,
                                                _,
                                                _,
                                            ) = line.split("\t")
                                        except ValueError:
                                            continue
                                        else:
                                            t.original = FORM
                                            sect_pos += 1
                                            sent_pos += 1
                                            t.pos = sent_pos
                                            continue
                                    else:
                                        if FORM == "_":
                                            t.text = t.original
                                        else:
                                            sect_pos += 1
                                            sent_pos += 1

                                            t.text = FORM
                                            t.original = FORM
                                            t.pos = sent_pos
                                        t.lemma = LEMMA
                                        t.dcs_id = LEMMA_ID
                                        t.morphosyntax = XPOS
                                        t.morpho = None
                                        t.synset = None

                                        t.meta = {
                                            "meta":      "chapter-line",
                                            "chapter":   meta["chapter"],
                                            "line":      meta["text_line_counter"],
                                            "sect_pos":  sect_pos,
                                            "sect_sent": meta[
                                                             "text_line_counter"
                                                         ],
                                            "sent_id":   meta["text_line_id"],
                                            "sent_pos":  sent_pos,
                                        }
                                        t.startchar = start_char
                                        t.endchar = start_char + len(
                                            t.original
                                        )
                                        yield t

                                        # # Emit Devanagari
                                        # t.text = slp2deva(iast2slp(t.text))
                                        # t.mode = "skip"
                                        # yield t

                                        start_char += len(t.original) + 1
                                else:
                                    if FORM == "_":
                                        t.text = t.original
                                    else:
                                        sect_pos += 1
                                        sent_pos += 1

                                        t.text = FORM
                                        t.original = FORM
                                        t.pos = sent_pos
                                    t.lemma = LEMMA
                                    t.dcs_id = LEMMA_ID
                                    t.morphosyntax = XPOS
                                    t.morpho = None if MORPHO == "_" or not MORPHO else parse_morpho(XPOS, MORPHO)
                                    t.synset = None if SEM == "_" else SEM
                                    t.meta = {
                                        "meta":      "chapter-line",
                                        "chapter":   meta["chapter"],
                                        "line":      meta["text_line_counter"],
                                        "sect_pos":  sect_pos,
                                        "sect_sent": meta[
                                                         "text_line_counter"
                                                     ],
                                        "sent_id":   meta["text_line_id"],
                                        "sent_pos":  sent_pos,
                                    }
                                    t.startchar = start_char
                                    t.endchar = start_char + len(t.original)
                                    yield t

                                    start_char += len(t.original) + 1
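
Example No. 14 walks a CoNLL-U-like export line by line, falling back to narrower column layouts whenever a row does not carry all thirteen fields. A minimal sketch of the thirteen-column happy path, with an invented row (the real tokenizer additionally runs MORPHO through parse_morpho):

# Sketch: unpack one 13-column token row the way the tokenizer above does
row = "1\tdharmam\tdharma\tNOUN\tNC\tCase=Acc|Number=Sing\t_\t_\t_\t_\t12345\t_\t_"
(ID, FORM, LEMMA, UPOS, XPOS, MORPHO,
 _, _, _, _, LEMMA_ID, PADA, SEM) = row.split("\t")

token = {
    "text": FORM,                          # surface form
    "lemma": LEMMA,
    "dcs_id": LEMMA_ID,                    # lemma identifier in the source corpus
    "morphosyntax": XPOS,
    "synset": None if SEM == "_" else SEM,
}
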
Example No. 15
    def __call__(
        self,
        value: str,
        positions=True,
        chars=True,
        keeporiginal=True,
        removestops=True,
        tokenize=True,
        start_pos=0,
        start_char=0,
        mode="",
        **kwargs,
    ):
        if kwargs.get("docix") == self._docix and self._cache:
            yield from self.cache
        else:
            t = CylleneusToken(positions,
                               chars,
                               removestops=removestops,
                               mode=mode,
                               **kwargs)

            stopchars = '!"#$%()*+,-—./:;<=>?@[\]^_`{|}~'
            punctuation = str.maketrans("", "", stopchars)

            if t.mode == "query":
                t.original = t.text = value.translate(jvmap)
                yield t
            else:
                if not tokenize:
                    # Assume value is a list
                    for pos, token in enumerate(value):
                        t.original = t.text = token
                        t.boost = 1.0
                        if positions:
                            t.pos = pos
                        if chars:
                            t.startchar = start_char
                            t.endchar = start_char + len(token)
                            start_char += len(token)
                        yield t
                else:
                    self._cache = []
                    self._docix = kwargs.get("docix", None)

                    work_pos = 0
                    for i, sentence in enumerate(
                            sent_tokenizer.tokenize(value)):
                        sent_pos = 0
                        for token in word_tokenizer.word_tokenize(sentence):
                            if token in string.whitespace:
                                start_char += 1
                                continue
                            t.boost = 1.0
                            if keeporiginal:
                                t.original = token
                            original_length = len(token)
                            t.stopped = False

                            token = convert_diphthongs(
                                strip_diacritics(token)).translate(jvmap)
                            if token in stopchars:
                                start_char += original_length
                                continue
                            t.text = token.translate(punctuation)

                            if positions:
                                t.pos = start_pos + work_pos
                            if chars:
                                t.startchar = start_char
                                t.endchar = start_char + original_length
                            t.meta = {"sent_id": i, "sent_pos": sent_pos}
                            if mode == "index" and self.cached:
                                self._cache.append(copy.copy(t))
                            yield t

                            work_pos += 1
                            sent_pos += 1
                            start_char += original_length
                        start_char += 1
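
Used as a Whoosh-style analyzer callable, the method above yields one CylleneusToken per word with sentence-level positions and character offsets. A hedged usage sketch; LatinPlaintextTokenizer is only a stand-in name for whatever class defines this __call__:

# Sketch only: the class name and constructor arguments are assumptions
tokenizer = LatinPlaintextTokenizer()
text = "Arma virumque cano. Troiae qui primus ab oris."

for t in tokenizer(text, mode="index", docix=0):
    print(t.text, t.pos, t.startchar, t.endchar,
          t.meta["sent_id"], t.meta["sent_pos"])
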
Example No. 16
    def __call__(
        self,
        value,
        positions=True,
        chars=True,
        keeporiginal=True,
        removestops=True,
        tokenize=True,
        start_pos=0,
        start_char=0,
        mode="",
        **kwargs,
    ):
        if (kwargs.get("docix", None) == self._docix
                and self._cache is not None):
            yield from self.cache
        else:
            t = CylleneusToken(positions,
                               chars,
                               removestops=removestops,
                               mode=mode,
                               **kwargs)
            if t.mode == "query":
                t.original = t.text = value.translate(jvmap)
                yield t
            else:
                if not tokenize:
                    text = "\n".join([el for el in flatten(value["text"])])
                    t.original = t.text = text
                    t.boost = 1.0
                    if positions:
                        t.pos = start_pos
                    if chars:
                        t.startchar = start_char
                        t.endchar = start_char + len(text)
                    yield t
                else:
                    self._cache = []
                    self._docix = kwargs.get("docix", None)

                    tokenizer = PunktLatinCharsVars()
                    stopchars = str.maketrans("", "", string.punctuation)

                    doc = value["text"]
                    divs = [
                        cref.get("n") for cref in doc.findall(
                            ".//{http://www.tei-c.org/ns/1.0}cRefPattern")
                    ]
                    tei_base = "/tei:TEI/tei:text/tei:body/tei:div"
                    # Prose divisions
                    sentences = doc.xpath(
                        tei_base + ("/tei:div" * len(divs)),
                        namespaces={"tei": "http://www.tei-c.org/ns/1.0"},
                    )
                    # Fall back to poetry divisions
                    if len(sentences) == 0:
                        sentences = doc.xpath(
                            tei_base + ("/tei:div" * (len(divs) - 1)) +
                            "/tei:l",
                            namespaces={"tei": "http://www.tei-c.org/ns/1.0"},
                        )
                    # Fall back to speaker divisions (plays)
                    if len(sentences) == 0:
                        sentences = doc.xpath(
                            tei_base + ("/tei:div" * (len(divs) - 1)) +
                            "/tei:sp/tei:l",
                            namespaces={"tei": "http://www.tei-c.org/ns/1.0"},
                        )
                    for i, sentence in enumerate(sentences):
                        meta = {
                            "meta": "-".join(divs),
                            divs[-1]: sentence.get("n"),
                            "sent_id": i,
                        }

                        el = sentence
                        j = -1
                        while el is not None:
                            if el.getparent() is not None:
                                if (el.getparent().get("type", None)
                                        == "textpart" or el.getparent().tag
                                        == "{http://www.tei-c.org/ns/1.0}sp"):
                                    if (el.getparent().tag ==
                                            "{http://www.tei-c.org/ns/1.0}sp"):
                                        meta["speaker"] = (el.getparent().find(
                                            ".//{http://www.tei-c.org/ns/1.0}speaker"
                                        ).text)
                                    elif (el.getparent().get(
                                            "type", None) == "textpart"):
                                        j -= 1
                                        meta[divs[j]] = el.getparent().get("n")
                            el = el.getparent()

                        text = sentence.text
                        # If no text sits directly on the node, flatten its descendants

                        if not text:
                            text = stringify(sentence)

                        if not text:
                            continue

                        tokens = []
                        temp_tokens = tokenizer.word_tokenize(text.strip())

                        if temp_tokens:
                            if (temp_tokens[0].replace("j",
                                                       "i").replace("v", "u")
                                    not in proper_names.proper_names):
                                # assumed intent: lowercase a sentence-initial
                                # token that is not a proper name
                                temp_tokens[0] = temp_tokens[0].lower()

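                            # If a perfect passive participle is followed, two
                            # tokens later (past the whitespace token), by a
                            # matching form of the copula, merge the two into a
                            # single compound token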
                            for ix, token in enumerate(temp_tokens):
                                ppp = compound.is_ppp(token)
                                if ppp and ix < len(temp_tokens) - 2:
                                    copula = compound.is_copula(
                                        temp_tokens[ix + 2])  # whitespace
                                    if copula and ppp[1] == copula[2]:
                                        tense, mood, number, i = copula
                                        token = f"{token} {compound.copula[tense][mood][number][i]}"
                                        del temp_tokens[ix + 1:ix + 3]
                                        tokens.insert(ix, token)
                                    else:
                                        tokens.append(token)
                                else:
                                    tokens.append(token)

                        pos = 0
                        for token in tokens:
                            meta["sent_pos"] = pos

                            t.boost = 1.0
                            if keeporiginal:
                                t.original = token
                            t.stopped = False
                            token = convert_diphthongs(
                                strip_diacritics(token)).translate(jvmap)

                            if positions:
                                t.pos = start_pos + pos
                            if (token == " " or token in punctuation
                                    or token in stopchars):
                                pos += 1
                                continue
                            original_length = len(token)

                            token = token.strip()
                            ltoken = token.lstrip(string.punctuation)
                            ldiff = original_length - len(ltoken)
                            if ldiff != 0:
                                token = ltoken
                            rtoken = token.rstrip(string.punctuation)
                            rdiff = len(token) - len(rtoken)
                            if rdiff != 0:
                                token = rtoken
                            ntoken = token.translate(stopchars)
                            ndiff = len(token) - len(ntoken)
                            if ndiff:
                                token = ntoken
                            if not token:
                                start_char += original_length
                                continue

                            t.meta = copy.deepcopy(meta)

                            is_enclitic = False
                            if token not in exceptions:
                                if t.original in replacements:
                                    for subtoken in replacements[t.original]:
                                        t.text = subtoken
                                        t.startchar = start_char
                                        t.endchar = (start_char +
                                                     original_length)
                                        if mode == "index":
                                            self._cache.append(
                                                copy.deepcopy(t))
                                        yield t
                                    start_char += original_length
                                    continue

                                if re.match(r"(?:\w+) (?:\w+)", token):
                                    ppp, copula = token.split(" ")
                                    t.text = ppp
                                    t.startchar = start_char
                                    t.endchar = start_char + len(ppp) + 1
                                    if mode == "index":
                                        self._cache.append(copy.deepcopy(t))
                                    yield t
                                    t.text = copula
                                    t.startchar = start_char + len(ppp)
                                    t.endchar = (start_char + len(ppp) +
                                                 len(copula))
                                    if mode == "index":
                                        self._cache.append(copy.deepcopy(t))
                                    yield t
                                    start_char += original_length
                                    continue

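                                # Emit a trailing enclitic as its own token after
                                # the host word, re-expanding contracted forms
                                # (e.g. -n -> ...s + ne, -st -> est)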
                                for enclitic in enclitics:
                                    if token.endswith(enclitic):
                                        if enclitic == "ne":
                                            t.text = token[:-len(enclitic)]
                                            t.startchar = start_char
                                            t.endchar = start_char + (
                                                len(token) - len(enclitic))
                                            if mode == "index":
                                                self._cache.append(
                                                    copy.deepcopy(t))
                                            yield t
                                            t.text = "ne"
                                            t.startchar = start_char + len(
                                                token[:-len(enclitic)])
                                            t.endchar = (
                                                start_char +
                                                len(token[:-len(enclitic)]) +
                                                len(enclitic))
                                            if mode == "index":
                                                self._cache.append(
                                                    copy.deepcopy(t))
                                            yield t
                                        elif enclitic == "n":
                                            t.text = (token[:-len(enclitic)] +
                                                      "s")
                                            t.startchar = start_char
                                            t.endchar = (start_char +
                                                         len(token) -
                                                         len(enclitic))
                                            if mode == "index":
                                                self._cache.append(
                                                    copy.deepcopy(t))
                                            yield t
                                            t.text = "ne"
                                            t.startchar = start_char + len(
                                                token[:-len(enclitic)])
                                            t.endchar = (
                                                start_char +
                                                len(token[:-len(enclitic)]) +
                                                len(enclitic))
                                            if mode == "index":
                                                self._cache.append(
                                                    copy.deepcopy(t))
                                            yield t
                                        elif enclitic == "st":
                                            if token.endswith("ust"):
                                                t.text = token[:-len(enclitic)]
                                                t.startchar = start_char
                                                t.endchar = (start_char + len(
                                                    token[:-len(enclitic)]) -
                                                             len(enclitic))
                                                if mode == "index":
                                                    self._cache.append(
                                                        copy.deepcopy(t))
                                                yield t
                                                t.text = "est"
                                                t.startchar = start_char + len(
                                                    token[:-len(enclitic)])
                                                t.endchar = (start_char + len(
                                                    token[:-len(enclitic)]) +
                                                             len(enclitic))
                                                if mode == "index":
                                                    self._cache.append(
                                                        copy.deepcopy(t))
                                                yield t
                                            else:
                                                t.text = token[:-len(enclitic)]
                                                t.startchar = start_char
                                                t.endchar = (start_char + len(
                                                    token[:-len(enclitic)]) -
                                                             len(enclitic))
                                                if mode == "index":
                                                    self._cache.append(
                                                        copy.deepcopy(t))
                                                yield t
                                                t.text = "est"
                                                t.startchar = start_char + len(
                                                    token[:-len(enclitic)])
                                                t.endchar = (start_char + len(
                                                    token[:-len(enclitic)]) +
                                                             len(enclitic))
                                                if mode == "index":
                                                    self._cache.append(
                                                        copy.deepcopy(t))
                                                yield t
                                        elif enclitic == "'s":
                                            t.text = token + "s"
                                            t.startchar = start_char
                                            t.endchar = start_char + len(token)
                                            if mode == "index":
                                                self._cache.append(
                                                    copy.deepcopy(t))
                                            yield t
                                            t.text = "es"
                                            t.startchar = (start_char +
                                                           len(token) + 1)
                                            t.endchar = (start_char +
                                                         len(token) +
                                                         len(enclitic))
                                            if mode == "index":
                                                self._cache.append(
                                                    copy.deepcopy(t))
                                            yield t
                                        else:
                                            t.text = token[:-len(enclitic)]
                                            t.startchar = start_char
                                            t.endchar = start_char + len(
                                                token[:-len(enclitic)])
                                            if mode == "index":
                                                self._cache.append(
                                                    copy.deepcopy(t))
                                            yield t
                                            t.text = enclitic
                                            t.startchar = start_char + len(
                                                token[:-len(enclitic)])
                                            t.endchar = (
                                                start_char +
                                                len(token[:-len(enclitic)]) +
                                                len(enclitic))
                                            if mode == "index":
                                                self._cache.append(
                                                    copy.deepcopy(t))
                                            yield t
                                        is_enclitic = True
                                        break
                            if not is_enclitic:
                                t.text = token
                                if chars:
                                    t.startchar = start_char + ldiff
                                    t.endchar = (start_char + original_length -
                                                 rdiff)
                                if mode == "index":
                                    self._cache.append(copy.deepcopy(t))
                                yield t
                            start_char += original_length
                        start_char += 1
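
The TEI tokenizer above finds its citable units by building an XPath from the divisions declared in the header's cRefPattern elements, then falling back to line (tei:l) and speaker (tei:sp/tei:l) structures for poetry and drama. A condensed sketch of that lookup, assuming the document has already been parsed with lxml (the file name is hypothetical):

# Sketch: resolve citable TEI units, mirroring the fallback order above
from lxml import etree

doc = etree.parse("work.tei.xml")  # hypothetical TEI file
TEI_NS = {"tei": "http://www.tei-c.org/ns/1.0"}
tei_base = "/tei:TEI/tei:text/tei:body/tei:div"

divs = [cref.get("n")
        for cref in doc.findall(".//{http://www.tei-c.org/ns/1.0}cRefPattern")]

units = doc.xpath(tei_base + "/tei:div" * len(divs), namespaces=TEI_NS)
if not units:  # poetry: lines inside the innermost division
    units = doc.xpath(tei_base + "/tei:div" * (len(divs) - 1) + "/tei:l",
                      namespaces=TEI_NS)
if not units:  # drama: lines inside speaker turns
    units = doc.xpath(tei_base + "/tei:div" * (len(divs) - 1) + "/tei:sp/tei:l",
                      namespaces=TEI_NS)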