Example #1
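Build a simple lemmatizer table: for every document parsed from DATA_DIRECTORY, collect the token annotations (the tok_anno elements) and yield a mapping from each normalized form (norm) to the set of lemmata it is annotated with.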
import collections

def make_lematizer():
    for root in get_data(DATA_DIRECTORY, parser):
        if root is not None:
            tokens = [
                extract_annotations(entry)
                for entry in root.findall(".//tok_anno")
            ]
            # Map each normalized form to the set of lemmata it occurs with.
            normalized_to_lemma = collections.defaultdict(set)
            pairs = [(token["norm"], token["lemma"]) for token in tokens
                     if "norm" in token and "lemma" in token]
            for norm, lemma in pairs:
                normalized_to_lemma[norm].add(lemma)
            yield normalized_to_lemma
        else:
            print("no root")
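make_lematizer yields one table per document. A minimal sketch of merging them into a single lookup (merge_lemma_tables is a hypothetical helper, not part of the original code):

import collections

def merge_lemma_tables(tables):
    # Union the per-document norm -> {lemma, ...} mappings into one table.
    merged = collections.defaultdict(set)
    for table in tables:
        for norm, lemmata in table.items():
            merged[norm] |= lemmata
    return merged

lemmatizer = merge_lemma_tables(make_lematizer())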
Example #2
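The same traversal grouped by part of speech: yield, per document, a mapping from each POS tag to the set of lemmata that carry it.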
def make_pos_tagger_to_lemma():
    for root in get_data(DATA_DIRECTORY, parser):
        if root is not None:
            tokens = [extract_annotations(entry) for entry in root.findall(".//tok_anno")]
            pos_tags = extract_by_tag("pos", tokens)
            pos_set = set(pos_tags)

            # Map each POS tag to the set of lemmata annotated with it.
            pos_to_lemmata = {pos: {token["lemma"] for token in tokens
                                    if "lemma" in token and "pos" in token and token["pos"] == pos}
                              for pos in pos_set}
            yield pos_to_lemmata
Example #3
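Yield, per document, a mapping from each normalized form to the set of POS tags it occurs with.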
def make_norm_to_pos_tagger():
    for root in get_data(DATA_DIRECTORY, parser):
        if root is not None:
            tokens = [extract_annotations(entry) for entry in root.findall(".//tok_anno")]
            norms = extract_by_tag("norm", tokens)
            norm_set = set(norms)

            # Map each normalized form to the set of POS tags it occurs with.
            norm_to_pos = {norm: {token["pos"] for token in tokens
                                  if "norm" in token and "pos" in token and token["norm"] == norm}
                           for norm in norm_set}
            yield norm_to_pos
Example #4
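Yield, per document, a mapping from each lemma to the set of POS tags it is annotated with.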
def make_lemma_to_pos_tagger():
    for root in get_data(DATA_DIRECTORY, parser):
        if root is not None:
            tokens = [extract_annotations(entry) for entry in root.findall(".//tok_anno")]
            lemmata = extract_by_tag("lemma", tokens)
            lemmata_set = set(lemmata)

            # Map each lemma to the set of POS tags it is annotated with.
            lemmata_to_pos = {lemma: {token["pos"] for token in tokens
                                      if "lemma" in token and "pos" in token and token["lemma"] == lemma}
                              for lemma in lemmata_set}
            yield lemmata_to_pos
Example #5
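The inverse of Example #1: yield, per document, a mapping from each lemma to the set of normalized forms that realize it.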
def make_lemma_to_forms():
    for root in get_data(DATA_DIRECTORY, parser):
        if root is not None:
            tokens = [
                extract_annotations(entry)
                for entry in root.findall(".//tok_anno")
            ]
            lemmata = extract_by_tag("lemma", tokens)
            lemmata_set = set(lemmata)
            # Map each lemma to the set of normalized forms that realize it.
            lemma_to_normalized = {
                lemma: {
                    token["norm"]
                    for token in tokens if "norm" in token and "lemma" in token
                    and token["lemma"] == lemma
                }
                for lemma in lemmata_set
            }
            yield lemma_to_normalized
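A related helper iterates over the same documents and yields their plain text via read_text_from_root (not shown in this listing):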
def read_xml_annotations():
    for root in get_data(DATA_DIRECTORY, parser):
        if root is not None:
            yield read_text_from_root(root)
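All of the examples above rely on helpers that the snippets do not show. Below is a minimal sketch of what they might look like, assuming lxml parsing and CorA-style tok_anno elements whose annotation values sit in a "tag" attribute on child elements; the real helpers may differ, so treat every name and schema detail here as an assumption.

import os
from lxml import etree

DATA_DIRECTORY = "data"      # assumption: a directory of annotated XML files
parser = etree.XMLParser()   # lxml parsers are reusable across documents

def get_data(directory, parser):
    # Yield the root element of every XML file in the directory,
    # or None when a file fails to parse.
    for name in sorted(os.listdir(directory)):
        if not name.endswith(".xml"):
            continue
        try:
            tree = etree.parse(os.path.join(directory, name), parser)
            yield tree.getroot()
        except etree.XMLSyntaxError:
            yield None

def extract_annotations(entry):
    # Assumption: each <tok_anno> carries its annotations (norm, lemma,
    # pos, ...) as child elements with the value in a "tag" attribute,
    # as in CorA-style exports; adjust to the real schema as needed.
    return {child.tag: child.get("tag") for child in entry}

def extract_by_tag(tag, tokens):
    # Pull one annotation layer out of a token list, skipping tokens
    # that lack it.
    return [token[tag] for token in tokens if tag in token]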