Example #1
from flexpy.Corpus import Corpus
from flexpy.tags.RtLexEntry import RtLexEntry
from flexpy.LexEntry import LexEntry
from flexpy.FlexPyUtil import get_tone_letters_from_string

project_dir = "/home/wesley/.local/share/fieldworks/Projects/"
project_name = "IxpantepecMixtec"

corpus = Corpus(project_dir, project_name, include_punctuation=False)
tag_dict = corpus.tag_dict

# desired result:
# a list of lexical items, with each place an item occurs in 1st position
# and each place it occurs in 2nd position;
# show the baseline tone pattern and the perturbed tone pattern for both,
# then scan this big list to see what jumps out

by_lex_guid = {}

for text in corpus.texts:
    # print(f"current text is {text}")

    # iterate over MORPHEMES in the text (not across paragraph boundaries)
    # know the LexEntry that each of them belongs to
    if text.paragraphs is None:
        # print(f"skipping text {text} due to lack of paragraphs")
        continue
    for pg in text.paragraphs:
        # print("\n---- new paragraph ----\n")
        # print(f"current paragraph is: {pg}")
        # print(f"run texts is: {pg.run_texts}")
Example #2
from flexpy.Corpus import Corpus

project_dir = "/home/wesley/.local/share/fieldworks/Projects/"
project_name = "Bara"
corpus = Corpus(project_dir, project_name, include_punctuation=True)

output_dir = "/home/wesley/Desktop/UOregon Work/CorpusLinguistics/corpora/Bara/"
corpus.write_texts_to_file(output_dir)
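A quick way to confirm the export, assuming write_texts_to_file creates one file per text inside output_dir (the naming scheme is not shown in this excerpt):

import os

for fname in sorted(os.listdir(output_dir)):
    print(fname)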
Example #3
    medial_tags = ["ss", "ds"]
    final_tags = ["fpst", "rpst", "prs", "ifut", "fut", "imp", "irr"]
    if any(component in medial_tags for component in gloss_components):
        # check these first; medial tags override final ones, e.g. irr.ds is medial, not final
        return "medial"
    elif any(component in final_tags for component in gloss_components):
        return "final"
    else:
        raise Exception("gloss unclassifiable for finality: {}".format(gloss))


if __name__ == "__main__":
    project_name = "Bongu"
    project_dir = "/home/wesley/.local/share/fieldworks/Projects/"
    include_punctuation = True
    corpus = Corpus(project_dir, project_name, include_punctuation)

    texts_to_include = None  # pass as None to get all non-omitted texts
    # texts_to_include = ["Help"]  # test stuff with a single shorter text
    texts_to_omit = [None, "None", "*Nouns", "*Ungram.", "*Random", "*Verbs"]
    wordform_contents = corpus.get_wordform_contents(
        texts_separated=True,
        sentences_separated=True,
        paragraphs_separated=False,
        texts_to_include=texts_to_include,
        texts_to_omit=texts_to_omit,
    )
    # print(wordform_contents)
    sentences = []
    for text in wordform_contents:
        for sentence in text:
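            # assumed continuation (the excerpt is cut off here):
            # flatten the per-text structure into one flat list of sentences
            sentences.append(sentence)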
Example #4
from flexpy.Corpus import Corpus
from flexpy.Lexicon import Lexicon

project_dir = "/home/wesley/.local/share/fieldworks/Projects/"
project_name = "Bara"
corpus = Corpus(project_dir, project_name, include_punctuation=False)
contents = corpus.get_tokenized_contents()

regex = r"\b(eat|drink|kaikai|dring)"
# regex = r"\.ss"
# regex = r"^(to )?(eat|drink|kaikai|dring(im)?)$"
words = corpus.search_lexicon_glosses(regex)
print("searched for words containing gloss regex {}".format(regex))
print("Results: {}".format(words))

# print("searching in word glosses of texts")
# TODO
# words2 = corpus.search_word_glosses(regex)
# print("Results: {}".format(words2))

# print("searching in free translations of texts")
# TODO
# lines = corpus.search_free_translations(regex)
# print("Results: {}".format(lines))
Example #5
import random

from flexpy.Corpus import Corpus
import flexpy.XMLTagMap as xml_tag_map


project_dir = "/home/wesley/.local/share/fieldworks/Projects/"
project_name = "Bongu"

bongu_corpus = Corpus(project_dir, project_name, include_punctuation=False)
contents = bongu_corpus.get_tokenized_contents()

dependency_dict = bongu_corpus.tag_dict.dependency_dict

random_rt = random.choice(bongu_corpus.tag_dict.all_elements())
print("\n-- class definition for {}".format(random_rt.attrib["class"]))
print(xml_tag_map.create_tag_class_definition(random_rt, dependency_dict))

# test instantiating a tag class from an element
dict_of_single_class = bongu_corpus.tag_dict["RtCmAnnotationDefn"]
random_guid = random.choice(list(dict_of_single_class.keys()))
els_with_guid = dict_of_single_class[random_guid]
el = els_with_guid[0]
print("\n-- attempting object instantiation from element {}".format(el))
obj = bongu_corpus.tag_dict.get_python_object_from_element(el)
print(obj)
print(obj.__dict__)

print("success")
Example #6
        if any(k[-len(w):] == w for w in words_of_interest)
    ]
    freqs_of_interest_ending = {
        k: ngram_freq[k]
        for k in keys_ending_with_words_of_interest
    }
    print("\n---- ngrams containing words of interest at end ----")
    ct.head(freqs_of_interest_ending, hits=10)

    # show collocations of most frequent words or words of interest
    # n_most_frequent_words = 5
    # collocation_words_tups = get_top_n_dict_items(freq, n_most_frequent_words)
    metrics = ["MI", "T", "freq", "right", "left"]
    # for word, _ in collocation_words_tups:
    for word in words_of_interest:
        for metric in metrics:
            collocates = ct.collocator(get_new_tokenized(), word, stat=metric)
            print("----\nCollocations for {} using stat={}:".format(
                word, metric))
            ct.head(collocates, hits=10)


if __name__ == "__main__":
    project_name = "Isan"
    project_dir = "/home/wesley/.local/share/fieldworks/Projects/"
    corpus = Corpus(project_dir, project_name, include_punctuation=False)
    contents = corpus.get_contents()

    print("corpus size: {}".format(corp.get_corpus_size_words(contents)))
    perform_min_analysis(contents)
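get_new_tokenized() is defined above this excerpt and is called once per collocator run, presumably because corpus_toolkit iterates, and so exhausts, whatever iterable it is handed. A hypothetical stand-in under that assumption:

def get_new_tokenized():
    # hypothetical stand-in: return a fresh iterable of tokenized lines on
    # every call, so repeated ct.collocator calls never share an exhausted
    # generator
    return corpus.get_tokenized_contents()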
Example #7
import re

from flexpy.Corpus import Corpus
from flexpy.Text import Text
from flexpy.WordForm import WordForm, WordFormMorpheme

if __name__ == "__main__":
    project_name = "Bongu"
    project_dir = "/home/wesley/.local/share/fieldworks/Projects/"
    corpus = Corpus(project_dir, project_name, include_punctuation=False)

    texts_to_omit = [None, "None", "*Nouns", "*Ungram.", "*Random", "*Verbs"]
    # wordform_contents = corpus.get_wordform_contents(
    #     texts_separated=True,
    #     paragraphs_separated=False,
    #     texts_to_omit=texts_to_omit,
    # )
    contents = corpus.get_tokenized_contents(texts_to_omit=texts_to_omit)
    report_individual_matches = False  # for printing each token's line

    # token_regex = r"(^|\b)(?P<token>[pbtdkgqj][aeiou])\s?[^aoeuimnñŋlr]"  # certain word-initial stop-vowel combos
    onsets_segs = ["p", "t", "k", "b", "d", "g", "q", "j"]
    vowels_segs = ["a", "e", "i", "o", "u"]
    onsets = [
        "[{}]".format("".join(onsets_segs))
    ]  # wrap all onsets in one regex character class so the loop below runs only once
    # onsets = "d"
    all_counts = {}
    for onset in onsets:
        # token_regex = r"(^|\b)(?P<token>" + onset + "[aeiou]\s?[tds])"
        token_regex = r"(^|\b)(?P<token>" + onset + "[{}])".format(
Example #8
        "Yangulam",
    ]
    print("found {} languages".format(len(language_names)))
    project_dir = "/home/wesley/.local/share/fieldworks/Projects/"
    lexicons = {}
    wordlist_dir = "/home/wesley/flexpy/flexpy/language_data/MadangWordlists/"
    for ln in language_names:
        print("-- loading language {}".format(ln))
        filename = "{}.tsv".format(ln)
        fp = os.path.join(wordlist_dir, filename)
        if os.path.exists(fp):
            lexicon = load_lexicon_from_tsv(fp, ln)
            lexicons[ln] = lexicon
        else:
            print("creating tsv for lexicon of {}".format(ln))
            corpus = Corpus(project_dir, ln, include_punctuation=False)
            write_lexicon_tsv(corpus, fp)
            assert os.path.exists(fp)  # should be there now that we wrote it
            lexicon = load_lexicon_from_tsv(fp, ln)
            lexicons[ln] = lexicon

        print("{} has {} lexeme entries".format(ln, len(lexicon.lexemes)))

    test_show_similarity(lexicons)
    lexicon1, lexicon2 = random.sample(list(lexicons.values()), 2)
    glosses = [
        "man",
        "woman",
        "canoe",
        "sun",
        "red",