from flexpy.Corpus import Corpus
from flexpy.tags.RtLexEntry import RtLexEntry
from flexpy.LexEntry import LexEntry
from flexpy.FlexPyUtil import get_tone_letters_from_string

project_dir = "/home/wesley/.local/share/fieldworks/Projects/"
project_name = "IxpantepecMixtec"
corpus = Corpus(project_dir, project_name, include_punctuation=False)
tag_dict = corpus.tag_dict

# desired result:
# list of lexical items, each place it occurs in 1st position
# and each place it occurs in 2nd position
# show baseline tone pattern and perturbed tone pattern of both
# look at this big list to see what jumps out

by_lex_guid = {}
for text in corpus.texts:
    # print(f"current text is {text}")
    # iterate over MORPHEMES in the text (not across paragraph boundaries)
    # know the LexEntry that each of them belongs to
    if text.paragraphs is None:
        # print(f"skipping text {text} due to lack of paragraphs")
        continue
    for pg in text.paragraphs:
        # print("\n---- new paragraph ----\n")
        # print(f"current paragraph is: {pg}")
        # print(f"run texts is: {pg.run_texts}")
from flexpy.Corpus import Corpus

# dump every text in the Bara FLEx project to plain-text files
project_dir = "/home/wesley/.local/share/fieldworks/Projects/"
project_name = "Bara"
corpus = Corpus(project_dir, project_name, include_punctuation=True)

output_dir = "/home/wesley/Desktop/UOregon Work/CorpusLinguistics/corpora/Bara/"
corpus.write_texts_to_file(output_dir)
medial_tags = ["ss", "ds"] final_tags = ["fpst", "rpst", "prs", "ifut", "fut", "imp", "irr"] if any(component in medial_tags for component in gloss_components): # check these first, they will override the others, e.g. irr.ds is medial, not final return "medial" elif any(component in final_tags for component in gloss_components): return "final" else: raise Exception("gloss unclassifiable for finality: {}".format(gloss)) if __name__ == "__main__": project_name = "Bongu" project_dir = "/home/wesley/.local/share/fieldworks/Projects/" include_punctuation = True corpus = Corpus(project_dir, project_name, include_punctuation) texts_to_include = None # pass as None to get all non-omitted texts # texts_to_include = ["Help"] # test stuff with a single shorter text texts_to_omit = [None, "None", "*Nouns", "*Ungram.", "*Random", "*Verbs"] wordform_contents = corpus.get_wordform_contents( texts_separated=True, sentences_separated=True, paragraphs_separated=False, texts_to_include=texts_to_include, texts_to_omit=texts_to_omit, ) # print(wordform_contents) sentences = [] for text in wordform_contents: for sentence in text:
from flexpy.Corpus import Corpus
from flexpy.Lexicon import Lexicon

project_dir = "/home/wesley/.local/share/fieldworks/Projects/"
project_name = "Bara"
corpus = Corpus(project_dir, project_name, include_punctuation=False)
contents = corpus.get_tokenized_contents()

regex = r"\b(eat|drink|kaikai|dring)"
# regex = r"\.ss"
# regex = r"^(to )?(eat|drink|kaikai|dring(im)?)$"
words = corpus.search_lexicon_glosses(regex)
print("searched for words containing gloss regex {}".format(regex))
print("Results: {}".format(words))

# TODO: searching in word glosses of texts
# print("searching in word glosses of texts")
# words2 = corpus.search_word_glosses(regex)
# print("Results: {}".format(words2))

# TODO: searching in free translations of texts
# print("searching in free translations of texts")
# lines = corpus.search_free_translations(regex)
# print("Results: {}".format(lines))
import random

from flexpy.Corpus import Corpus
import flexpy.XMLTagMap as xml_tag_map

project_dir = "/home/wesley/.local/share/fieldworks/Projects/"
project_name = "Bongu"
bongu_corpus = Corpus(project_dir, project_name, include_punctuation=False)
contents = bongu_corpus.get_tokenized_contents()
dependency_dict = bongu_corpus.tag_dict.dependency_dict

# test generating a tag class definition from a random rt element
random_rt = random.choice(bongu_corpus.tag_dict.all_elements())
print("\n-- class definition for {}".format(random_rt.attrib["class"]))
print(xml_tag_map.create_tag_class_definition(random_rt, dependency_dict))

# test instantiating a tag class from an element
dict_of_single_class = bongu_corpus.tag_dict["RtCmAnnotationDefn"]
random_guid = random.choice(list(dict_of_single_class.keys()))
els_with_guid = dict_of_single_class[random_guid]
el = els_with_guid[0]
print("\n-- attempting object instantiation from element {}".format(el))
obj = bongu_corpus.tag_dict.get_python_object_from_element(el)
print(obj)
print(obj.__dict__)
print("success")
    if any(k[-len(w):] == w for w in words_of_interest)
]
freqs_of_interest_ending = {
    k: ngram_freq[k] for k in keys_ending_with_words_of_interest
}
print("\n---- ngrams containing words of interest at end ----")
ct.head(freqs_of_interest_ending, hits=10)

# show collocations of most frequent words or words of interest
# n_most_frequent_words = 5
# collocation_words_tups = get_top_n_dict_items(freq, n_most_frequent_words)
metrics = ["MI", "T", "freq", "right", "left"]
# for word, _ in collocation_words_tups:
for word in words_of_interest:
    for metric in metrics:
        collocates = ct.collocator(get_new_tokenized(), word, stat=metric)
        print("----\nCollocations for {} using stat={}:".format(word, metric))
        ct.head(collocates, hits=10)


if __name__ == "__main__":
    project_name = "Isan"
    project_dir = "/home/wesley/.local/share/fieldworks/Projects/"
    corpus = Corpus(project_dir, project_name, include_punctuation=False)
    contents = corpus.get_contents()
    print("corpus size: {}".format(corp.get_corpus_size_words(contents)))
    perform_min_analysis(contents)
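# A minimal, self-contained sketch of the suffix-filtering pattern at the top
# of this script, with a toy frequency dict standing in for corpus_toolkit
# n-gram output; the "__" joiner is an assumption about that output format.
ngram_freq = {"ab__cd": 5, "cd__ef": 3, "gh__ab": 2}
words_of_interest = ["ab"]
keys_ending_with_words_of_interest = [
    k for k in ngram_freq
    if any(k[-len(w):] == w for w in words_of_interest)
]
print({k: ngram_freq[k] for k in keys_ending_with_words_of_interest})
# {'gh__ab': 2}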
import re

from flexpy.Corpus import Corpus
from flexpy.Text import Text
from flexpy.WordForm import WordForm, WordFormMorpheme

if __name__ == "__main__":
    project_name = "Bongu"
    project_dir = "/home/wesley/.local/share/fieldworks/Projects/"
    corpus = Corpus(project_dir, project_name, include_punctuation=False)
    texts_to_omit = [None, "None", "*Nouns", "*Ungram.", "*Random", "*Verbs"]
    # wordform_contents = corpus.get_wordform_contents(
    #     texts_separated=True,
    #     paragraphs_separated=False,
    #     texts_to_omit=texts_to_omit,
    # )
    contents = corpus.get_tokenized_contents(texts_to_omit=texts_to_omit)

    report_individual_matches = False  # for printing each token's line

    # certain word-initial stop-vowel combos:
    # token_regex = r"(^|\b)(?P<token>[pbtdkgqj][aeiou])\s?[^aoeuimnñŋlr]"
    onsets_segs = ["p", "t", "k", "b", "d", "g", "q", "j"]
    vowels_segs = ["a", "e", "i", "o", "u"]
    # hack: join the onsets into a single regex character class,
    # so the for loop below only iterates once
    onsets = ["[{}]".format("".join(onsets_segs))]
    # onsets = "d"

    all_counts = {}
    for onset in onsets:
        # token_regex = r"(^|\b)(?P<token>" + onset + r"[aeiou]\s?[tds])"
        token_regex = r"(^|\b)(?P<token>" + onset + "[{}])".format(
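# A minimal, runnable sketch of the character-class trick above, applied to
# made-up sample tokens; only word-initial onset+vowel sequences match.
import re

onset_class = "[{}]".format("".join(["p", "t", "k", "b", "d", "g", "q", "j"]))
vowel_class = "[{}]".format("".join(["a", "e", "i", "o", "u"]))
token_regex = r"(^|\b)(?P<token>" + onset_class + vowel_class + ")"
for token in ["tamo", "iba", "ŋgu"]:
    m = re.search(token_regex, token)
    print(token, "->", m.group("token") if m else None)
# tamo -> ta; iba -> None; ŋgu -> None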
"Yangulam", ] print("found {} languages".format(len(language_names))) project_dir = "/home/wesley/.local/share/fieldworks/Projects/" lexicons = {} wordlist_dir = "/home/wesley/flexpy/flexpy/language_data/MadangWordlists/" for ln in language_names: print("-- loading language {}".format(ln)) filename = "{}.tsv".format(ln) fp = os.path.join(wordlist_dir, filename) if os.path.exists(fp): lexicon = load_lexicon_from_tsv(fp, ln) lexicons[ln] = lexicon else: print("creating tsv for lexicon of {}".format(ln)) corpus = Corpus(project_dir, ln, include_punctuation=False) write_lexicon_tsv(corpus, fp) assert os.path.exists(fp) # should be there now that we wrote it lexicon = load_lexicon_from_tsv(fp, ln) lexicons[ln] = lexicon print("{} has {} lexeme entries".format(ln, len(lexicon.lexemes))) test_show_similarity(lexicons) lexicon1, lexicon2 = random.sample(list(lexicons.values()), 2) glosses = [ "man", "woman", "canoe", "sun", "red",