def main():
    """Parse Aymara word forms into root+suffix using the de Lucca roots.

    Reads the preprocessed Aymara corpus and the preprocessed de Lucca root
    list, inserts a '+' at the root/suffix boundary of every word that begins
    with a known root, and writes the parsed forms plus their UCLAPL and
    ejective-transcribed variants.
    """
    out_dir = os.path.join(os.pardir, 'Aymara', 'Outputs', 'Transcription')
    words = uti.set_reader(os.path.join(out_dir, 'aymara_preprocessed.txt'))
    roots = uti.set_reader(os.path.join(
        os.pardir, 'Aymara', 'Inputs', 'delucca',
        'ay_roots_delucca_preprocessed.txt'))
    # Mark the boundary after any root that prefixes the word; rstrip drops
    # the trailing '+' when the whole word is itself a root.  A word that
    # starts with several distinct roots yields one parse per root.
    words_from_roots = {
        (word[:len(root)] + '+' + word[len(root):]).rstrip('+')
        for word in words for root in roots if word.startswith(root)}
    uti.write_iter(words_from_roots,
                   os.path.join(out_dir, 'aymara_dl_parsed_preprocessed.txt'))
    words_from_roots_pl = ay_trans.pl_trans(words_from_roots)
    uti.write_iter(words_from_roots_pl,
                   os.path.join(out_dir, 'aymara_dl_parsed_pl.txt'))
    words_from_roots_ej = {ay_trans.nth_transcribe(word, ay.ejectives)
                           for word in words_from_roots}
    uti.write_iter(
        words_from_roots_ej,
        os.path.join(out_dir, 'aymara_dl_parsed_preprocessed_ejectives.txt'))
    words_from_roots_ej_pl = ay_trans.pl_trans(words_from_roots_ej)
    uti.write_iter(words_from_roots_ej_pl,
                   os.path.join(out_dir, 'aymara_dl_parsed_pl_ejectives.txt'))
def main():
    """Transcribe the Nkore-Kiga roots, word forms, and parsed word forms.

    For each of the three corpora, reads the orthographic pre-transcription
    file, applies the palatalizing transcription, and writes preprocessed,
    IPA, and UCLAPL versions to their respective output folders.
    """
    out_dir = os.path.join(os.pardir, 'NkoreKiga', 'Outputs')
    trans_dir = os.path.join(out_dir, 'Transcription')
    # The original repeated this pipeline verbatim for each corpus; the
    # file names all follow the same '{base}_<stage>.txt' pattern.
    for base in ('nk_roots', 'nk_forms', 'nk_forms_sep'):
        # Reading in files
        orth = uti.set_reader(
            os.path.join(out_dir, '{}_pretrans.txt'.format(base)))
        # Transcribing & Writing
        trans = transcribe(orth, palatalize=True)
        uti.write_iter(trans, os.path.join(
            trans_dir, 'Preprocessed', '{}_preprocessed.txt'.format(base)))
        ipa = ipa_trans(trans)
        uti.write_iter(ipa, os.path.join(
            trans_dir, 'IPA', '{}_ipa.txt'.format(base)))
        pl = uti.pl_trans(trans)
        uti.write_iter(pl, os.path.join(
            trans_dir, 'UCLAPL', '{}_pl.txt'.format(base)))
def main():
    """Build Aymara root/suffix resources from the de Lucca segmentation.

    Reads the segmented de Lucca file, writes the raw roots and suffixes,
    transcribes them (preprocessed, UCLAPL, IPA, and ejective variants), and
    finally stems the preprocessed Aymara word corpus using the transcribed
    roots and suffixes.
    """
    delucca_dir = os.path.join(os.pardir, 'Aymara', 'Inputs', 'delucca')
    roots, suffixes = delucca_reader(
        os.path.join(delucca_dir, 'ay_delucca_segmented.txt'))
    uti.write_iter(roots, os.path.join(delucca_dir, 'ay_roots_delucca.txt'))
    uti.write_iter(suffixes,
                   os.path.join(delucca_dir, 'ay_suffixes_delucca.txt'))
    lowering_table = ay_trans.make_lowering_table()
    roots_trans = ay_trans.transcribe(roots, lowering=lowering_table)
    roots_pl = ay_trans.pl_trans(roots_trans)
    roots_ipa = ay_trans.ipa_trans(roots_trans)
    roots_trans_ej = {ay_trans.nth_transcribe(word, ay.ejectives)
                      for word in roots_trans}
    roots_ej_pl = ay_trans.pl_trans(roots_trans_ej)
    suffixes_trans = ay_trans.transcribe(suffixes, lowering=lowering_table)
    uti.write_iter(roots_trans, os.path.join(
        delucca_dir, 'ay_roots_delucca_preprocessed.txt'))
    uti.write_iter(roots_pl,
                   os.path.join(delucca_dir, 'ay_roots_delucca_pl.txt'))
    uti.write_iter(roots_ipa,
                   os.path.join(delucca_dir, 'ay_roots_delucca_ipa.txt'))
    uti.write_iter(suffixes_trans, os.path.join(
        delucca_dir, 'ay_suffixes_delucca_preprocessed.txt'))
    uti.write_iter(roots_trans_ej, os.path.join(
        delucca_dir, 'ay_roots_delucca_preprocessed_ejectives.txt'))
    uti.write_iter(roots_ej_pl, os.path.join(
        delucca_dir, 'ay_roots_delucca_pl_ejectives.txt'))
    ay_words = uti.set_reader(os.path.join(
        os.pardir, 'Aymara', 'Outputs', 'Transcription',
        'aymara_preprocessed.txt'))
    subcorpus = rid_of_starters(ay_words, roots_trans)
    # Distinct name: these are roots recovered from word forms, not the
    # de Lucca roots read above (the original reused the name `roots`).
    stemmed_roots = set_stemmer(subcorpus, suffixes_trans, ay)
    uti.write_iter(stemmed_roots, os.path.join(
        os.pardir, 'Aymara', 'Outputs', 'Transcription',
        'aymara_roots_from_wordforms_preprocessed.txt'))
def main():
    """Transcribe the Aymara word list and write every derived version.

    Produces the preprocessed, IPA, UCLAPL, ejective-marked, and
    ejective-marked UCLAPL transcriptions of the Aymara corpus.
    """
    trans_dir = os.path.join(os.pardir, "Aymara", "Outputs", "Transcription")

    def save(items, filename):
        # All outputs land in the Transcription folder.
        uti.write_iter(items, os.path.join(trans_dir, filename))

    orthographic = uti.set_reader(os.path.join(
        os.pardir, "Aymara", "Outputs", "Aymara_words_no_sp_en.txt"))
    transcribed = transcribe(orthographic, lowering=make_lowering_table())
    save(transcribed, "aymara_preprocessed.txt")
    save(ipa_trans(transcribed), "aymara_ipa.txt")
    save(pl_trans(transcribed), "aymara_pl.txt")
    ejectivized = {nth_transcribe(w, ay.ejectives) for w in transcribed}
    save(ejectivized, "aymara_preprocessed_ejectives.txt")
    save(pl_trans(ejectivized), "aymara_pl_ejectives.txt")
def main():
    """Count trigrams and sibilant co-occurrences in Nkore-Kiga corpora.

    For the roots, forms, and parsed-forms corpora (read with the spaces
    between segments stripped), writes raw trigram counts plus
    sibilant-sibilant pair counts computed on the sibilant tier and on the
    sibilant-fricative tier.
    """
    preproc_dir = os.path.join(os.pardir, 'NkoreKiga', 'Outputs',
                               'Transcription', 'Preprocessed')
    counts_dir = os.path.join(os.pardir, 'NkoreKiga', 'Outputs', 'Counts',
                              'Raw')

    def read_corpus(filename):
        # Corpora are stored with spaces between segments; remove them.
        return {word.replace(' ', '')
                for word in uti.set_reader(
                    os.path.join(preproc_dir, filename))}

    def format_skipgram(counts):
        # Spell out the wildcard between the two sibilants of each key.
        return {key[0] + ' anything ' + key[1]: count
                for key, count in counts.items()}

    sib_sib = {sib1 + sib2 for sib1 in nk.sibilants for sib2 in nk.sibilants}
    corpora = [('roots', read_corpus('nk_roots_preprocessed.txt')),
               ('forms', read_corpus('nk_forms_preprocessed.txt')),
               ('formssep', read_corpus('nk_forms_sep_preprocessed.txt'))]
    for corp_name, corpus in corpora:
        # All trigrams
        trigram_counts = cou.trigram_counter(corpus)
        uti.write_dict(trigram_counts, os.path.join(
            counts_dir, 'nk_counts_tri_{}.txt'.format(corp_name)))
        # Sibilant tier (the match set returned alongside the counts was
        # never used in the original; it is discarded explicitly here).
        _, sib_counts = cou.count_many_substr(sib_sib, corpus, nk,
                                              tier=nk.sibilants,
                                              return_set=True)
        uti.write_dict(format_skipgram(sib_counts), os.path.join(
            counts_dir, 'nk_counts_sib_tier_{}.txt'.format(corp_name)))
        # Sibilant-fricative tier
        _, fric_counts = cou.count_many_substr(sib_sib, corpus, nk,
                                               tier=nk.sibilant_fricatives,
                                               return_set=True)
        uti.write_dict(format_skipgram(fric_counts), os.path.join(
            counts_dir, 'nk_counts_sib_fric_tier_{}.txt'.format(corp_name)))
def main(): parsed_words = { word.replace(' ', '') for word in uti.set_reader( os.path.join(*(os.pardir, 'Aymara', 'Inputs', 'ay_words_parsed.txt'))) } trigrams_parsed = cou.trigram_counter(parsed_words) uti.write_dict( trigrams_parsed, os.path.join(*(os.pardir, 'Aymara', 'Outputs', 'Counts', 'Raw', 'aymara_counts_seg_all_trigrams_parsed.txt'))) """ words = uti.set_reader(os.path.join(*( os.pardir, 'Aymara', 'Outputs', 'Transcription', 'aymara_preprocessed.txt'))) roots = uti.set_reader(os.path.join(*( os.pardir, 'Aymara', 'Inputs', 'delucca', 'ay_roots_delucca_preprocessed.txt'))) ay_corpora = [words, roots] ay_corpus_names = ['words', 'roots'] # Aymara sounds: Defining certain ngrams precon_asp = {aspirate + consonant for aspirate in ay.aspirates for consonant in ay.consonants} precon_ej = {ejective + consonant for ejective in ay.ejectives for consonant in ay.consonants} precon_plain = {plain + consonant for plain in ay.plain_stops for consonant in ay.consonants} precon_stops = precon_asp.union(precon_ej, precon_plain) prevoc_asp = {aspirate + vowel for aspirate in ay.aspirates for vowel in ay.vowels} prevoc_ej = {ejective + vowel for ejective in ay.ejectives for vowel in ay.vowels} prevoc_plain = {plain + vowel for plain in ay.plain_stops for vowel in ay.vowels} prevoc_stops = prevoc_asp.union(prevoc_ej, prevoc_plain) cc = {c1 + c2 for c1 in ay.consonants for c2 in ay.consonants} cvc = {c1 + v + c2 for c1 in ay.consonants for v in ay.vowels for c2 in ay.consonants} cvcv = {c1 + v + c2 + v2 for c1 in ay.consonants for v in ay.vowels for c2 in ay.consonants for v2 in ay.vowels} stop_v_stop = {stop1 + vowel + stop2 for stop1 in ay.stops for vowel in ay.vowels for stop2 in ay.stops} asp_v_asp = {a1 + v + a2 for a1 in ay.aspirates for v in ay.vowels for a2 in ay.aspirates} asp_v_ej = {a + v + e for a in ay.aspirates for v in ay.vowels for e in ay.ejectives} asp_v_plain = {a + v + p for a in ay.aspirates for v in ay.vowels for p in ay.plain_stops} ej_v_asp = {e + v + 
a for e in ay.ejectives for v in ay.vowels for a in ay.aspirates} ej_v_ej = {e1 + v + e2 for e1 in ay.ejectives for v in ay.vowels for e2 in ay.ejectives} ej_v_ej_het = {e1 + v + e2 for e1 in ay.ejectives for v in ay.vowels for e2 in ay.ejectives if e1 != e2} ej_v_plain = {e + v + p for e in ay.ejectives for v in ay.vowels for p in ay.plain_stops} plain_v_asp = {p + v + a for p in ay.plain_stops for v in ay.vowels for a in ay.aspirates} plain_v_ej = {p + v + e for p in ay.plain_stops for v in ay.vowels for e in ay.ejectives} plain_v_plain = {p1 + v + p2 for p1 in ay.plain_stops for v in ay.vowels for p2 in ay.plain_stops} ee_het = {e1 + e2 for e1 in ay.ejectives for e2 in ay.ejectives if e1 != e2} # Counting for i, corpus in enumerate(ay_corpora): corpus_name = ay_corpus_names[i] """ """ ## Stops unigram_counts = cou.count_many_substr(ay.sounds, corpus, ay) unigram_counts['stops'] = sum(unigram_counts[key] for key in ay.stops) unigram_counts['plain_stops'] = sum(unigram_counts[key] for key in ay.plain_stops) unigram_counts['aspirates'] = sum(unigram_counts[key] for key in ay.aspirates) unigram_counts['ejectives'] = sum(unigram_counts[key] for key in ay.ejectives) unigram_counts['consonants'] = sum(unigram_counts[key] for key in ay.consonants) unigram_counts['vowels'] = sum(unigram_counts[key] for key in ay.vowels) unigram_counts['total'] = sum(unigram_counts.values()) uti.write_dict(unigram_counts, os.path.join(*( os.pardir, 'Aymara', 'Outputs', 'Counts', 'Raw', 'aymara_counts_seg_all_unigram_{}.txt'.format(corpus_name)))) """ """ ## Preceding environments for stops precon_stop_counts = cou.count_many_substr(precon_stops, corpus, ay) prevoc_stop_counts = cou.count_many_substr(prevoc_stops, corpus, ay) stop_bigrams = { 'preconsonantal stops': sum(precon_stop_counts.values()), 'prevocalic stops': sum(prevoc_stop_counts.values()), 'total non-final stops': sum(precon_stop_counts.values()) + sum(prevoc_stop_counts.values()) } stop_bigrams['preconsonantal aspirates'] \ = 
sum(precon_stop_counts[key] for key in precon_asp) stop_bigrams['preconsonantal ejectives'] \ = sum(precon_stop_counts[key] for key in precon_ej) stop_bigrams['preconsonantal plain stops'] \ = sum(precon_stop_counts[key] for key in precon_plain) stop_bigrams['prevocalic aspirates'] \ = sum(prevoc_stop_counts[key] for key in prevoc_asp) stop_bigrams['prevocalic ejectives'] \ = sum(prevoc_stop_counts[key] for key in prevoc_ej) stop_bigrams['prevocalic plain stops'] \ = sum(prevoc_stop_counts[key] for key in prevoc_plain) uti.write_dict(stop_bigrams, os.path.join(*( os.pardir, 'Aymara', 'Outputs', 'Counts', 'Raw', 'aymara_counts_seg_all_env_{}.txt'.format(corpus_name)))) """ """ ## Prevocalic stops initially or not prevoc_stop_initial_counts = cou.count_many_substr(prevoc_stops, corpus, ay, initial=True) prevoc_counts_wordpos = { 'initial prevocalic stops': sum(prevoc_stop_initial_counts.values()), 'medial prevocalic stops': sum(prevoc_stop_counts.values()) - sum(prevoc_stop_initial_counts.values()), 'total prevocalic stops': sum(prevoc_stop_counts.values()), 'initial prevocalic aspirates': sum(prevoc_stop_initial_counts[key] for key in prevoc_asp), 'medial prevocialic aspirates': sum(prevoc_stop_counts[key] for key in prevoc_asp) - sum(prevoc_stop_initial_counts[key] for key in prevoc_asp), 'total prevocalic aspirates': sum(prevoc_stop_counts[key] for key in prevoc_asp), 'initial prevocalic ejectives': sum(prevoc_stop_initial_counts[key] for key in prevoc_ej), 'medial prevocalic ejectives': sum(prevoc_stop_counts[key] for key in prevoc_ej) - sum(prevoc_stop_initial_counts[key] for key in prevoc_ej), 'total prevocalic ejectives': sum(prevoc_stop_counts[key] for key in prevoc_ej), 'initial prevocalic plain stops': sum(prevoc_stop_initial_counts[key] for key in prevoc_plain), 'medial prevocalic plain stops': sum(prevoc_stop_counts[key] for key in prevoc_plain) - sum(prevoc_stop_initial_counts[key] for key in prevoc_plain), 'total prevocalic plain stops': 
sum(prevoc_stop_counts[key] for key in prevoc_plain) } uti.write_dict(prevoc_counts_wordpos, os.path.join(*( os.pardir, 'Aymara', 'Outputs', 'Counts', 'Raw', 'aymara_counts_seg_prevoc_wordpos_{}.txt'.format(corpus_name)))) """ """