# Shared imports for the snippets below. The import paths follow PolyglotDB's
# public API; the original files' import sections are not shown, so these are
# assumed.
import os
import time

from polyglotdb import CorpusContext
from polyglotdb.io.enrichment import enrich_lexicon_from_csv


def test_lexicon_enrichment(timed_config, csv_test_dir):
    path = os.path.join(csv_test_dir, 'timed_enrichment.txt')
    with CorpusContext(timed_config) as c:
        enrich_lexicon_from_csv(c, path)

        q = c.query_graph(c.word).filter(c.word.neighborhood_density < 10)
        q = q.columns(c.word.label.column_name('label'))
        res = q.all()
        assert all(x['label'] == 'guess' for x in res)

        q = c.query_graph(c.word).filter(c.word.label == 'i')
        res = q.all()
        assert res[0]['frequency'] == 150
        assert res[0]['part_of_speech'] == 'PRP'
        assert res[0]['neighborhood_density'] == 17

        q = c.query_graph(c.word).filter(c.word.label == 'cute')
        res = q.all()
        assert res[0]['frequency'] is None
        assert res[0]['part_of_speech'] == 'JJ'
        assert res[0]['neighborhood_density'] == 14

        levels = c.lexicon.get_property_levels('part_of_speech')
        assert set(levels) == {'NN', 'VB', 'JJ', 'IN', 'PRP'}
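# A minimal sketch of what the 'timed_enrichment.txt' fixture could contain,
# inferred from the assertions above. The header names, the delimiter, and the
# row for 'guess' are assumptions: enrich_lexicon_from_csv keys each row on the
# word label in the first column, an empty cell comes back as None (as for the
# frequency of 'cute'), and every word with neighborhood_density < 10 must be
# labeled 'guess' for the first assertion to hold.
SAMPLE_ENRICHMENT_FILE = """\
label,frequency,part_of_speech,neighborhood_density
i,150,PRP,17
cute,,JJ,14
guess,80,VB,9
"""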
def lexicon_enrichment(config, unisyn_spade_directory, dialect_code):
    enrichment_dir = os.path.join(unisyn_spade_directory, 'enrichment_files')
    if not os.path.exists(enrichment_dir):
        print('Could not find enrichment_files directory from {}, skipping lexical enrichment.'.format(
            unisyn_spade_directory))
        return
    with CorpusContext(config) as g:
        for lf in os.listdir(enrichment_dir):
            path = os.path.join(enrichment_dir, lf)
            if lf == 'rule_applications.csv':
                # Dialect-independent enrichment; skip if already loaded.
                if g.hierarchy.has_type_property('word', 'UnisynPrimStressedVowel1'.lower()):
                    print('Dialect independent enrichment already loaded, skipping.')
                    continue
            elif lf.startswith(dialect_code):
                # Dialect-specific enrichment; skip if already loaded.
                if g.hierarchy.has_type_property('word', 'UnisynPrimStressedVowel2_{}'.format(
                        dialect_code).lower()):
                    print('Dialect specific enrichment already loaded, skipping.')
                    continue
            else:
                # Ignore files that are neither dialect-independent nor for this dialect.
                continue
            begin = time.time()
            enrich_lexicon_from_csv(g, path)
            time_taken = time.time() - begin
            print('Lexicon enrichment took: {}'.format(time_taken))
            save_performance_benchmark(config, 'lexicon_enrichment', time_taken)
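# Sketch of the directory layout the function above expects. Only
# 'enrichment_files' and 'rule_applications.csv' appear in the code; the
# dialect-prefixed file name below is an illustrative assumption:
#
#   <unisyn_spade_directory>/
#       enrichment_files/
#           rule_applications.csv   # dialect-independent (UnisynPrimStressedVowel1)
#           <dialect_code>_*.csv    # dialect-specific (UnisynPrimStressedVowel2_<dialect_code>)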
def run_query(self):
    config = self.kwargs['config']
    case_sensitive = self.kwargs['case_sensitive']
    path = self.kwargs['path']
    stop_check = self.kwargs['stop_check']
    call_back = self.kwargs['call_back']
    call_back('Enriching lexicon...')
    call_back(0, 0)
    with CorpusContext(config) as c:
        enrich_lexicon_from_csv(c, path)
        if stop_check():
            call_back('Resetting lexicon...')
            call_back(0, 0)
            c.reset_lexicon()
            return False
    return True
def run_query(self):
    print("in the lexical worker")
    config = self.kwargs['config']
    case_sensitive = self.kwargs['case_sensitive']
    path = self.kwargs['path']
    stop_check = self.kwargs['stop_check']
    call_back = self.kwargs['call_back']
    call_back('Enriching lexicon...')
    call_back(0, 0)
    with CorpusContext(config) as c:
        enrich_lexicon_from_csv(c, path)
        self.actionCompleted.emit('enriching lexicon')
        if stop_check():
            call_back('Resetting lexicon...')
            call_back(0, 0)
            c.reset_lexicon()
            return False
    return True
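# A sketch of how a worker like the two above might be driven. The class name
# is hypothetical; only the kwargs contract (config, case_sensitive, path,
# stop_check, call_back) comes from the run_query implementations:
#
#   worker = LexicalEnrichmentWorker()          # hypothetical class
#   worker.kwargs = {
#       'config': config,
#       'case_sensitive': False,
#       'path': '/path/to/enrichment.csv',
#       'stop_check': lambda: False,            # never request cancellation
#       'call_back': lambda *args: None,        # ignore progress messages
#   }
#   worker.run_query()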
def duration_export(config, corpus_name, corpus_directory, dialect_code,
                    speakers, vowels, stressed_vowels=None, baseline=False,
                    ignored_speakers=None):
    # 'base_dir' and 'common' are assumed to be defined at module level in the
    # original script.
    csv_path = os.path.join(base_dir, corpus_name, '{}_duration.csv'.format(corpus_name))
    with CorpusContext(config) as c:
        if corpus_name == 'spade-Buckeye':
            print("Processing {}".format(corpus_name))
            if not c.hierarchy.has_type_property('word', "ContainsVowelObstruent"):
                print('Classifying Buckeye vowel-obstruent pairs')
                enrich_lexicon_from_csv(
                    c, os.path.join(corpus_directory,
                                    "corpus-data/enrichment/buckeye_obstruents.csv"))
        print("Beginning duration export")
        beg = time.time()
        consonants = ['p', 'P', 't', 'T', 'k', 'K', 'b', 'B', 'd', 'D', 'g', 'G',
                      'F', 'f', 'V', 'v', 'N', 'n', 'm', 'M', 'NG', 'TH', 'DH',
                      'l', 'L', 'ZH', 'x', 'X', 'r', 'R', 's', 'S', 'sh', 'SH',
                      'z', 'Z', 'zh', 'ZH', 'J', 'C', 'tS', 'dZ', 'tq']
        if stressed_vowels:
            q = c.query_graph(c.phone).filter(c.phone.label.in_(stressed_vowels))
            q = q.filter(c.phone.following.end == c.phone.syllable.end)
            q = q.filter(c.phone.following.end == c.phone.syllable.word.utterance.end)
            q = q.filter(c.phone.following.label.in_(consonants))
            q = q.filter(c.phone.syllable.word.num_syllables == 1)
        else:
            q = c.query_graph(c.phone).filter(c.phone.label.in_(vowels))
            q = q.filter(c.phone.following.end == c.phone.syllable.end)
            q = q.filter(c.phone.following.end == c.phone.syllable.word.utterance.end)
            q = q.filter(c.phone.following.label.in_(consonants))
            q = q.filter(c.phone.word.stresspattern == "1")
            q = q.filter(c.phone.syllable.stress == "1")
        print(c.hierarchy)
        if c.hierarchy.has_type_property('word', 'containsvowelobstruent'):
            q = q.filter(c.phone.word.containsvowelobstruent == True)
        if speakers:
            q = q.filter(c.phone.speaker.name.in_(speakers))
        if ignored_speakers:
            q = q.filter(c.phone.speaker.name.not_in_(ignored_speakers))
        print("Applied filters")
        q = q.columns(c.phone.label.column_name('phone_label'),
                      c.phone.begin.column_name('phone_begin'),
                      c.phone.end.column_name('phone_end'),
                      c.phone.duration.column_name('phone_duration'),
                      c.phone.previous.label.column_name('previous_phone'),
                      c.phone.following.label.column_name('following_phone'),
                      c.phone.following.duration.column_name('following_duration'),
                      c.phone.word.unisynprimstressedvowel1.column_name('word_unisyn'),
                      c.phone.word.label.column_name('word_label'),
                      c.phone.word.begin.column_name('word_begin'),
                      c.phone.word.end.column_name('word_end'),
                      c.phone.word.duration.column_name('word_duration'),
                      c.phone.syllable.label.column_name('syllable_label'),
                      c.phone.syllable.duration.column_name('syllable_duration'),
                      c.phone.word.stresspattern.column_name('word_stresspattern'),
                      c.phone.syllable.stress.column_name('syllable_stress'),
                      c.phone.utterance.speech_rate.column_name('speech_rate'),
                      c.phone.utterance.id.column_name('utterance_label'),
                      c.phone.speaker.name.column_name('speaker_name'),
                      c.phone.syllable.end.column_name('syllable_end'),
                      c.phone.utterance.end.column_name('utterance_end'))
        for sp, _ in c.hierarchy.speaker_properties:
            if sp == 'name':
                continue
            q = q.columns(getattr(c.phone.speaker, sp).column_name(sp))
        if c.hierarchy.has_token_property('word', 'surface_transcription'):
            print('getting underlying and surface transcriptions')
            q = q.columns(
                c.phone.word.transcription.column_name('word_underlying_transcription'),
                c.phone.word.surface_transcription.column_name('word_surface_transcription'))
        if c.hierarchy.has_type_property('word', 'containsvowelobstruent'):
            q = q.columns(
                c.phone.word.containsvowelobstruent.column_name('word_containsvowelobstruent'))
        # Get baseline duration. For most corpora this should be done over
        # words; as Buckeye has a many-to-one correspondence between
        # transcriptions and words, Buckeye should have duration calculated
        # over its underlying transcription.
        if baseline:
            if not c.hierarchy.has_type_property('word', 'baseline'):
                print('getting baseline from word')
                c.encode_baseline('word', 'duration')
            q = q.columns(
                c.phone.word.baseline_duration.column_name('word_baseline_duration'))
        print("Writing CSV")
        q.to_csv(csv_path)
        time_taken = time.time() - beg
        print('Query took: {}'.format(time_taken))
        print("Results for query written to " + csv_path)
        common.save_performance_benchmark(config, 'duration_export', time_taken)
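# Illustrative invocation of duration_export. Every value below is a
# placeholder, not from the source, and 'base_dir' must be defined at module
# level first:
#
#   base_dir = '/results'
#   duration_export(config, 'spade-Buckeye', '/data/spade-Buckeye',
#                   dialect_code='gam', speakers=None,
#                   vowels=['aa', 'ae', 'eh', 'iy'], baseline=True)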