def test_two_format_parsing(mfa_test_dir, graph_db):
    # for file in os.listdir(os.path.abspath(mfa_test_dir)):
    #     if file.endswith("yes.TextGrid") or file.endswith("no.TextGrid"):
    #         path = os.path.join(mfa_test_dir, file)
    #         parser = MfaParser("a", "b")
    #         curTg = TextGrid()
    #         curTg.read(path)
    #         value = parser._is_valid(curTg)
    #         if file.endswith("yes.TextGrid"):
    #             assert True
    #         elif file.endswith("no.TextGrid"):
    #             assert False
    valid_dir = os.path.join(mfa_test_dir, "valid")
    invalid_dir = os.path.join(mfa_test_dir, "invalid")

    # Check that valids load
    with CorpusContext('mfa_valid', **graph_db) as c:
        c.reset()
        parser = inspect_mfa(valid_dir)
        c.load(parser, valid_dir)

    # Check that invalids don't
    with CorpusContext('mfa_invalid', **graph_db) as c:
        c.reset()
        parser = inspect_mfa(invalid_dir)
        with pytest.raises(ParseError):
            c.load(parser, invalid_dir)

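# The commented-out block above never tested anything: `assert True` is a
# no-op and `assert False` would always fail. A sketch of what it presumably
# intended, checking MfaParser._is_valid (a private method) against the
# "yes.TextGrid"/"no.TextGrid" naming convention; the MfaParser("a", "b")
# placeholder arguments are kept from the original comment.
def test_is_valid_per_file(mfa_test_dir):
    parser = MfaParser("a", "b")
    for file in os.listdir(os.path.abspath(mfa_test_dir)):
        path = os.path.join(mfa_test_dir, file)
        if file.endswith("yes.TextGrid"):
            tg = TextGrid()
            tg.read(path)
            assert parser._is_valid(tg)
        elif file.endswith("no.TextGrid"):
            tg = TextGrid()
            tg.read(path)
            assert not parser._is_valid(tg)
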
def test_mismatch_parser(timit_test_dir, graph_db):
    with CorpusContext('test_mismatch', **graph_db) as c:
        c.reset()
        parser = inspect_mfa(timit_test_dir)
        with pytest.raises(ParseError):
            c.load(parser, timit_test_dir)

def run_query(self):
    time.sleep(0.1)
    name = self.kwargs['name']
    directory = self.kwargs['directory']
    reset = True
    config = CorpusConfig(name, graph_host='localhost', graph_port=7474)
    with CorpusContext(config) as c:
        if name == 'buckeye':
            parser = inspect_buckeye(directory)
        elif name == 'timit':
            parser = inspect_timit(directory)
        else:
            form = guess_textgrid_format(directory)
            if form == 'labbcat':
                parser = inspect_labbcat(directory)
            elif form == 'mfa':
                parser = inspect_mfa(directory)
            elif form == 'fave':
                parser = inspect_fave(directory)
            else:
                parser = inspect_textgrid(directory)
        parser.call_back = self.kwargs['call_back']
        parser.stop_check = self.kwargs['stop_check']
        parser.call_back('Resetting corpus...')
        if reset:
            c.reset(call_back=self.kwargs['call_back'],
                    stop_check=self.kwargs['stop_check'])
        could_not_parse = c.load(parser, directory)
        return could_not_parse

def loading(config, corpus_dir, textgrid_format):
    with CorpusContext(config) as c:
        exists = c.exists()
    if exists:
        print('Corpus already loaded, skipping import.')
        return
    if not os.path.exists(corpus_dir):
        print('The path {} does not exist.'.format(corpus_dir))
        sys.exit(1)
    with CorpusContext(config) as c:
        print('loading')
        if textgrid_format == "buckeye":
            parser = pgio.inspect_buckeye(corpus_dir)
        elif textgrid_format == "csv":
            # the csv branch reuses the Buckeye parser in this script
            parser = pgio.inspect_buckeye(corpus_dir)
        elif textgrid_format.lower() == "fave":
            parser = pgio.inspect_fave(corpus_dir)
        elif textgrid_format == "ilg":
            parser = pgio.inspect_ilg(corpus_dir)
        elif textgrid_format == "labbcat":
            parser = pgio.inspect_labbcat(corpus_dir)
        elif textgrid_format == "partitur":
            parser = pgio.inspect_partitur(corpus_dir)
        elif textgrid_format == "timit":
            parser = pgio.inspect_timit(corpus_dir)
        else:
            parser = pgio.inspect_mfa(corpus_dir)
        c.load(parser, corpus_dir)

def loading(config, corpus_dir, textgrid_format):
    with CorpusContext(config) as c:
        exists = c.exists()
    if exists:
        print('Corpus already loaded, skipping import.')
        return
    if not os.path.exists(corpus_dir):
        print('The path {} does not exist.'.format(corpus_dir))
        sys.exit(1)
    with CorpusContext(config) as c:
        print('loading')
        if textgrid_format == "buckeye":
            parser = pgio.inspect_buckeye(corpus_dir)
        elif textgrid_format == "csv":
            # the csv branch reuses the Buckeye parser in this script
            parser = pgio.inspect_buckeye(corpus_dir)
        elif textgrid_format.lower() == "fave":
            parser = pgio.inspect_fave(corpus_dir)
        elif textgrid_format == "ilg":
            parser = pgio.inspect_ilg(corpus_dir)
        elif textgrid_format == "labbcat":
            parser = pgio.inspect_labbcat(corpus_dir)
        elif textgrid_format == "partitur":
            parser = pgio.inspect_partitur(corpus_dir)
        elif textgrid_format == "timit":
            parser = pgio.inspect_timit(corpus_dir)
        else:
            parser = pgio.inspect_mfa(corpus_dir)
        parser.call_back = call_back
        beg = time.time()
        c.load(parser, corpus_dir)
        end = time.time()
        time_taken = end - beg
        print('Loading took: {}'.format(time_taken))
        save_performance_benchmark(config, 'import', time_taken)

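# Both loading() variants above call save_performance_benchmark(), which is
# defined elsewhere in the benchmark scripts. A minimal sketch of what such a
# helper could look like, assuming it simply appends one row per measurement;
# the file name and column layout here are hypothetical, not the real helper.
import csv
import os


def save_performance_benchmark(config, task, time_taken, path='benchmarks.csv'):
    """Append a (corpus, task, seconds) row to a CSV of benchmark timings."""
    is_new = not os.path.exists(path)
    with open(path, 'a', newline='') as f:
        writer = csv.writer(f)
        if is_new:
            writer.writerow(['corpus', 'task', 'seconds'])
        # assumes CorpusConfig exposes the corpus name as corpus_name
        writer.writerow([config.corpus_name, task, time_taken])
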
def run_query(self):
    time.sleep(0.1)
    name = self.kwargs['name']
    directory = self.kwargs['directory']
    reset = True
    config = CorpusConfig(name, graph_host='localhost', graph_port=7474)
    with CorpusContext(config) as c:
        if name == 'buckeye':
            parser = inspect_buckeye(directory)
        elif name == 'timit':
            parser = inspect_timit(directory)
        elif name == 'partitur':
            parser = inspect_partitur(directory)
        else:
            form = guess_textgrid_format(directory)
            if form == 'labbcat':
                parser = inspect_labbcat(directory)
            elif form == 'mfa':
                parser = inspect_mfa(directory)
            elif form == 'fave':
                parser = inspect_fave(directory)
            else:
                parser = inspect_textgrid(directory)
        parser.call_back = self.kwargs['call_back']
        parser.stop_check = self.kwargs['stop_check']
        parser.call_back('Resetting corpus...')
        if reset:
            c.reset(call_back=self.kwargs['call_back'],
                    stop_check=self.kwargs['stop_check'])
        could_not_parse = c.load(parser, directory)
        self.actionCompleted.emit('importing corpus')
        return could_not_parse

def test_load_mfa(mfa_test_dir, graph_db):
    with CorpusContext('test_mfa', **graph_db) as c:
        c.reset()
        parser = inspect_mfa(mfa_test_dir)
        c.load(parser, mfa_test_dir)

        q = c.query_graph(c.word).filter(c.word.label == 'JURASSIC')
        q = q.filter(c.word.speaker.name == 'mfa')
        q = q.order_by(c.word.begin)
        q = q.columns(c.word.label)
        results = q.all()
        assert len(results) == 1

        c.encode_pauses('<SIL>')
        c.encode_utterances(min_pause_length=0)

        q = c.query_graph(c.word).filter(c.word.label == 'PLANET')
        q = q.filter(c.word.speaker.name == 'mfa')
        q = q.order_by(c.word.begin)
        q = q.columns(c.word.label, c.word.following.label.column_name('following'))
        results = q.all()
        assert len(results) == 1
        assert results[0]['following'] == 'JURASSIC'

        s = c.census['mfa']
        assert len(s.discourses) == 1
        assert [x.discourse.name for x in s.discourses] == ['mfa_test']

def stressed_config(graph_db, textgrid_test_dir):
    config = CorpusConfig('stressed', **graph_db)
    stressed_path = os.path.join(textgrid_test_dir, 'stressed_corpus.TextGrid')
    with CorpusContext(config) as c:
        c.reset()
        parser = inspect_mfa(stressed_path)
        c.load(parser, stressed_path)
    return config

def loading(config, corpus_dir, textgrid_format):
    """Load the corpus"""
    # First check whether a database for the corpus has already been created
    with CorpusContext(config) as c:
        exists = c.exists()
    if exists:
        print('Corpus already loaded, skipping import.')
        return
    if not os.path.exists(corpus_dir):
        print('The path {} does not exist.'.format(corpus_dir))
        sys.exit(1)

    # If there is no database yet, import the corpus
    textgrid_format = textgrid_format.upper()
    with CorpusContext(config) as c:
        print('loading')
        # Use the appropriate parser for the corpus format
        if textgrid_format in ["BUCKEYE", "B"]:
            parser = pgio.inspect_buckeye(corpus_dir)
        elif textgrid_format == "CSV":
            # the csv branch reuses the Buckeye parser in this script
            parser = pgio.inspect_buckeye(corpus_dir)
        elif textgrid_format in ["FAVE", "F"]:
            parser = pgio.inspect_fave(corpus_dir)
        elif textgrid_format == "ILG":
            parser = pgio.inspect_ilg(corpus_dir)
        elif textgrid_format in ["LABBCAT", "L"]:
            parser = pgio.inspect_labbcat(corpus_dir)
        elif textgrid_format in ["P", "PARTITUR"]:
            parser = pgio.inspect_partitur(corpus_dir)
        elif textgrid_format in ["MAUS", "W"]:
            parser = pgio.inspect_maus(corpus_dir)
        elif textgrid_format in ["TIMIT", "T"]:
            parser = pgio.inspect_timit(corpus_dir)
        else:
            parser = pgio.inspect_mfa(corpus_dir)
        parser.call_back = call_back
        beg = time.time()
        c.load(parser, corpus_dir)
        end = time.time()
        time_taken = end - beg
        print('Loading took: {}'.format(time_taken))
        save_performance_benchmark(config, 'import', time_taken)

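# The format dispatch in loading() above could also be written as a lookup
# table, which keeps the format-to-parser mapping in one place. A sketch of
# that alternative (same parsers and same MFA fallback as above; this is an
# illustrative rewrite, not how the original scripts are structured):
PARSER_INSPECTORS = {
    'BUCKEYE': pgio.inspect_buckeye, 'B': pgio.inspect_buckeye,
    'FAVE': pgio.inspect_fave, 'F': pgio.inspect_fave,
    'ILG': pgio.inspect_ilg,
    'LABBCAT': pgio.inspect_labbcat, 'L': pgio.inspect_labbcat,
    'PARTITUR': pgio.inspect_partitur, 'P': pgio.inspect_partitur,
    'MAUS': pgio.inspect_maus, 'W': pgio.inspect_maus,
    'TIMIT': pgio.inspect_timit, 'T': pgio.inspect_timit,
}


def get_parser(textgrid_format, corpus_dir):
    # Unknown formats fall through to the MFA parser, as in loading()
    inspect = PARSER_INSPECTORS.get(textgrid_format.upper(), pgio.inspect_mfa)
    return inspect(corpus_dir)
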
def import_corpus_run_query(data, path):
    # `graph_db`, `call_back`, and `times` are module-level names in the
    # benchmark script this function comes from
    with CorpusContext(data, **graph_db) as c:
        c.reset()
        beg = time.time()
        if data == 'buckeyebenchmark':
            parser = inspect_buckeye(path)
        elif data == 'timitbenchmark':
            parser = inspect_timit(path)
        else:
            parser = inspect_mfa(path)
        parser.call_back = call_back
        c.load(parser, path)
        end = time.time()
    avgtime = sum(times) / len(times)
    sd = statistics.stdev(times)
    return [(end - beg), avgtime, sd]

def overlapped_config(graph_db, textgrid_test_dir, acoustic_syllabics):
    config = CorpusConfig('overlapped', **graph_db)
    acoustic_path = os.path.join(textgrid_test_dir, 'overlapped_speech')
    with CorpusContext(config) as c:
        c.reset()
        parser = inspect_mfa(acoustic_path)
        c.load(parser, acoustic_path)
        c.encode_pauses(['sil'])
        c.encode_utterances(min_pause_length=0)
        c.encode_syllabic_segments(acoustic_syllabics)
        c.encode_syllables()
    config.pitch_algorithm = 'acousticsim'
    config.formant_source = 'acousticsim'
    return config

def test_load_mfa(mfa_test_dir, graph_db):
    with CorpusContext('test_mfa', **graph_db) as c:
        c.reset()
        test_file_path = os.path.join(mfa_test_dir, "mfa_test.TextGrid")
        parser = inspect_mfa(test_file_path)
        print(parser.speaker_parser)
        c.load(parser, test_file_path)
        assert c.hierarchy.has_type_property('word', 'transcription')

        q = c.query_graph(c.word).filter(c.word.label == 'JURASSIC')
        print(q)
        print(q.all())
        q = q.filter(c.word.speaker.name == 'mfa')
        # print(c.word.speaker.name)
        print(q.all())
        q = q.order_by(c.word.begin)
        print(q.all())
        q = q.columns(c.word.label)
        print(q.all())
        results = q.all()
        assert len(results) == 1

        c.encode_pauses('<SIL>')
        c.encode_utterances(min_pause_length=0)

        q = c.query_graph(c.word).filter(c.word.label == 'PLANET')
        q = q.filter(c.word.speaker.name == 'mfa')
        q = q.order_by(c.word.begin)
        q = q.columns(c.word.label, c.word.following.label.column_name('following'))
        results = q.all()
        assert len(results) == 1
        assert results[0]['following'] == 'JURASSIC'

        q = c.query_speakers().filter(c.speaker.name == 'mfa')
        q = q.columns(c.speaker.discourses.name.column_name('discourses'))
        s = q.get()
        assert len(s['discourses']) == 1
        assert s['discourses'] == ['mfa_test']

def test_load_discourse(graph_db, mfa_test_dir, textgrid_test_dir):
    test_file_path = os.path.join(mfa_test_dir, "mfa_test.TextGrid")
    acoustic_path = os.path.join(textgrid_test_dir, 'acoustic_corpus.TextGrid')
    mfa_parser = inspect_mfa(test_file_path)
    parser = inspect_textgrid(acoustic_path)
    with CorpusContext('load_remove_test', **graph_db) as c:
        c.reset()
        c.load_discourse(parser, acoustic_path)
        c.load_discourse(mfa_parser, test_file_path)

        syllabics = ['ER', 'AE', 'IH', 'EH', 'ae', 'ih', 'er', 'eh']
        c.encode_syllabic_segments(syllabics)
        c.encode_syllables()

        q = c.query_graph(c.word).filter(c.word.label == 'JURASSIC')
        assert q.count() > 0

        q = c.query_graph(c.phone).filter(c.phone.label == 'AE')
        assert q.count() > 0

        q = c.query_lexicon(c.syllable).filter(c.syllable.label == 'JH.ER')
        assert q.count() > 0

        q = c.query_lexicon(c.lexicon_word).filter(c.lexicon_word.label == 'JURASSIC')
        assert q.count() > 0

        q = c.query_lexicon(c.lexicon_phone).filter(c.lexicon_phone.label == 'AE')
        assert q.count() > 0

        q = c.query_lexicon(c.lexicon_phone).filter(c.lexicon_phone.label == 'ae')
        assert q.count() > 0

        q = c.query_lexicon(c.lexicon_syllable).filter(c.lexicon_syllable.label == 'JH.ER')
        assert q.count() > 0

        q = c.query_discourses().filter(c.discourse.name == 'mfa_test')
        assert q.count() > 0

        q = c.query_speakers().filter(c.speaker.name == 'mfa')
        assert q.count() > 0

        d = c.discourse_sound_file('acoustic_corpus')
        assert os.path.exists(d['consonant_file_path'])

from polyglotdb import CorpusContext
import polyglotdb.io as pgio

# change this path to where you put the pg_tutorial directory after
# downloading and unzipping it from the tutorial site
corpus_root = '/mnt/e/Data/pg_tutorial'

parser = pgio.inspect_mfa(corpus_root)
# for verbose output during corpus import:
parser.call_back = print

with CorpusContext('pg_tutorial') as c:
    c.load(parser, corpus_root)

# Simple queries
# uncomment the following to carry out the "Testing some simple queries" part:
with CorpusContext('pg_tutorial') as c:
    print('Speakers:', c.speakers)
    print('Discourses:', c.discourses)

    q = c.query_lexicon(c.lexicon_phone)
    q = q.order_by(c.lexicon_phone.label)
    q = q.columns(c.lexicon_phone.label.column_name('phone'))
    results = q.all()
    print(results)

from polyglotdb.query.base.func import Count, Average

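# The Count/Average import above feeds the tutorial's aggregation queries. A
# sketch of the kind of query it enables: phone counts and mean phone duration
# per speaker (a plausible follow-on here, not necessarily the tutorial's
# exact query).
with CorpusContext('pg_tutorial') as c:
    q = c.query_graph(c.phone)
    q = q.group_by(c.phone.speaker.name.column_name('speaker'))
    results = q.aggregate(Count().column_name('n_phones'),
                          Average(c.phone.duration).column_name('mean_duration'))
    for r in results:
        print(r['speaker'], r['n_phones'], r['mean_duration'])
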
import logging

import polyglotdb.io as pgio
from polyglotdb import CorpusContext
from polyglotdb.config import CorpusConfig
from polyglotdb.io.parsers import FilenameSpeakerParser
from polyglotdb.io.enrichment import enrich_speakers_from_csv, enrich_lexicon_from_csv
from polyglotdb.utils import get_corpora_list

graph_db = {'graph_host': 'localhost', 'graph_port': 7474,
            'graph_user': '******', 'graph_password': '******'}


def call_back(*args):
    args = [x for x in args if isinstance(x, str)]
    if args:
        print(' '.join(args))


if __name__ == '__main__':
    with CorpusContext("Hillenbrand", **graph_db) as c:
        print("Loading...")
        c.reset()
        parser = pgio.inspect_mfa('/Users/mlml/Documents/transfer/Hillenbrand/textgrid-wav')
        parser.call_back = call_back
        # beg = time.time()
        c.load(parser, '/Users/mlml/Documents/transfer/Hillenbrand/textgrid-wav')
        # end = time.time()
        # time_taken = end - beg
        # logger.info('Loading took: ' + str(time_taken))