def loading(config, corpus_dir, textgrid_format):
    """Import the corpus at *corpus_dir* into the database, timing the load.

    Skips the import entirely when a database for the corpus already
    exists; exits the process when *corpus_dir* does not exist.
    """
    # Cheap existence probe first so a reload is a no-op.
    with CorpusContext(config) as c:
        exists = c.exists()
    if exists:
        print('Corpus already loaded, skipping import.')
        return
    if not os.path.exists(corpus_dir):
        print('The path {} does not exist.'.format(corpus_dir))
        sys.exit(1)
    with CorpusContext(config) as c:
        print('loading')
        # FAVE is matched case-insensitively; every other format must match exactly.
        if textgrid_format.lower() == "fave":
            parser = pgio.inspect_fave(corpus_dir)
        else:
            inspectors = {
                "buckeye": pgio.inspect_buckeye,
                # NOTE(review): "csv" maps to the Buckeye inspector here —
                # looks like a copy/paste slip; confirm against the pgio API.
                "csv": pgio.inspect_buckeye,
                "ilg": pgio.inspect_ilg,
                "labbcat": pgio.inspect_labbcat,
                "partitur": pgio.inspect_partitur,
                "timit": pgio.inspect_timit,
            }
            # Anything unrecognised falls back to the MFA parser.
            parser = inspectors.get(textgrid_format, pgio.inspect_mfa)(corpus_dir)
        parser.call_back = call_back
        beg = time.time()
        c.load(parser, corpus_dir)
        end = time.time()
        time_taken = end - beg
        print('Loading took: {}'.format(time_taken))
        save_performance_benchmark(config, 'import', time_taken)
def loading(config, corpus_dir, textgrid_format):
    """Import the corpus at *corpus_dir* unless it is already in the database.

    Exits the process when *corpus_dir* does not exist.
    """
    with CorpusContext(config) as c:
        exists = c.exists()
    if exists:
        print('Corpus already loaded, skipping import.')
        return
    if not os.path.exists(corpus_dir):
        print('The path {} does not exist.'.format(corpus_dir))
        sys.exit(1)
    with CorpusContext(config) as c:
        print('loading')
        # FAVE is matched case-insensitively; every other format must match exactly.
        if textgrid_format.lower() == "fave":
            parser = pgio.inspect_fave(corpus_dir)
        else:
            inspectors = {
                "buckeye": pgio.inspect_buckeye,
                # NOTE(review): "csv" maps to the Buckeye inspector here —
                # looks like a copy/paste slip; confirm against the pgio API.
                "csv": pgio.inspect_buckeye,
                "ilg": pgio.inspect_ilg,
                "labbcat": pgio.inspect_labbcat,
                "partitur": pgio.inspect_partitur,
                "timit": pgio.inspect_timit,
            }
            # Anything unrecognised falls back to the MFA parser.
            parser = inspectors.get(textgrid_format, pgio.inspect_mfa)(corpus_dir)
        c.load(parser, corpus_dir)
def loading(config, corpus_dir, textgrid_format):
    """Load the corpus.

    Skips the import when a database for the corpus already exists and
    exits the process when *corpus_dir* does not exist. The format string
    is normalised to upper case before dispatch, so format codes are
    case-insensitive.
    """
    # First check whether a database for the corpus has already been created.
    with CorpusContext(config) as c:
        exists = c.exists()
    if exists:
        print('Corpus already loaded, skipping import.')
        return
    if not os.path.exists(corpus_dir):
        print('The path {} does not exist.'.format(corpus_dir))
        sys.exit(1)
    # If there is no database file, begin with importing the corpus.
    textgrid_format = textgrid_format.upper()
    with CorpusContext(config) as c:
        print('loading')
        # Use the appropriate importer based on the format of the corpus.
        if textgrid_format in ["BUCKEYE", "B"]:
            parser = pgio.inspect_buckeye(corpus_dir)
        elif textgrid_format == "CSV":
            # NOTE(review): CSV maps to the Buckeye inspector — confirm this
            # is intentional and not a copy/paste slip.
            parser = pgio.inspect_buckeye(corpus_dir)
        elif textgrid_format in ["FAVE", "F"]:
            # BUG FIX: was `textgrid_format.lower() in ["FAVE", "F"]`, which
            # can never be true after the upper() above, so FAVE corpora
            # silently fell through to the MFA parser.
            parser = pgio.inspect_fave(corpus_dir)
        elif textgrid_format == "ILG":
            parser = pgio.inspect_ilg(corpus_dir)
        elif textgrid_format in ["LABBCAT", "L"]:
            parser = pgio.inspect_labbcat(corpus_dir)
        elif textgrid_format in ["P", "PARTITUR"]:
            parser = pgio.inspect_partitur(corpus_dir)
        elif textgrid_format in ["MAUS", "W"]:
            parser = pgio.inspect_maus(corpus_dir)
        elif textgrid_format in ["TIMIT", "T"]:
            parser = pgio.inspect_timit(corpus_dir)
        # BUG FIX: removed a duplicate, unreachable MAUS branch
        # (`elif textgrid_format in ["W", "maus"]`): "W" is already handled
        # above and lowercase "maus" can never match an upper-cased value.
        else:
            parser = pgio.inspect_mfa(corpus_dir)
        parser.call_back = call_back
        beg = time.time()
        c.load(parser, corpus_dir)
        end = time.time()
        time_taken = end - beg
        print('Loading took: {}'.format(time_taken))
        save_performance_benchmark(config, 'import', time_taken)
def run_query(self):
    """Reset and import the corpus described by ``self.kwargs``.

    Returns whatever ``CorpusContext.load`` reports as unparseable.
    """
    time.sleep(0.1)
    kwargs = self.kwargs
    name = kwargs['name']
    directory = kwargs['directory']
    reset = True
    config = CorpusConfig(name, graph_host='localhost', graph_port=7474)
    with CorpusContext(config) as c:
        # Known corpora are picked by name; anything else is sniffed from
        # the TextGrid contents.
        if name == 'buckeye':
            parser = inspect_buckeye(directory)
        elif name == 'timit':
            parser = inspect_timit(directory)
        elif name == 'partitur':
            parser = inspect_partitur(directory)
        else:
            form = guess_textgrid_format(directory)
            dispatch = {
                'labbcat': inspect_labbcat,
                'mfa': inspect_mfa,
                'fave': inspect_fave,
            }
            parser = dispatch.get(form, inspect_textgrid)(directory)
        parser.call_back = kwargs['call_back']
        parser.stop_check = kwargs['stop_check']
        parser.call_back('Resetting corpus...')
        if reset:
            c.reset(call_back=kwargs['call_back'],
                    stop_check=kwargs['stop_check'])
        could_not_parse = c.load(parser, directory)
        self.actionCompleted.emit('importing corpus')
        return could_not_parse
def run_query(self):
    """Reset and import the corpus described by ``self.kwargs``.

    Returns whatever ``CorpusContext.load`` reports as unparseable.
    """
    time.sleep(0.1)
    kwargs = self.kwargs
    name = kwargs['name']
    directory = kwargs['directory']
    reset = True
    config = CorpusConfig(name, graph_host='localhost', graph_port=7474)
    with CorpusContext(config) as c:
        # Known corpora are picked by name; anything else is sniffed from
        # the TextGrid contents.
        if name == 'buckeye':
            parser = inspect_buckeye(directory)
        elif name == 'timit':
            parser = inspect_timit(directory)
        else:
            form = guess_textgrid_format(directory)
            dispatch = {
                'labbcat': inspect_labbcat,
                'mfa': inspect_mfa,
                'fave': inspect_fave,
            }
            parser = dispatch.get(form, inspect_textgrid)(directory)
        parser.call_back = kwargs['call_back']
        parser.stop_check = kwargs['stop_check']
        parser.call_back('Resetting corpus...')
        if reset:
            c.reset(call_back=kwargs['call_back'],
                    stop_check=kwargs['stop_check'])
        could_not_parse = c.load(parser, directory)
        return could_not_parse
def test_buckeye_pause(graph_db, buckeye_test_dir):
    """Pauses can be encoded after loading a single Buckeye discourse."""
    from polyglotdb.io import inspect_buckeye
    import os
    with CorpusContext('discourse_buckeye', **graph_db) as ctx:
        ctx.reset()
        words_file = os.path.join(buckeye_test_dir, 'test.words')
        ctx.load(inspect_buckeye(words_file), words_file)
        ctx.encode_pauses('^[<{].*$')
def test_average_speech_rate_buckeye(graph_db, buckeye_test_dir):
    """Average speech rate over the Buckeye test corpus matches the known value."""
    with CorpusContext('directory_buckeye', **graph_db) as ctx:
        ctx.reset()
        ctx.load(inspect_buckeye(buckeye_test_dir), buckeye_test_dir)
        ctx.encode_utterances()
        rates = ctx.average_speech_rate()
        print(rates)
        assert abs(rates[0][1] - 2.4439013552543876) < .0000000000001
        assert len(rates) == 1
def test_average_speech_rate_buckeye(graph_db, buckeye_test_dir):
    """Average speech rate over the Buckeye test corpus matches the known value."""
    with CorpusContext('directory_buckeye', **graph_db) as ctx:
        ctx.reset()
        parser = inspect_buckeye(buckeye_test_dir)
        ctx.load(parser, buckeye_test_dir)
        ctx.encode_utterances()
        rates = ctx.average_speech_rate()
        print(rates)
        # Single speaker, so exactly one result row is expected.
        assert abs(rates[0][1] - 2.4439013552543876) < .0000000000001
        assert len(rates) == 1
def test_load_directory_buckeye(graph_db, buckeye_test_dir):
    """Loading a Buckeye directory yields the expected phones and speaker."""
    with CorpusContext('directory_buckeye', **graph_db) as ctx:
        ctx.reset()
        ctx.load(inspect_buckeye(buckeye_test_dir), buckeye_test_dir)
        query = ctx.query_graph(ctx.phone).filter(ctx.phone.label == 's')
        assert query.count() == 3
        query = query.columns(ctx.phone.speaker.name.column_name('speaker'))
        print(query.cypher())
        rows = query.all()
        print(rows)
        assert all(row['speaker'] == 'tes' for row in rows)
def test_load_directory_buckeye(graph_db, buckeye_test_dir):
    """Loading a Buckeye directory yields the expected surface transcription.

    NOTE(review): this variant queries ``surface_transcription`` and reads
    ``x.speaker`` as an attribute, unlike the dict-style access elsewhere —
    presumably an older API; verify against the query result type.
    """
    with CorpusContext('directory_buckeye', **graph_db) as ctx:
        ctx.reset()
        ctx.load(inspect_buckeye(buckeye_test_dir), buckeye_test_dir)
        query = ctx.query_graph(ctx.surface_transcription).filter(
            ctx.surface_transcription.label == 's')
        assert query.count() == 3
        query = query.columns(
            ctx.surface_transcription.speaker.name.column_name('speaker'))
        print(query.cypher())
        rows = query.all()
        print(rows)
        assert all(row.speaker == 'tes' for row in rows)
def test_load_discourse_buckeye(graph_db, buckeye_test_dir):
    """Loading a single Buckeye discourse yields the expected phones and speaker."""
    with CorpusContext('discourse_buckeye', **graph_db) as ctx:
        ctx.reset()
        words_file = os.path.join(buckeye_test_dir, 'test.words')
        ctx.load(inspect_buckeye(words_file), words_file)
        query = ctx.query_graph(ctx.phone).filter(ctx.phone.label == 's')
        assert query.count() == 3
        query = query.columns(ctx.phone.speaker.name.column_name('speaker'))
        print(query.cypher())
        rows = query.all()
        print(rows)
        assert all(row['speaker'] == 'tes' for row in rows)
def import_corpus_run_query(data, path):
    """Import corpus *data* from *path* and return timing statistics.

    Returns ``[elapsed_import_seconds, mean_of_times, stdev_of_times]``.
    NOTE(review): ``times`` is not defined in this function — presumably a
    module-level list populated elsewhere; verify before running standalone.
    """
    with CorpusContext(data, **graph_db) as c:
        c.reset()
        beg = time.time()
        # Benchmark corpora are picked by name; anything else uses MFA.
        inspectors = {
            'buckeyebenchmark': inspect_buckeye,
            'timitbenchmark': inspect_timit,
        }
        parser = inspectors.get(data, inspect_mfa)(path)
        parser.call_back = call_back
        c.load(parser, path)
        end = time.time()
        avgtime = sum(times) / len(times)
        sd = statistics.stdev(times)
        return [(end - beg), avgtime, sd]
def test_phone_mean_duration_speaker_buckeye(graph_db, buckeye_test_dir):
    """Mean phone durations from the Buckeye test corpus match known values."""
    with CorpusContext('directory_buckeye', **graph_db) as ctx:
        ctx.reset()
        ctx.load(inspect_buckeye(buckeye_test_dir), buckeye_test_dir)
        res = ctx.get_measure('duration', 'mean', 'phone')
        print(res)
        assert len(res) == 17
        # Locate the rows for the 'dx' and 'eh' phones (last match wins).
        dx = eh = 0
        for pos, row in enumerate(res):
            if row[0] == 'dx':
                dx = pos
            if row[0] == 'eh':
                eh = pos
        assert res[dx][1] == approx(0.029999999999999805, 1e-3)
        assert res[eh][1] == approx(0.04932650000000005, 1e-3)
def import_corpus_run_query(data, path):
    """Import corpus *data* from *path* and return timing statistics.

    Returns ``[elapsed_import_seconds, mean_of_times, stdev_of_times]``.
    NOTE(review): ``times`` is not defined in this function — presumably a
    module-level list populated elsewhere; verify before running standalone.
    """
    with CorpusContext(data, **graph_db) as c:
        c.reset()
        beg = time.time()
        if data == 'buckeyebenchmark':
            parser = inspect_buckeye(path)
        elif data == 'timitbenchmark':
            parser = inspect_timit(path)
        else:
            # Everything else is assumed to be MFA-aligned.
            parser = inspect_mfa(path)
        parser.call_back = call_back
        c.load(parser, path)
        end = time.time()
        avgtime = sum(times) / len(times)
        sd = statistics.stdev(times)
        return [(end - beg), avgtime, sd]
def test_syllable_mean_duration_with_speaker_buckeye(graph_db, buckeye_test_dir):
    """Per-speaker mean syllable durations match the known Buckeye values."""
    syllabics = ['ae', 'aa', 'uw', 'ay', 'eh', 'ih', 'aw', 'ey', 'iy', 'uh',
                 'ah', 'ao', 'er', 'ow']
    with CorpusContext('directory_buckeye', **graph_db) as ctx:
        ctx.reset()
        ctx.load(inspect_buckeye(buckeye_test_dir), buckeye_test_dir)
        ctx.encode_syllabic_segments(syllabics)
        ctx.encode_syllables()
        res = ctx.syllable_mean_duration_with_speaker()
        print(res)
        assert len(res) == 11
        # Find the row for the 'dh.ae.s' syllable and check its duration.
        for i, row in enumerate(res):
            if row[1] == 'dh.ae.s':
                break
        assert abs(res[i][2] - 0.17030199999999995) < .0000000000001
def test_average_speech_rate_buckeye(graph_db, buckeye_test_dir):
    """Speech rate requires utterances and syllables, then matches a known value."""
    with CorpusContext('directory_buckeye', **graph_db) as ctx:
        ctx.reset()
        ctx.load(inspect_buckeye(buckeye_test_dir), buckeye_test_dir)
        # No utterances encoded yet: the query must fail.
        with pytest.raises(GraphQueryError):
            res = ctx.average_speech_rate()
        ctx.encode_pauses('^[{<].*$')
        ctx.encode_utterances(min_pause_length=0)
        # Still no syllables encoded: the query must still fail.
        with pytest.raises(GraphQueryError):
            res = ctx.average_speech_rate()
        ctx.encode_syllabic_segments(['eh', 'ae', 'ah', 'er', 'ey', 'ao'])
        ctx.encode_syllables('maxonset')
        res = ctx.average_speech_rate()
        print(res)
        assert res[0][1] == approx(5.929060725, 1e-3)
        assert len(res) == 1
def test_load_discourse_buckeye(graph_db, buckeye_test_dir):
    """Loading one Buckeye discourse sets up word transcriptions and phones."""
    with CorpusContext('discourse_buckeye', **graph_db) as ctx:
        ctx.reset()
        words_file = os.path.join(buckeye_test_dir, 'test.words')
        ctx.load(inspect_buckeye(words_file), words_file)
        # Words must carry a 'transcription' type property after import.
        assert ctx.hierarchy.has_type_property('word', 'transcription')
        query = ctx.query_graph(ctx.phone).filter(ctx.phone.label == 's')
        assert query.count() == 3
        query = query.columns(ctx.phone.speaker.name.column_name('speaker'))
        print(query.cypher())
        rows = query.all()
        print(rows)
        assert all(row['speaker'] == 'tes' for row in rows)
def test_load_directory_buckeye(graph_db, buckeye_test_dir):
    """Loading a Buckeye directory yields the expected words, phones and speaker."""
    with CorpusContext('directory_buckeye', **graph_db) as ctx:
        ctx.reset()
        ctx.load(inspect_buckeye(buckeye_test_dir), buckeye_test_dir)
        word_query = ctx.query_graph(ctx.word).filter(ctx.word.label == 'that\'s')
        assert word_query.count() == 2
        query = ctx.query_graph(ctx.phone).filter(ctx.phone.label == 's')
        assert query.count() == 3
        query = query.columns(ctx.phone.speaker.name.column_name('speaker'))
        print(query.cypher())
        rows = query.all()
        print(rows)
        assert all(row['speaker'] == 'tes' for row in rows)
def test_syllable_mean_duration_with_speaker_buckeye(graph_db, buckeye_test_dir):
    """Per-speaker mean syllable durations via get_measure match known values."""
    syllabics = ['ae', 'aa', 'uw', 'ay', 'eh', 'ih', 'aw', 'ey', 'iy', 'uh',
                 'ah', 'ao', 'er', 'ow']
    with CorpusContext('directory_buckeye', **graph_db) as ctx:
        ctx.reset()
        ctx.load(inspect_buckeye(buckeye_test_dir), buckeye_test_dir)
        ctx.encode_syllabic_segments(syllabics)
        ctx.encode_syllables()
        # Final True requests the per-speaker breakdown.
        res = ctx.get_measure('duration', 'mean', 'syllable', True)
        print(res)
        assert len(res) == 11
        # Find the row for the 'dh.ae.s' syllable and check its duration.
        for i, row in enumerate(res):
            if row[1] == 'dh.ae.s':
                break
        assert abs(res[i][2] - 0.17030199999999995) < .0000000000001
# Benchmark script: import the Buckeye corpus and report the elapsed time.
# NOTE(review): `base` and `time` are defined earlier in this file, outside
# this chunk — verify before running standalone.
sys.path.insert(0, base)

import polyglotdb.io as pgio

from speechtools.corpus import CorpusContext

path_to_buckeye = r'D:\Data\VIC\Speakers'
#path_to_buckeye = r'D:\Data\BuckeyeSubset'

# Connection settings for the local Neo4j instance (credentials redacted).
graph_db = {'host': 'localhost',
            'port': 7474,
            'user': '******',
            'password': '******'}


def call_back(*args):
    """Print any string arguments as a single progress message."""
    messages = [a for a in args if isinstance(a, str)]
    if messages:
        print(' '.join(messages))


with CorpusContext('buckeye', **graph_db) as c:
    c.reset()
    beg = time.time()
    parser = pgio.inspect_buckeye(path_to_buckeye)
    parser.call_back = call_back
    c.load(parser, path_to_buckeye)
    end = time.time()
    print('Time taken: {}'.format(end - beg))
# Benchmark script: import the Buckeye corpus and report the elapsed time.
import sys
import os
import time

# Make the repository root importable before pulling in project packages.
base = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
sys.path.insert(0, base)

import polyglotdb.io as pgio

from speechtools.corpus import CorpusContext

path_to_buckeye = r'D:\Data\VIC\Speakers'
#path_to_buckeye = r'D:\Data\BuckeyeSubset'

# Connection settings for the local Neo4j instance (credentials redacted).
graph_db = {'host': 'localhost',
            'port': 7474,
            'user': '******',
            'password': '******'}


def call_back(*args):
    """Print any string arguments as a single progress message."""
    messages = [a for a in args if isinstance(a, str)]
    if messages:
        print(' '.join(messages))


with CorpusContext('buckeye', **graph_db) as c:
    c.reset()
    beg = time.time()
    parser = pgio.inspect_buckeye(path_to_buckeye)
    parser.call_back = call_back
    c.load(parser, path_to_buckeye)
    end = time.time()
    print('Time taken: {}'.format(end - beg))