예제 #1
0
def loading(config, corpus_dir, textgrid_format):
    """Import a corpus into the database if it is not already loaded.

    Parameters
    ----------
    config : CorpusConfig
        Configuration used to open the corpus context.
    corpus_dir : str
        Path to the corpus files on disk; exits the process if missing.
    textgrid_format : str
        Corpus format name (e.g. 'buckeye', 'timit'); anything
        unrecognized falls back to the MFA parser.
    """
    with CorpusContext(config) as c:
        exists = c.exists()
    if exists:
        print('Corpus already loaded, skipping import.')
        return
    if not os.path.exists(corpus_dir):
        print('The path {} does not exist.'.format(corpus_dir))
        sys.exit(1)
    with CorpusContext(config) as c:
        print('loading')

        # Normalize once so every format is matched case-insensitively.
        # Previously only 'fave' used .lower(), so e.g. 'Buckeye' or
        # 'TIMIT' silently fell through to the MFA parser.
        fmt = textgrid_format.lower()
        if fmt == "buckeye":
            parser = pgio.inspect_buckeye(corpus_dir)
        elif fmt == "csv":
            # NOTE(review): 'csv' maps to the Buckeye inspector — looks
            # like a copy-paste; confirm against pgio.inspect_csv.
            parser = pgio.inspect_buckeye(corpus_dir)
        elif fmt == "fave":
            parser = pgio.inspect_fave(corpus_dir)
        elif fmt == "ilg":
            parser = pgio.inspect_ilg(corpus_dir)
        elif fmt == "labbcat":
            parser = pgio.inspect_labbcat(corpus_dir)
        elif fmt == "partitur":
            parser = pgio.inspect_partitur(corpus_dir)
        elif fmt == "timit":
            parser = pgio.inspect_timit(corpus_dir)
        else:
            parser = pgio.inspect_mfa(corpus_dir)
        parser.call_back = call_back
        beg = time.time()
        c.load(parser, corpus_dir)
        end = time.time()
        time_taken = end - beg
        print('Loading took: {}'.format(time_taken))
    save_performance_benchmark(config, 'import', time_taken)
예제 #2
0
def loading(config, corpus_dir, textgrid_format):
    """Import the corpus at *corpus_dir* unless it is already in the database."""
    with CorpusContext(config) as c:
        exists = c.exists()
    if exists:
        print('Corpus already loaded, skipping import.')
        return
    if not os.path.exists(corpus_dir):
        print('The path {} does not exist.'.format(corpus_dir))
        sys.exit(1)
    with CorpusContext(config) as c:
        print('loading')

        # Select the inspector for the declared corpus format.  The
        # 'fave' check is case-insensitive, the others are exact, and
        # unknown formats default to the MFA inspector.
        if textgrid_format == "buckeye":
            inspect = pgio.inspect_buckeye
        elif textgrid_format == "csv":
            inspect = pgio.inspect_buckeye
        elif textgrid_format.lower() == "fave":
            inspect = pgio.inspect_fave
        elif textgrid_format == "ilg":
            inspect = pgio.inspect_ilg
        elif textgrid_format == "labbcat":
            inspect = pgio.inspect_labbcat
        elif textgrid_format == "partitur":
            inspect = pgio.inspect_partitur
        elif textgrid_format == "timit":
            inspect = pgio.inspect_timit
        else:
            inspect = pgio.inspect_mfa
        parser = inspect(corpus_dir)
        c.load(parser, corpus_dir)
예제 #3
0
def loading(config, corpus_dir, textgrid_format):
    """Load the corpus into the database, choosing a parser by format.

    Parameters
    ----------
    config : CorpusConfig
        Configuration used to open the corpus context.
    corpus_dir : str
        Path to the corpus on disk; exits the process if missing.
    textgrid_format : str
        Format name or single-letter shorthand (case-insensitive);
        unrecognized values fall back to the MFA parser.
    """

    ## first check if a database for the corpus
    ## has already been created
    with CorpusContext(config) as c:
        exists = c.exists()
    if exists:
        print('Corpus already loaded, skipping import.')
        return
    if not os.path.exists(corpus_dir):
        print('The path {} does not exist.'.format(corpus_dir))
        sys.exit(1)

    ## if there is no database file,
    ## begin with importing the corpus
    textgrid_format = textgrid_format.upper()
    with CorpusContext(config) as c:
        print('loading')

        ## Use the appropriate importer based
        ## on the format of the corpus
        if textgrid_format in ["BUCKEYE", "B"]:
            parser = pgio.inspect_buckeye(corpus_dir)
        elif textgrid_format == "CSV":
            # NOTE(review): 'CSV' maps to the Buckeye inspector — looks
            # like a copy-paste; confirm against pgio.inspect_csv.
            parser = pgio.inspect_buckeye(corpus_dir)
        elif textgrid_format in ["FAVE", "F"]:
            # Bug fix: the format was upper-cased above, so the original
            # `textgrid_format.lower() in ["FAVE", "F"]` could never match
            # and FAVE corpora always fell through to the MFA parser.
            parser = pgio.inspect_fave(corpus_dir)
        elif textgrid_format == "ILG":
            parser = pgio.inspect_ilg(corpus_dir)
        elif textgrid_format in ["LABBCAT", "L"]:
            parser = pgio.inspect_labbcat(corpus_dir)
        elif textgrid_format in ["P", "PARTITUR"]:
            parser = pgio.inspect_partitur(corpus_dir)
        elif textgrid_format in ["MAUS", "W"]:
            parser = pgio.inspect_maus(corpus_dir)
        elif textgrid_format in ["TIMIT", "T"]:
            parser = pgio.inspect_timit(corpus_dir)
        # The duplicate `elif textgrid_format in ["W", "maus"]` branch was
        # unreachable ("W" is caught above; "maus" cannot occur after
        # .upper()) and has been removed.
        else:
            parser = pgio.inspect_mfa(corpus_dir)
        parser.call_back = call_back
        beg = time.time()
        c.load(parser, corpus_dir)
        end = time.time()
        time_taken = end - beg
        print('Loading took: {}'.format(time_taken))
    save_performance_benchmark(config, 'import', time_taken)
    def run_query(self):
        """Reset and re-import the corpus described by ``self.kwargs``.

        Returns whatever ``CorpusContext.load`` reports as unparseable.
        """
        time.sleep(0.1)
        corpus_name = self.kwargs['name']
        corpus_dir = self.kwargs['directory']
        do_reset = True
        cfg = CorpusConfig(corpus_name, graph_host='localhost', graph_port=7474)
        with CorpusContext(cfg) as ctx:
            # Known corpora get a dedicated inspector; for anything else,
            # guess the TextGrid flavour from the files themselves.
            known = {'buckeye': inspect_buckeye,
                     'timit': inspect_timit,
                     'partitur': inspect_partitur}
            if corpus_name in known:
                parser = known[corpus_name](corpus_dir)
            else:
                guessed = guess_textgrid_format(corpus_dir)
                by_form = {'labbcat': inspect_labbcat,
                           'mfa': inspect_mfa,
                           'fave': inspect_fave}
                parser = by_form.get(guessed, inspect_textgrid)(corpus_dir)

            parser.call_back = self.kwargs['call_back']
            parser.stop_check = self.kwargs['stop_check']
            parser.call_back('Resetting corpus...')
            if do_reset:
                ctx.reset(call_back=self.kwargs['call_back'],
                          stop_check=self.kwargs['stop_check'])
            could_not_parse = ctx.load(parser, corpus_dir)
            self.actionCompleted.emit('importing corpus')
        return could_not_parse
예제 #5
0
    def run_query(self):
        """Reset the corpus named in ``self.kwargs`` and re-import it from disk."""
        time.sleep(0.1)
        kwargs = self.kwargs
        name = kwargs['name']
        directory = kwargs['directory']
        reset = True
        config = CorpusConfig(name, graph_host='localhost', graph_port=7474)
        with CorpusContext(config) as c:
            # Buckeye and TIMIT have dedicated inspectors; everything else
            # is dispatched on the guessed TextGrid format.
            if name == 'buckeye':
                parser = inspect_buckeye(directory)
            elif name == 'timit':
                parser = inspect_timit(directory)
            else:
                detected = guess_textgrid_format(directory)
                chooser = {'labbcat': inspect_labbcat,
                           'mfa': inspect_mfa,
                           'fave': inspect_fave}
                parser = chooser.get(detected, inspect_textgrid)(directory)

            parser.call_back = kwargs['call_back']
            parser.stop_check = kwargs['stop_check']
            parser.call_back('Resetting corpus...')
            if reset:
                c.reset(call_back=kwargs['call_back'],
                        stop_check=kwargs['stop_check'])
            could_not_parse = c.load(parser, directory)
        return could_not_parse
예제 #6
0
def test_buckeye_pause(graph_db, buckeye_test_dir):
    """Loading a single Buckeye discourse and encoding pauses succeeds."""
    import os
    from polyglotdb.io import inspect_buckeye
    with CorpusContext('discourse_buckeye', **graph_db) as c:
        c.reset()
        words_file = os.path.join(buckeye_test_dir, 'test.words')
        c.load(inspect_buckeye(words_file), words_file)
        c.encode_pauses('^[<{].*$')
예제 #7
0
def test_average_speech_rate_buckeye(graph_db, buckeye_test_dir):
    """Average speech rate over the Buckeye test corpus matches the known value."""
    with CorpusContext('directory_buckeye', **graph_db) as c:
        c.reset()
        c.load(inspect_buckeye(buckeye_test_dir), buckeye_test_dir)
        c.encode_utterances()
        rates = c.average_speech_rate()
        print(rates)
        assert abs(rates[0][1] - 2.4439013552543876) < .0000000000001
        assert len(rates) == 1
예제 #8
0
def test_average_speech_rate_buckeye(graph_db, buckeye_test_dir):
    """Check the computed average speech rate against a fixed expected value."""
    expected = 2.4439013552543876
    tolerance = .0000000000001
    with CorpusContext('directory_buckeye', **graph_db) as ctx:
        ctx.reset()
        parser = inspect_buckeye(buckeye_test_dir)
        ctx.load(parser, buckeye_test_dir)
        ctx.encode_utterances()
        result = ctx.average_speech_rate()
        print(result)
        assert abs(result[0][1] - expected) < tolerance
        assert len(result) == 1
예제 #9
0
def test_load_directory_buckeye(graph_db, buckeye_test_dir):
    """Importing a Buckeye directory yields the expected 's' phones and speaker."""
    with CorpusContext('directory_buckeye', **graph_db) as ctx:
        ctx.reset()
        ctx.load(inspect_buckeye(buckeye_test_dir), buckeye_test_dir)

        query = ctx.query_graph(ctx.phone).filter(ctx.phone.label == 's')
        assert query.count() == 3

        query = query.columns(ctx.phone.speaker.name.column_name('speaker'))
        print(query.cypher())
        rows = query.all()
        print(rows)
        assert all(row['speaker'] == 'tes' for row in rows)
예제 #10
0
def test_load_directory_buckeye(graph_db, buckeye_test_dir):
    """Surface-transcription query over an imported Buckeye directory."""
    with CorpusContext('directory_buckeye', **graph_db) as corpus:
        corpus.reset()
        parser = inspect_buckeye(buckeye_test_dir)
        corpus.load(parser, buckeye_test_dir)

        q = corpus.query_graph(corpus.surface_transcription).filter(
            corpus.surface_transcription.label == 's')
        assert q.count() == 3

        q = q.columns(
            corpus.surface_transcription.speaker.name.column_name('speaker'))
        print(q.cypher())
        matches = q.all()
        print(matches)
        assert all(m.speaker == 'tes' for m in matches)
예제 #11
0
def test_load_discourse_buckeye(graph_db, buckeye_test_dir):
    """Loading one Buckeye discourse file exposes its phones and speaker."""
    word_file = os.path.join(buckeye_test_dir, 'test.words')
    with CorpusContext('discourse_buckeye', **graph_db) as c:
        c.reset()
        c.load(inspect_buckeye(word_file), word_file)

        phones = c.query_graph(c.phone).filter(c.phone.label == 's')
        assert phones.count() == 3

        phones = phones.columns(c.phone.speaker.name.column_name('speaker'))
        print(phones.cypher())
        found = phones.all()
        print(found)
        assert all(entry['speaker'] == 'tes' for entry in found)
예제 #12
0
def import_corpus_run_query(data, path):
    """Reset corpus *data*, import it from *path*, and report timings.

    Returns ``[import_seconds, mean, stdev]`` where mean/stdev come from
    the module-level ``times`` sequence (defined elsewhere in this file
    — not visible here; presumably populated by earlier benchmark runs).
    """
    inspectors = {'buckeyebenchmark': inspect_buckeye,
                  'timitbenchmark': inspect_timit}
    with CorpusContext(data, **graph_db) as c:
        c.reset()
        started = time.time()
        parser = inspectors.get(data, inspect_mfa)(path)
        parser.call_back = call_back
        c.load(parser, path)
        finished = time.time()
        avgtime = sum(times) / (len(times))
        sd = statistics.stdev(times)
        return [(finished - started), avgtime, sd]
예제 #13
0
def test_phone_mean_duration_speaker_buckeye(graph_db, buckeye_test_dir):
    """Mean phone durations for the Buckeye test corpus match known values."""
    with CorpusContext('directory_buckeye', **graph_db) as corpus:
        corpus.reset()
        corpus.load(inspect_buckeye(buckeye_test_dir), buckeye_test_dir)
        res = corpus.get_measure('duration', 'mean', 'phone')
        print(res)
        assert len(res) == 17
        # Locate the rows for the 'dx' and 'eh' phones (last occurrence wins).
        dx_idx = eh_idx = 0
        for idx, row in enumerate(res):
            if row[0] == 'dx':
                dx_idx = idx
            if row[0] == 'eh':
                eh_idx = idx
        assert res[dx_idx][1] == approx(0.029999999999999805, 1e-3)
        assert res[eh_idx][1] == approx(0.04932650000000005, 1e-3)
def import_corpus_run_query(data, path):
    """Time a full import of corpus *data* from *path*.

    Returns ``[elapsed, average, stdev]``; average/stdev are computed
    from the module-level ``times`` sequence (defined elsewhere in this
    file — not visible here).
    """
    with CorpusContext(data, **graph_db) as c:
        c.reset()
        beg = time.time()
        if data == 'buckeyebenchmark':
            inspect = inspect_buckeye
        elif data == 'timitbenchmark':
            inspect = inspect_timit
        else:
            inspect = inspect_mfa
        parser = inspect(path)
        parser.call_back = call_back
        c.load(parser, path)
        elapsed = time.time() - beg
        mean_time = sum(times) / (len(times))
        spread = statistics.stdev(times)
        return [elapsed, mean_time, spread]
예제 #15
0
def test_syllable_mean_duration_with_speaker_buckeye(graph_db, buckeye_test_dir):
    """Per-speaker mean syllable durations include the expected 'dh.ae.s' value."""
    syllabics = ['ae', 'aa', 'uw', 'ay', 'eh', 'ih', 'aw', 'ey', 'iy',
                 'uh', 'ah', 'ao', 'er', 'ow']
    with CorpusContext('directory_buckeye', **graph_db) as g:
        g.reset()
        g.load(inspect_buckeye(buckeye_test_dir), buckeye_test_dir)
        g.encode_syllabic_segments(syllabics)
        g.encode_syllables()
        res = g.syllable_mean_duration_with_speaker()
        print(res)
        assert len(res) == 11
        # Find the row whose syllable label is 'dh.ae.s'.
        for idx, row in enumerate(res):
            if row[1] == 'dh.ae.s':
                break
        assert abs(res[idx][2] - 0.17030199999999995) < .0000000000001
예제 #16
0
def test_average_speech_rate_buckeye(graph_db, buckeye_test_dir):
    """average_speech_rate requires utterances and syllables to be encoded first."""
    with CorpusContext('directory_buckeye', **graph_db) as c:
        c.reset()
        c.load(inspect_buckeye(buckeye_test_dir), buckeye_test_dir)
        # No utterances encoded yet -> querying must fail.
        with pytest.raises(GraphQueryError):
            res = c.average_speech_rate()
        c.encode_pauses('^[{<].*$')
        c.encode_utterances(min_pause_length=0)
        # Utterances exist but syllables do not -> still an error.
        with pytest.raises(GraphQueryError):
            res = c.average_speech_rate()
        c.encode_syllabic_segments(['eh', 'ae', 'ah', 'er', 'ey', 'ao'])
        c.encode_syllables('maxonset')
        res = c.average_speech_rate()
        print(res)
        assert res[0][1] == approx(5.929060725, 1e-3)
        assert len(res) == 1
예제 #17
0
def test_load_discourse_buckeye(graph_db, buckeye_test_dir):
    """A loaded Buckeye discourse has word transcriptions and the expected phones."""
    words = os.path.join(buckeye_test_dir, 'test.words')
    with CorpusContext('discourse_buckeye', **graph_db) as ctx:
        ctx.reset()
        ctx.load(inspect_buckeye(words), words)

        assert ctx.hierarchy.has_type_property('word', 'transcription')

        q = ctx.query_graph(ctx.phone).filter(ctx.phone.label == 's')
        assert q.count() == 3

        q = q.columns(ctx.phone.speaker.name.column_name('speaker'))
        print(q.cypher())
        hits = q.all()
        print(hits)
        assert all(h['speaker'] == 'tes' for h in hits)
예제 #18
0
def test_load_directory_buckeye(graph_db, buckeye_test_dir):
    """Word and phone counts after importing the Buckeye test directory."""
    with CorpusContext('directory_buckeye', **graph_db) as c:
        c.reset()
        parser = inspect_buckeye(buckeye_test_dir)
        c.load(parser, buckeye_test_dir)

        word_query = c.query_graph(c.word).filter(c.word.label == 'that\'s')
        assert word_query.count() == 2

        phone_query = c.query_graph(c.phone).filter(c.phone.label == 's')
        assert phone_query.count() == 3

        phone_query = phone_query.columns(
            c.phone.speaker.name.column_name('speaker'))
        print(phone_query.cypher())
        results = phone_query.all()
        print(results)
        assert all(r['speaker'] == 'tes' for r in results)
예제 #19
0
def test_syllable_mean_duration_with_speaker_buckeye(graph_db,
                                                     buckeye_test_dir):
    """Per-speaker syllable duration via get_measure matches a known value."""
    vowels = ['ae', 'aa', 'uw', 'ay', 'eh', 'ih', 'aw', 'ey', 'iy',
              'uh', 'ah', 'ao', 'er', 'ow']
    with CorpusContext('directory_buckeye', **graph_db) as g:
        g.reset()
        g.load(inspect_buckeye(buckeye_test_dir), buckeye_test_dir)
        g.encode_syllabic_segments(vowels)
        g.encode_syllables()
        res = g.get_measure('duration', 'mean', 'syllable', True)
        print(res)
        assert len(res) == 11
        # Find the row whose syllable label is 'dh.ae.s'.
        for i, r in enumerate(res):
            if r[1] == 'dh.ae.s':
                break
        assert abs(res[i][2] - 0.17030199999999995) < .0000000000001
예제 #20
0
sys.path.insert(0, base)
import polyglotdb.io as pgio

from speechtools.corpus import CorpusContext

# Local path to the corpus; the commented line points at an alternate subset.
path_to_buckeye = r'D:\Data\VIC\Speakers'
#path_to_buckeye = r'D:\Data\BuckeyeSubset'

# Graph-database connection settings (Neo4j default port 7474).
# The user/password values here look like redacted placeholders —
# supply real credentials before running.
graph_db = {
    'host': 'localhost',
    'port': 7474,
    'user': '******',
    'password': '******'
}


def call_back(*args):
    """Progress callback: print the string arguments, space-separated."""
    strings = [a for a in args if isinstance(a, str)]
    if strings:
        print(' '.join(strings))


# Import the Buckeye corpus from disk, timing the full load.
with CorpusContext('buckeye', **graph_db) as c:
    c.reset()  # wipe any previous import so the timing is comparable
    beg = time.time()
    parser = pgio.inspect_buckeye(path_to_buckeye)
    parser.call_back = call_back  # stream progress messages to stdout
    c.load(parser, path_to_buckeye)
    end = time.time()
    print('Time taken: {}'.format(end - beg))
예제 #21
0
import sys
import os
import time
base = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
sys.path.insert(0,base)
import polyglotdb.io as pgio

from speechtools.corpus import CorpusContext

# Local path to the corpus; the commented line points at an alternate subset.
path_to_buckeye = r'D:\Data\VIC\Speakers'
#path_to_buckeye = r'D:\Data\BuckeyeSubset'

# Graph-database connection settings; the user/password values look like
# redacted placeholders — supply real credentials before running.
graph_db = {'host':'localhost', 'port': 7474,
            'user': '******', 'password': '******'}

def call_back(*args):
    """Print progress messages: join and print any string arguments."""
    messages = [item for item in args if isinstance(item, str)]
    if messages:
        print(' '.join(messages))

# Import the Buckeye corpus from disk, timing the full load.
with CorpusContext('buckeye', **graph_db) as c:
    c.reset()  # wipe any previous import so the timing is comparable
    beg = time.time()
    parser = pgio.inspect_buckeye(path_to_buckeye)
    parser.call_back = call_back  # stream progress messages to stdout
    c.load(parser, path_to_buckeye)
    end = time.time()
    print('Time taken: {}'.format(end - beg))