def test_load_discourse_transcription(text_test_dir):
    """Loading a transcription discourse works and rejects bad delimiters.

    Bug fix: the function was named ``text_test_dir`` — identical to its
    fixture parameter and missing the ``test_`` prefix — so pytest never
    collected or ran it.
    """
    transcription_path = os.path.join(text_test_dir,
                                      'test_text_transcription.txt')
    # A comma transcription delimiter does not match the file contents,
    # so loading must raise.
    with pytest.raises(DelimiterError):
        load_discourse_transcription('test', transcription_path, " ", [],
                                     trans_delimiter=',')
    c = load_discourse_transcription('test', transcription_path)
    assert sorted(c.lexicon.inventory) == sorted(['#', 'a', 'b', 'c', 'd'])
def test_load_discourse_transcription(text_test_dir):
    """Loading a transcription discourse works and rejects bad delimiters.

    Bug fix: the function was named ``text_test_dir`` — identical to its
    fixture parameter and missing the ``test_`` prefix — so pytest never
    collected or ran it.
    """
    transcription_path = os.path.join(text_test_dir,
                                      'test_text_transcription.txt')
    # Mismatched transcription delimiter must be rejected up front.
    with pytest.raises(DelimiterError):
        load_discourse_transcription('test', transcription_path, " ", [],
                                     trans_delimiter=',')
    c = load_discourse_transcription('test', transcription_path)
    assert sorted(c.lexicon.inventory) == sorted(['#', 'a', 'b', 'c', 'd'])
def test_load_transcription_morpheme(text_test_dir):
    """Morpheme boundary delimiters are honored when loading a transcription."""
    path = os.path.join(text_test_dir,
                        'test_text_transcription_morpheme_boundaries.txt')
    annotation_types = inspect_discourse_transcription(path)
    annotation_types[0].morph_delimiters = set('-=')
    corpus = load_discourse_transcription('test', path, annotation_types)
    word = corpus.lexicon['cab']
    assert word.frequency == 2
    assert str(word.transcription) == 'c.a-b'
def test_load_transcription_morpheme(text_test_dir):
    """Loading with morph delimiters set keeps morpheme boundaries intact."""
    fname = 'test_text_transcription_morpheme_boundaries.txt'
    transcription_morphemes_path = os.path.join(text_test_dir, fname)
    ats = inspect_discourse_transcription(transcription_morphemes_path)
    # Treat '-' and '=' as morpheme boundary markers.
    ats[0].morph_delimiters = set('-=')
    c = load_discourse_transcription('test', transcription_morphemes_path,
                                     ats)
    assert c.lexicon['cab'].frequency == 2
    assert str(c.lexicon['cab'].transcription) == 'c.a-b'
def test_export_transcription(export_test_dir, unspecified_test_corpus):
    """Round-trip check: export a discourse transcription, reload it, and
    verify every word keeps its transcription and frequency."""
    discourse = generate_discourse(unspecified_test_corpus)
    export_path = os.path.join(export_test_dir,
                               'test_export_transcription.txt')
    export_discourse_transcription(discourse, export_path, single_line=False)
    reloaded = load_discourse_transcription('test', export_path)
    # Sort both sides by transcription so words line up positionally.
    originals = sorted(unspecified_test_corpus, key=lambda w: w.transcription)
    round_tripped = sorted(reloaded.lexicon, key=lambda w: w.transcription)
    for idx, original in enumerate(originals):
        copy = round_tripped[idx]
        assert original.transcription == copy.transcription
        assert original.frequency == copy.frequency
def run(self):
    """Worker entry point: load a corpus of the configured type and emit it.

    Pops ``text_type`` and ``isDirectory`` from ``self.kwargs``, dispatches
    to the matching loader, and emits the result via ``dataReady``.
    ``PCTError`` is reported through ``errorEncountered``; any other
    exception is wrapped in ``PCTPythonError`` first.  If ``self.stopped``
    was set, ``finishedCancelling`` is emitted instead of ``dataReady``.
    """
    time.sleep(0.1)
    textType = self.kwargs.pop('text_type')
    isDirectory = self.kwargs.pop('isDirectory')
    logging.info('Importing {} corpus named {}'.format(
        textType, self.kwargs['corpus_name']))
    # Bug fix: the format string had no '{}' placeholder, so the path was
    # silently dropped from the log message.
    logging.info('Path: {}'.format(self.kwargs['path']))
    log_annotation_types(self.kwargs['annotation_types'])
    try:
        if textType == 'spelling':
            if isDirectory:
                corpus = load_directory_spelling(**self.kwargs)
            else:
                corpus = load_discourse_spelling(**self.kwargs)
        elif textType == 'transcription':
            if isDirectory:
                corpus = load_directory_transcription(**self.kwargs)
            else:
                corpus = load_discourse_transcription(**self.kwargs)
        elif textType == 'ilg':
            if isDirectory:
                corpus = load_directory_ilg(**self.kwargs)
            else:
                corpus = load_discourse_ilg(**self.kwargs)
        elif textType == 'textgrid':
            if isDirectory:
                corpus = load_directory_textgrid(**self.kwargs)
            else:
                corpus = load_discourse_textgrid(**self.kwargs)
        elif textType == 'csv':
            corpus = load_corpus_csv(**self.kwargs)
        elif textType in ['buckeye', 'timit']:
            # These loaders need to know which dialect conventions to use.
            self.kwargs['dialect'] = textType
            if isDirectory:
                corpus = load_directory_multiple_files(**self.kwargs)
            else:
                corpus = load_discourse_multiple_files(**self.kwargs)
    except PCTError as e:
        self.errorEncountered.emit(e)
        return
    except Exception as e:
        e = PCTPythonError(e)
        self.errorEncountered.emit(e)
        return
    if self.stopped:
        time.sleep(0.1)
        self.finishedCancelling.emit()
        return
    self.dataReady.emit(corpus)
def run(self):
    """Worker entry point: load a corpus of the configured type and emit it.

    Pops ``text_type`` and ``isDirectory`` from ``self.kwargs``, picks the
    appropriate loader, and emits the corpus via ``dataReady``.  Errors are
    reported through ``errorEncountered`` (non-PCT exceptions are wrapped
    in ``PCTPythonError``).  When ``self.stopped`` is set the method emits
    ``finishedCancelling`` instead of ``dataReady``.
    """
    time.sleep(0.1)
    textType = self.kwargs.pop('text_type')
    isDirectory = self.kwargs.pop('isDirectory')
    logging.info('Importing {} corpus named {}'.format(
        textType, self.kwargs['corpus_name']))
    # Bug fix: the format string had no '{}' placeholder, so the path was
    # silently dropped from the log message.
    logging.info('Path: {}'.format(self.kwargs['path']))
    log_annotation_types(self.kwargs['annotation_types'])
    # (directory loader, single-discourse loader) for each simple text type.
    loaders = {
        'spelling': (load_directory_spelling, load_discourse_spelling),
        'transcription': (load_directory_transcription,
                          load_discourse_transcription),
        'ilg': (load_directory_ilg, load_discourse_ilg),
        'textgrid': (load_directory_textgrid, load_discourse_textgrid),
    }
    try:
        if textType == 'csv':
            corpus = load_corpus_csv(**self.kwargs)
        elif textType in ('buckeye', 'timit'):
            # These loaders need the dialect conventions passed explicitly.
            self.kwargs['dialect'] = textType
            if isDirectory:
                corpus = load_directory_multiple_files(**self.kwargs)
            else:
                corpus = load_discourse_multiple_files(**self.kwargs)
        else:
            # Robustness: an unknown text_type now raises KeyError inside
            # the try, so it is reported via errorEncountered rather than
            # escaping as a NameError on the final emit.
            directory_loader, discourse_loader = loaders[textType]
            loader = directory_loader if isDirectory else discourse_loader
            corpus = loader(**self.kwargs)
    except PCTError as e:
        self.errorEncountered.emit(e)
        return
    except Exception as e:
        e = PCTPythonError(e)
        self.errorEncountered.emit(e)
        return
    if self.stopped:
        time.sleep(0.1)
        self.finishedCancelling.emit()
        return
    self.dataReady.emit(corpus)
def test_export_transcription(export_test_dir, unspecified_test_corpus):
    """Exported transcriptions reload with identical words and frequencies."""
    d = generate_discourse(unspecified_test_corpus)
    export_path = os.path.join(export_test_dir,
                               'test_export_transcription.txt')
    export_discourse_transcription(d, export_path, single_line=False)
    d2 = load_discourse_transcription('test', export_path)
    key = lambda word: word.transcription
    words = sorted(list(unspecified_test_corpus), key=key)
    words2 = sorted(list(d2.lexicon), key=key)
    # Compare positionally; a shorter reloaded lexicon fails via IndexError.
    for i, w in enumerate(words):
        assert w.transcription == words2[i].transcription
        assert w.frequency == words2[i].frequency