示例#1
0
def ensure_query_is_word(query,
                         corpus,
                         sequence_type,
                         tier_type,
                         trans_delimiter='.',
                         file_type=None):
    """Coerce *query* into a Word keyed on *sequence_type*.

    Word instances pass through untouched.  Strings are turned into a
    character list (spelling tiers) or parsed into segments
    (transcription tiers), preferring an existing corpus entry where one
    can be found.

    Parameters
    ----------
    query : Word or str
        The user query to normalize.
    corpus : object
        Context object whose ``corpus.find`` looks up existing words.
    sequence_type : str
        Attribute name the resulting Word is keyed on.
    tier_type : Attribute
        Attribute whose ``att_type`` ('spelling' or 'tier') selects parsing.
    trans_delimiter : str, optional
        Segment delimiter used when parsing transcriptions.
    file_type : str, optional
        When equal to *sequence_type*, the query is taken verbatim.

    Raises
    ------
    ValueError
        If ``tier_type.att_type`` is neither 'spelling' nor 'tier'
        (previously this fell through and raised UnboundLocalError).
    """
    if isinstance(query, Word):
        return query

    if tier_type.att_type == 'spelling':
        if file_type == sequence_type:
            return Word(**{sequence_type: list(query)})
        # Strip segment delimiters before treating the string as spelling.
        stripped = query.replace(trans_delimiter, '')
        return Word(**{sequence_type: list(stripped)})

    if tier_type.att_type == 'tier':
        if file_type == sequence_type:
            return Word(**{sequence_type: parse(query, trans_delimiter)})
        try:
            # Prefer an existing corpus entry over re-parsing the string.
            return corpus.corpus.find(query)
        except KeyError:
            return Word(**{sequence_type: list(parse(query, trans_delimiter))})

    raise ValueError(
        'Unsupported attribute type: {!r}'.format(tier_type.att_type))
def test_init():
    """WordToken falls back to its Word's fields unless given its own."""
    # Token that only carries a word: spelling/transcription come from it.
    token = WordToken(begin=0, end=1,
                      word=Word(spelling='a', transcription=['a', 'b']))
    assert (token.spelling == 'a')
    assert (str(token.transcription) == 'a.b')

    # Token with its own spelling/transcription overriding the word's.
    token = WordToken(begin=0, end=1,
                      spelling='a2', transcription=['a', 'b2'],
                      word=Word(spelling='a', transcription=['a', 'b']))
    assert (token.spelling == 'a2')
    assert (str(token.transcription) == 'a.b2')
示例#3
0
def ensure_query_is_word(query, corpus, sequence_type, trans_delimiter):
    """Return *query* as a Word, looking it up in *corpus* or building one."""
    if isinstance(query, Word):
        return query
    try:
        # Prefer the existing corpus entry when one matches the query.
        return corpus.corpus.find(query)
    except KeyError:
        # Not in the corpus: build a Word from the raw string instead.
        if trans_delimiter == '':
            segments = list(query)
        else:
            segments = query.split(trans_delimiter)
        return Word(**{sequence_type: segments})
示例#4
0
    def test_basic(self):
        """A feature matrix can be attached to a freshly built corpus."""
        corpus = Corpus('test')
        for word_info in self.corpus_basic_info:
            corpus.add_word(Word(**word_info))

        corpus.set_feature_matrix(FeatureMatrix('test', self.feature_basic_info))
示例#5
0
    def test_homographs(self):
        """Check that find() returns every homograph for a shared spelling."""
        # Disabled pending a decision on the find() API (should it return an
        # iterable of homographs?).  Previously short-circuited with a bare
        # ``return``, which silently hid the dead code below; skipTest makes
        # the disabled state visible in the test report.
        self.skipTest('find() does not yet return an iterable of homographs')
        corpus = Corpus('test')
        for w in self.homograph_info:
            corpus.add_word(Word(**w))

        self.assertEqual([x.spelling for x in corpus.find('a')], ['a', 'a'])
示例#6
0
    def test_basic(self):
        """Words become retrievable after add_word and populate the inventory."""
        corpus = Corpus('test')
        for info in self.basic_info:
            spelling = info['spelling']
            # Unknown words raise KeyError before insertion...
            self.assertRaises(KeyError, corpus.find, spelling, True)
            corpus.add_word(Word(**info))
            # ...and are reachable via indexing, find(), and containment after.
            self.assertEqual(corpus[spelling], Word(**info))
            self.assertEqual(corpus.find(spelling), Word(**info))
            self.assertTrue(spelling in corpus)

        # The inventory holds the word boundary '#' plus every segment seen.
        expected = {symbol: Segment(symbol)
                    for symbol in ('#', 'a', 'b', 'c', 'd')}
        self.assertEqual(corpus.inventory._data, expected)
def test_init():
    """A Discourse accepts a sequence of adjacent word tokens."""
    # (begin, end, spelling, transcription) for each token in order.
    specs = [
        (0, 1, 'a', ['a', 'b']),
        (1, 2, 'c', ['c', 'a', 'b']),
        (2, 3, 'a', ['a', 'b']),
        (3, 4, 'd', ['a', 'd']),
    ]
    d = Discourse()
    for begin, end, spelling, transcription in specs:
        word = Word(spelling=spelling, transcription=transcription)
        d.add_word(WordToken(begin=begin, end=end, word=word))
示例#8
0
    def test_spelling_only_word(self):
        """A word built without a transcription keeps transcription == None."""
        data = self.spelling_only
        word = Word(**data)

        self.assertEqual(word.transcription, None)
        self.assertEqual(word.frequency, float(data['frequency']))
        self.assertEqual(word.spelling, data['spelling'])
        self.assertRaises(AttributeError, getattr, word, 'tier1')
def test_basic_corpus_mutation_minpairs(specified_test_corpus):
    """find_mutation_minpairs locates the two single-segment neighbours."""
    query = Word(**{'transcription': ['s', 'ɑ', 't', 'ɑ']})
    expected_count = 2
    expected_pairs = ['n.ɑ.t.ɑ', 'm.ɑ.t.ɑ']

    with CanonicalVariantContext(specified_test_corpus, 'transcription', 'type') as ctx:
        result = find_mutation_minpairs(ctx, query=query)
        assert(result[0] == expected_count)
        assert(sorted(result[1]) == sorted(expected_pairs))
示例#10
0
    def test_coverage(self):
        """check_coverage reports segments missing from the feature matrix."""
        corpus = Corpus('test')
        for info in self.corpus_basic_info:
            corpus.add_word(Word(**info))

        # This matrix deliberately lacks an entry for 'd'.
        corpus.set_feature_matrix(FeatureMatrix('test', self.feature_no_d_info))

        self.assertEqual(corpus.check_coverage(), ['d'])
示例#11
0
    def test_no_freq_word(self):
        """Frequency defaults to 0 when absent from the input mapping."""
        data = self.no_freq
        word = Word(**data)

        self.assertEqual(str(word.transcription),
                         '.'.join(data['transcription']))
        self.assertEqual(word.frequency, 0)
        self.assertEqual(word.spelling, data['spelling'])
        self.assertRaises(AttributeError, getattr, word, 'tier1')
示例#12
0
    def test_trans_only_word(self):
        """Spelling is derived from the transcription when not supplied."""
        data = self.trans_only
        word = Word(**data)

        self.assertEqual(str(word.transcription),
                         '.'.join(data['transcription']))
        self.assertEqual(word.frequency, float(data['frequency']))
        self.assertEqual(word.spelling, 'abcd')
        self.assertRaises(AttributeError, getattr, word, 'tier1')
示例#13
0
    def test_basic_word(self):
        """All core fields round-trip through Word construction."""
        data = self.basic
        word = Word(**data)

        self.assertEqual(str(word.transcription),
                         '.'.join(data['transcription']))
        self.assertEqual(word.frequency, float(data['frequency']))
        self.assertEqual(word.spelling, data['spelling'])
        self.assertRaises(AttributeError, getattr, word, 'tier1')
示例#14
0
    def test_feats_to_segs(self):
        """features_to_segments maps a feature spec to its matching segments."""
        corpus = Corpus('test')
        for info in self.corpus_basic_info:
            corpus.add_word(Word(**info))

        corpus.set_feature_matrix(FeatureMatrix('test', self.feature_basic_info))

        matching = corpus.features_to_segments(['+feature1'])
        self.assertEqual(sorted(matching), sorted(['a', 'b']))
示例#15
0
    def test_tiered_word(self):
        """Tier attributes supplied at construction are exposed directly."""
        data = self.tiered
        word = Word(**data)

        self.assertEqual(str(word.transcription),
                         '.'.join(data['transcription']))
        self.assertEqual(word.frequency, float(data['frequency']))
        self.assertEqual(word.spelling, data['spelling'])
        self.assertEqual(word.tier1, data['tier1'])
        self.assertEqual(word.tier2, data['tier2'])
示例#16
0
    def test_extra_word(self):
        """Arbitrary extra attributes are stored on the word unchanged."""
        data = self.extra
        word = Word(**data)

        self.assertEqual(str(word.transcription),
                         '.'.join(data['transcription']))
        self.assertEqual(word.spelling, data['spelling'])
        self.assertEqual(word.frequency, float(data['frequency']))
        self.assertEqual(word.num_sylls, float(data['num_sylls']))
        self.assertEqual(word.some_other_label, data['some_other_label'])
示例#17
0
def load_words_neighden(path):
    """Load a neighborhood-density word list from *path*.

    Each non-blank line holds a spelling optionally followed by a
    period-delimited transcription.  Two-field lines become Word objects;
    single-field lines are kept as bare spelling strings.

    Parameters
    ----------
    path : str
        Full path to the word-list text file.

    Returns
    -------
    list
        Mix of Word objects and spelling strings, in file order.
    """
    output = []
    with open(path, 'r') as f:
        for line in f:
            # Whitespace split never yields empty strings, so the old
            # ``if x != ''`` filter was redundant.
            fields = line.split()
            if len(fields) > 1:
                output.append(Word(spelling=fields[0],
                                   transcription=fields[1].split('.')))
            elif len(fields) == 1:
                output.append(fields[0])
            # Blank lines are skipped.
    return output
示例#18
0
    def setUp(self):
        """Build a four-word corpus with a binary two-feature system."""
        word_specs = [('a', ['a', 'b']),
                      ('b', ['a', 'b']),
                      ('c', ['c', 'a', 'b']),
                      ('d', ['a', 'd'])]
        self.corpus_info = [{'spelling': spelling,
                             'transcription': transcription,
                             'frequency': 32.0}
                            for spelling, transcription in word_specs]

        feature_specs = [('a', '+', '+'),
                         ('b', '+', '-'),
                         ('c', '-', '+'),
                         ('d', '-', '-')]
        self.feature_info = [{'symbol': symbol,
                              'feature1': f1,
                              'feature2': f2}
                             for symbol, f1, f2 in feature_specs]

        self.corpus = Corpus('test')
        for info in self.corpus_info:
            self.corpus.add_word(Word(**info))

        self.corpus.set_feature_matrix(FeatureMatrix('test', self.feature_info))
        self.corpus.inventory.update_features(self.corpus.specifier)
示例#19
0
    def test_add_tier(self):
        """add_tier creates a derived attribute; remove_attribute deletes it."""
        corpus = Corpus('test')
        for info in self.corpus_basic_info:
            corpus.add_word(Word(**info))

        corpus.set_feature_matrix(FeatureMatrix('test', self.feature_basic_info))

        corpus.add_tier('t', '+feature1')
        # For 'd' only the first segment carries +feature1.
        self.assertEqual(corpus['d'].t, [corpus['d'].transcription[0]])

        corpus.remove_attribute('t')
        self.assertRaises(AttributeError, getattr, corpus['d'], 't')
示例#20
0
    def accept(self):
        """Validate the dialog fields, build ``self.word``, and close.

        Iterates the corpus attributes, pulling each value from the
        corresponding edit widget.  On invalid input a critical message box
        is shown and the dialog stays open (early return).
        """
        kwargs = {}

        for a in self.corpus.attributes:
            if a.att_type == 'tier':
                text = self.edits['transcription'].text()
                if text == 'Empty':
                    text = ''
                kwargs[a.name] = [x for x in text.split('.') if x != '']

                # Validate each distinct segment against the inventory
                # (dict.fromkeys de-duplicates while preserving order).
                for seg in list(dict.fromkeys(kwargs[a.name])):
                    if seg not in self.inventory.segs:
                        # Fixed: message previously read "can only contain only
                        # symbols" and called .format() with no placeholder.
                        QMessageBox.critical(self,
                                             'Invalid information',
                                             'The transcription can only contain symbols '
                                             'from the corpus\' inventory.')
                        return
            elif a.att_type == 'spelling':
                kwargs[a.name] = self.edits['spelling'].text()
                if kwargs[a.name] == '':
                    kwargs[a.name] = None
                if a.name != 'Spelling':
                    # Remember which non-default column acts as the spelling.
                    kwargs['_spelling_name'] = a.name
            elif a.att_type == 'numeric' and (hasattr(a, 'is_freq') or a.display_name == 'Frequency'):
                try:
                    kwargs[a.name] = float(self.edits['frequency'].text())
                except ValueError:
                    QMessageBox.critical(self,
                                         "Invalid information",
                                         "The column '{}' must be a number.".format(str(a)))
                    return
                if a.name != 'Frequency':
                    # Remember which non-default column acts as the frequency.
                    kwargs['_freq_name'] = a.name

            elif a.att_type == 'factor':
                kwargs[a.name] = self.edits[a.name].text()
        self.word = Word(**kwargs)
        QDialog.accept(self)
示例#21
0
def test_corpus_model(qtbot, specified_test_corpus, settings):
    """Exercise CorpusModel column management, word replacement and filtering.

    NOTE: the steps below are order-dependent — each addColumn/addTier call
    mutates the model's column list before the next assertion.
    """
    model = CorpusModel(specified_test_corpus, settings)
    # Default columns in order: Spelling, Transcription, Frequency.
    assert(model.headerData(0,Qt.Horizontal,Qt.DisplayRole) == 'Spelling')
    assert(model.headerData(1,Qt.Horizontal,Qt.DisplayRole) == 'Transcription')
    assert(model.headerData(2,Qt.Horizontal,Qt.DisplayRole) == 'Frequency')

    # Adding a spelling column appends it as the fourth column.
    a = Attribute('test', 'spelling','Test2')

    model.addColumn(a)
    assert(model.headerData(3,Qt.Horizontal,Qt.DisplayRole) == 'Test2')

    # Removing by display name restores the three default columns.
    model.removeAttributes(['Test2'])
    assert(len(model.columns) == 3)

    a = Attribute('test','factor','Test')

    # Abstract tier: maps t/m to class 'C'; 'atema' has two such segments.
    model.addAbstractTier(a, {'C':['t','m']})
    assert(model.wordObject(0).test == 'CC')
    model.removeAttributes(['Test'])

    a = Attribute('test','numeric','Test')

    # Count column: number of t/m segments in the transcription.
    model.addCountColumn(a, 'transcription', ['t','m'])
    assert(model.wordObject(0).test == 2)
    model.removeAttributes(['Test'])

    a = Attribute('test','tier','Test')

    # Plain tier: the matching segments themselves.
    model.addTier(a, ['t','m'])
    assert(model.wordObject(0).test == ['t','m'])
    model.removeAttributes(['Test'])

    # Replacing row 0 with an empty-transcription word makes it non-lexical,
    # so hideNonLexical(True) should filter it out of view.
    w = model.wordObject(0)
    assert(w.spelling == 'atema')
    w = Word(spelling = 'atema', transcription = [])
    model.replaceWord(0, w)
    w = model.wordObject(0)
    assert(w.spelling == 'atema' and w.transcription == [])
    model.hideNonLexical(True)
    w = model.wordObject(0)
    assert(w.spelling != 'atema')
    model.hideNonLexical(False)
    w = model.wordObject(0)
    assert(w.spelling == 'atema')
示例#22
0
    def accept(self):
        """Validate the dialog fields, build ``self.word``, and close.

        Each corpus attribute is read from its edit widget (keyed by
        attribute name).  On invalid input a critical message box is shown
        and the dialog stays open (early return).
        """

        kwargs = {}

        for a in self.corpus.attributes:
            if a.att_type == 'tier':
                text = self.edits[a.name].text()
                # 'Empty' is the widget's placeholder for no transcription.
                if text == 'Empty':
                    text = ''
                kwargs[a.name] = [x for x in text.split('.') if x != '']
                #if not kwargs[a.name]:
                #    reply = QMessageBox.critical(self,
                #            "Missing information", "Words must have a Transcription.".format(str(a)))
                #    return

                # Reject any segment not present in the corpus inventory.
                for i in kwargs[a.name]:
                    if i not in self.corpus.inventory:
                        reply = QMessageBox.critical(
                            self, "Invalid information",
                            "The column '{}' must contain only symbols in the corpus' inventory."
                            .format(str(a)))
                        return
            elif a.att_type == 'spelling':
                kwargs[a.name] = self.edits[a.name].text()
                # NOTE(review): compares against lowercase 'spelling', while a
                # sibling dialog compares against 'Spelling' — confirm which
                # casing attribute names actually use.
                if kwargs[a.name] == '' and a.name == 'spelling':
                    kwargs[a.name] = None
                #if not kwargs[a.name] and a.name == 'spelling':
                #    reply = QMessageBox.critical(self,
                #            "Missing information", "Words must have a spelling.".format(str(a)))
                #    return
            elif a.att_type == 'numeric':
                try:
                    kwargs[a.name] = float(self.edits[a.name].text())
                except ValueError:
                    reply = QMessageBox.critical(
                        self, "Invalid information",
                        "The column '{}' must be a number.".format(str(a)))
                    return

            elif a.att_type == 'factor':
                kwargs[a.name] = self.edits[a.name].text()
        self.word = Word(**kwargs)
        QDialog.accept(self)
示例#23
0
def unspecified_test_corpus():
    """Build the 15-word fixture corpus used by unspecified-inventory tests."""
    # Segments: ɑ, i, u, e, o, ʃ, t, m, n, s (10 segments)
    entries = [
        ('atema', ['ɑ', 't', 'e', 'm', 'ɑ'], 11.0),
        ('enuta', ['e', 'n', 'u', 't', 'ɑ'], 11.0),
        ('mashomisi', ['m', 'ɑ', 'ʃ', 'o', 'm', 'i', 's', 'i'], 5.0),
        ('mata', ['m', 'ɑ', 't', 'ɑ'], 2.0),
        ('nata', ['n', 'ɑ', 't', 'ɑ'], 2.0),
        ('sasi', ['s', 'ɑ', 's', 'i'], 139.0),
        ('shashi', ['ʃ', 'ɑ', 'ʃ', 'i'], 43.0),
        ('shisata', ['ʃ', 'i', 's', 'ɑ', 't', 'ɑ'], 3.0),
        ('shushoma', ['ʃ', 'u', 'ʃ', 'o', 'm', 'ɑ'], 126.0),
        ('ta', ['t', 'ɑ'], 67.0),
        ('tatomi', ['t', 'ɑ', 't', 'o', 'm', 'i'], 7.0),
        ('tishenishu', ['t', 'i', 'ʃ', 'e', 'n', 'i', 'ʃ', 'u'], 96.0),
        ('toni', ['t', 'o', 'n', 'i'], 33.0),
        ('tusa', ['t', 'u', 's', 'ɑ'], 32.0),
        ('ʃi', ['ʃ', 'i'], 2.0),
    ]
    corpus = Corpus('test')
    for spelling, transcription, frequency in entries:
        corpus.add_word(Word(spelling=spelling,
                             transcription=transcription,
                             frequency=frequency))
    return corpus
示例#24
0
def load_corpus_csv(corpus_name, path, delimiter,
                    annotation_types = None,
                    feature_system_path = None,
                    stop_check = None, call_back = None):
    """
    Load a corpus from a column-delimited text file

    Parameters
    ----------
    corpus_name : str
        Informative identifier to refer to corpus
    path : str
        Full path to text file
    delimiter : str
        Character to use for spliting lines into columns
    annotation_types : list of AnnotationType, optional
        List of AnnotationType specifying how to parse text files
    feature_system_path : str
        Full path to pickled FeatureMatrix to use with the Corpus
    stop_check : callable, optional
        Optional function to check whether to gracefully terminate early
    call_back : callable, optional
        Optional function to supply progress information during the function

    Returns
    -------
    Corpus
        Corpus object generated from the text file

    """
    corpus = Corpus(corpus_name)
    if feature_system_path is not None and os.path.exists(feature_system_path):
        corpus.set_feature_matrix(load_binary(feature_system_path))

    if annotation_types is None:
        annotation_types, _ = inspect_csv(path, coldelim = delimiter)
    else:
        for a in annotation_types:
            if a.attribute.name == 'transcription' and a.attribute.att_type != 'tier':
                raise(CorpusIntegrityError(('The column \'{}\' is currently '
                                            'not being parsed as transcriptions '
                                            'despite its name.  Please ensure correct '
                                            'parsing for this column by changing its '
                                            '\'Annotation type\' in the parsing '
                                            'preview to the right.').format(a.name)))
    for a in annotation_types:
        a.reset()

    with open(path, encoding='utf-8') as f:
        # The header line is only used to sanity-check the delimiter; the
        # annotation types themselves drive column parsing below.
        headers = f.readline().split(delimiter)
        if len(headers) == 1:
            # Fixed: message contained "\n\Check" (invalid escape sequence
            # leaving a stray backslash in the user-visible text).
            raise(DelimiterError(('Could not parse the corpus.\nCheck '
                                  'that the delimiter you typed in matches '
                                  'the one used in the file.')))
        headers = annotation_types
        for a in headers:
            corpus.add_attribute(a.attribute)
        trans_check = False

        for line in f.readlines():
            line = line.strip()
            if not line: #blank or just a newline
                continue
            d = {}
            for k, v in zip(headers, line.split(delimiter)):
                v = v.strip()
                if k.attribute.att_type == 'tier':
                    trans = parse_transcription(v, k)
                    # A multi-segment parse proves the transcription
                    # delimiter actually matched something in the file.
                    if not trans_check and len(trans) > 1:
                        trans_check = True
                    d[k.attribute.name] = (k.attribute, trans)
                else:
                    d[k.attribute.name] = (k.attribute, v)
            word = Word(**d)
            if word.transcription:
                # Transcriptions can have phonetic symbol delimiters (a period)
                if not word.spelling:
                    word.spelling = ''.join(map(str, word.transcription))

            corpus.add_word(word)
    if corpus.has_transcription and not trans_check:
        # Fixed: same "\n\Check" escape problem as the header error above.
        raise(DelimiterError(('Could not parse transcriptions with that delimiter. '
                              '\nCheck that the transcription delimiter you typed '
                              'in matches the one used in the file.')))

    # Called for its checks; the previously unused assignment was dropped.
    corpus.check_coverage()
    return corpus
示例#25
0
def load_discourse_multiple_files(corpus_name,
                                  word_path,
                                  phone_path,
                                  dialect,
                                  annotation_types=None,
                                  lexicon=None,
                                  feature_system_path=None,
                                  stop_check=None,
                                  call_back=None):
    """
    Load a discourse from a text file containing interlinear glosses

    Parameters
    ----------
    corpus_name : str
        Informative identifier to refer to corpus
    word_path : str
        Full path to words text file
    phone_path : str
        Full path to phones text file
    dialect : str
        Currently, only 'buckeye'
    annotation_types : list of AnnotationType, optional
        List of AnnotationType specifying how to parse the glosses.
        Auto-generated based on dialect.
    lexicon : Corpus, optional
        Corpus to store Discourse word information
    feature_system_path : str
        Full path to pickled FeatureMatrix to use with the Corpus
    stop_check : callable or None
        Optional function to check whether to gracefully terminate early
    call_back : callable or None
        Optional function to supply progress information during the loading

    Returns
    -------
    Discourse
        Discourse object generated from the text file

    Notes
    -----
    NOTE(review): although ``annotation_types`` defaults to None, it is
    iterated unconditionally below — callers apparently must always supply
    it; confirm against call sites.
    """

    # Discourse name is the word file's base name without extension.
    name = os.path.splitext(os.path.split(word_path)[1])[0]
    discourse_kwargs = {
        'name': name,
        'wav_path': find_wav_path(word_path),
        'other_attributes': list()
    }
    # Route each annotation type to the discourse slot it fills.
    for at in annotation_types:
        if at.name == 'Orthography (default)':
            discourse_kwargs['spelling_name'] = at.attribute  #.output_name
        elif at.name == 'Transcription (default)':
            discourse_kwargs[
                'transcription_name'] = at.attribute  #.output_name
        elif at.name == 'Other (character)' or at.attribute.att_type in (
                'tier', 'spelling'):
            discourse_kwargs['other_attributes'].append(at.attribute)
    discourse = Discourse(discourse_kwargs)
    words = read_words(word_path, dialect)
    # ind provides fallback begin/end positions when timestamps are missing.
    ind = 0
    for w in words:
        word_kwargs = {
            at.output_name: (at.attribute, w[at.output_name])
            for at in annotation_types
        }
        word = Word(**word_kwargs)
        word_token_kwargs = dict()
        for at in annotation_types:
            if at.ignored:
                continue
            word_token_kwargs[at.output_name] = (at.attribute,
                                                 w[at.output_name])
            # NOTE(review): re-assigned with the same value on every loop
            # iteration; harmless but could live outside the loop.
            word_token_kwargs['word'] = word
            if at.attribute.att_type == 'tier':
                if at.attribute.is_default:
                    begin = w['begin']
                    end = w['end']
                    # Fall back to the running index when times are absent.
                    word_token_kwargs[
                        'begin'] = begin if begin is not None else ind
                    word_token_kwargs[
                        'end'] = end if end is not None else ind + 1
                if at.token:
                    word_token_kwargs['_transcription'] = (at.attribute,
                                                           w['transcription'])
        word_token = WordToken(**word_token_kwargs)
        word.wordtokens.append(word_token)
        discourse.lexicon.add_word(word)
        discourse.add_word(word_token)
        ind += 1

    if feature_system_path is not None:
        feature_matrix = load_binary(feature_system_path)
        discourse.lexicon.set_feature_matrix(feature_matrix)
        # Upgrade older pickled specifiers to the current format.
        discourse.lexicon.specifier = modernize.modernize_specifier(
            discourse.lexicon.specifier)

    return discourse
示例#26
0
    def test_word_init(self):
        """Spelling and frequency are taken straight from the input mapping."""
        word = Word(**self.basic)
        self.assertEqual(word.spelling, self.basic['spelling'])
        self.assertEqual(word.frequency, float(self.basic['frequency']))

        self.assertRaises(AttributeError, getattr, word, 'tier1')
示例#27
0
def data_to_discourse2(corpus_name=None, wav_path=None, annotation_types=None, support_corpus=None, ignore_case=False,
                       call_back=None, stop_check=None):
    """Assemble a Discourse (running text plus lexicon) from parsed annotations.

    Collects per-annotation-type tuples of (value, begin, end), optionally
    fills transcriptions from *support_corpus*, then builds Word and
    WordToken objects row by row.  Returns the Discourse, or None if
    *stop_check* fires mid-way.
    """
    curr_word = list()
    # One (value, begin, end) list per annotation type.
    annotations = {at:list() for at in annotation_types}
    spelling_name, transcription_name = None, None
    if call_back is not None:
        call_back('Processing data...')
        cur = 0

    for at in annotation_types:
        if stop_check is not None and stop_check():
            return
        if call_back is not None:
            cur += 1
            call_back(cur)
        if all(isinstance(item, Annotation) for item in at._list):
            # it's a list of spellings, take each one and add it to the overall annotations list
            for item in at._list:
                if item.label:
                    annotations[at].append((item.label, None, None))

        # NOTE(review): exact-type check, so Annotation subclasses of
        # BaseAnnotation would not match here — confirm this is intended.
        elif all(type(item) == BaseAnnotation for item in at._list):
            #it's a list of transcriptions, with each segment as a BaseAnnotation
            for item in at._list:
                if item.begin is not None:
                    begin = item.begin
                if item.end is None:
                    curr_word.append(item)
                elif item.end is not None:
                    # An end time closes the current word's segment run.
                    end = item.end
                    curr_word.append(item)
                    curr_word = Transcription(curr_word)
                    annotations[at].append((curr_word, begin, end))
                    curr_word = list()
        else:
            print(at._list)
            raise TypeError("AnnotationType._list cannot contain a mix of Annotations and BaseAnnotations")

    if support_corpus is not None:
        # Look up each spelling in the support corpus to supply transcriptions.
        spellings = [value for key,value in annotations.items() if key.name=='Orthography (default)'][0]
        # NOTE(review): matches name 'Transcription' here but
        # 'Transcription (default)' elsewhere in this function — confirm the
        # annotation type is actually named 'Transcription' in this path.
        transcriptions = [key for key in annotations if key.name == 'Transcription'][0]
        for index, info in enumerate(spellings):
            spelling = info[0] #info[1] is the start time, info[2] is the end time (or else None)
            try:
                transcription = support_corpus.find(spelling, ignore_case=ignore_case).transcription
            except KeyError:
                # Retry without punctuation, then fall back to spelling the
                # word out symbol by symbol.
                try:
                    no_punctuation = ''.join([x for x in spelling if not x in string.punctuation])
                    transcription = support_corpus.find(no_punctuation, ignore_case=ignore_case).transcription
                except KeyError:
                    transcription = Transcription([symbol for symbol in spelling])
            annotations[transcriptions].append((transcription, index, index+1))


    # Route each annotation type to the discourse slot it fills.
    discourse_kwargs = {'name': corpus_name, 'wav_path': wav_path, 'other_attributes': list()}
    for at in annotation_types:
        if at.name == 'Orthography (default)':
            discourse_kwargs['spelling_name'] = at.attribute
        elif at.name == 'Transcription (default)':
            discourse_kwargs['transcription_name'] = at.attribute
        elif at.name == 'Other (character)' or at.attribute.att_type in ('tier', 'spelling'):
            discourse_kwargs['other_attributes'].append(at.attribute)

    # Default attribute objects when no explicit spelling/transcription type.
    if 'spelling_name' not in discourse_kwargs:
        discourse_kwargs['spelling_name'] = Attribute('Spelling', 'spelling', 'Spelling')
    if 'transcription_name' not in discourse_kwargs:
        discourse_kwargs['transcription_name'] = Attribute('Transcription', 'tier', 'Transcription')

    if stop_check is not None and stop_check():
        return
    if call_back is not None:
        cur += 1
        call_back(cur)

    discourse = Discourse(discourse_kwargs)

    if not 'Frequency' in [a.name for a in discourse.lexicon.attributes]:
        # running text will not have a frequency attribute supplied by the user
        # textgrids are also unlikely to have this attribute
        discourse.lexicon.add_attribute(Attribute('frequency', 'numeric', 'Frequency'))
        add_frequency = True
    else:
        add_frequency = False

    ind = 0
    # Iterate over rows up to the longest annotation list.
    limit = max([len(list(v)) for v in annotations.values()])
    for n in range(limit):
        if stop_check is not None and stop_check():
            return
        if call_back is not None:
            cur += 1
            call_back(cur)

        word_kwargs = dict()
        for at in annotations:
            if at.token or at.ignored:
                continue
            else:
                try:
                    word_kwargs[at.attribute.name] = (at.attribute, annotations[at][n][0])
                    #annotations[at][n] should be a tuple of (curr_word, begin, end) or (item_label, None, None)
                except IndexError:
                    word_kwargs[at.attribute.name] = (at.attribute, None)
        word = Word(**word_kwargs)
        try:
            # Reuse the existing lexicon entry; count this occurrence.
            word = discourse.lexicon.find(word.spelling)
            if add_frequency:
                word.frequency += 1
        except KeyError:
            discourse.lexicon.add_word(word)

        word_token_kwargs = dict()
        word_token_kwargs['word'] = word
        begin, end = None, None
        for at in annotations:
            if at.ignored:
                continue
            try:
                word_token_kwargs[at.attribute.name] = (at.attribute, annotations[at][n][0])
                # annotations[at][n] should be a tuple of (curr_word, begin, end) or (item_label, None, None)
            except IndexError:
                word_token_kwargs[at.attribute.name] = (at.attribute, None)
            #word_token_kwargs[at.output_name] = (at.attribute, annotations[at][n][0])
            if at.attribute.att_type == 'tier':
                if at.attribute.is_default:
                    begin = annotations[at][n][1]
                    end = annotations[at][n][2]
                    # NOTE(review): these two assignments are repeated
                    # unconditionally after the loop below, making this pair
                    # redundant — confirm before simplifying.
                    word_token_kwargs['begin'] = begin if begin is not None else ind
                    word_token_kwargs['end'] = end if end is not None else ind + 1
                if at.token:
                    word_token_kwargs['_transcription'] = (at.attribute, annotations[at][n][0])
        # Fall back to the running index when no timestamps were found.
        word_token_kwargs['begin'] = begin if begin is not None else ind
        word_token_kwargs['end'] = end if end is not None else ind + 1
        word_token = WordToken(**word_token_kwargs)
        discourse.add_word(word_token)
        if any(a.token for a in annotations):
            word.wordtokens.append(word_token)
        ind += 1
    return discourse
示例#28
0
def load_corpus_csv(corpus_name, path, delimiter,
                    trans_delimiter,
                    annotation_types = None,
                    feature_system_path = None,
                    stop_check = None, call_back = None):
    """
    Load a corpus from a column-delimited text file

    Parameters
    ----------
    corpus_name : str
        Informative identifier to refer to corpus
    path : str
        Full path to text file
    delimiter : str
        Character to use for spliting lines into columns
    trans_delimiter : str
        Character to use for spliting transcriptions into segments
    annotation_types : list of AnnotationType, optional
        List of AnnotationType specifying how to parse text files
    feature_system_path : str
        Full path to pickled FeatureMatrix to use with the Corpus
    stop_check : callable, optional
        Optional function to check whether to gracefully terminate early
    call_back : callable, optional
        Optional function to supply progress information during the function

    Returns
    -------
    Corpus
        Corpus object generated from the text file

    """
    corpus = Corpus(corpus_name)
    if feature_system_path is not None and os.path.exists(feature_system_path):
        corpus.set_feature_matrix(load_binary(feature_system_path))

    if annotation_types is None:
        annotation_types, best_delimiter = inspect_csv(path, coldelim = delimiter, transdelim=trans_delimiter)
    else:
        # Bug fix: best_delimiter was only bound in the branch above, so
        # supplying annotation_types raised NameError at the first split
        # below.  Fall back to the caller-provided column delimiter.
        best_delimiter = delimiter
        for a in annotation_types:
            if a.attribute.name == 'transcription' and a.attribute.att_type != 'tier':
                raise(CorpusIntegrityError(('The column \'{}\' is currently '
                                            'not being parsed as transcriptions '
                                            'despite its name.  Please ensure correct '
                                            'parsing for this column by changing its '
                                            '\'Annotation type\' in the parsing '
                                            'preview to the right.').format(a.name)))
    for a in annotation_types:
        a.reset()

    with open(path, encoding='utf-8') as f:
        # The header line is only used to sanity-check the delimiter; the
        # annotation types themselves drive column parsing below.
        headers = f.readline().split(best_delimiter)
        if len(headers) == 1:
            # Fixed: message contained "\n\Check" (invalid escape sequence
            # leaving a stray backslash in the user-visible text).
            raise(DelimiterError(('Could not parse the corpus.\nCheck '
                                  'that the delimiter you typed in matches '
                                  'the one used in the file.')))
        headers = annotation_types
        for a in headers:
            corpus.add_attribute(a.attribute)
        trans_check = False

        for line in f.readlines():
            line = line.strip()
            if not line: #blank or just a newline
                continue
            d = {}
            for k, v in zip(headers, line.split(best_delimiter)):
                v = v.strip()
                if k.attribute.att_type == 'tier':
                    trans = parse_transcription(v, k)
                    # A multi-segment parse proves the transcription
                    # delimiter actually matched something in the file.
                    if not trans_check and len(trans) > 1:
                        trans_check = True
                    d[k.attribute.name] = (k.attribute, trans)
                else:
                    d[k.attribute.name] = (k.attribute, v)
            word = Word(**d)
            if word.transcription:
                # Transcriptions can have phonetic symbol delimiters (a period)
                if not word.spelling:
                    word.spelling = ''.join(map(str, word.transcription))

            corpus.add_word(word)
    if corpus.has_transcription and not trans_check:
        # Fixed: same "\n\Check" escape problem as the header error above.
        raise(DelimiterError(('Could not parse transcriptions with that delimiter. '
                              '\nCheck that the transcription delimiter you typed '
                              'in matches the one used in the file.')))

    # Called for its checks; the previously unused assignment was dropped.
    corpus.check_coverage()
    return corpus