def load_discourse_textgrid(corpus_name, path, annotation_types,
                            feature_system_path = None, support_corpus_path = None,
                            stop_check = None, call_back = None):
    """
    Load a discourse from a TextGrid file

    Parameters
    ----------
    corpus_name : str
        Informative identifier to refer to corpus
    path : str
        Full path to TextGrid file
    annotation_types : list of AnnotationType
        List of AnnotationType specifying how to parse the TextGrids.
        Can be generated through ``inspect_discourse_textgrid``.
    support_corpus_path : str or Corpus, optional
        Full path to a pickled Corpus (or an already loaded Corpus) used to
        supply word information for the Discourse
    feature_system_path : str
        Full path to pickled FeatureMatrix to use with the Corpus
    stop_check : callable or None
        Optional function to check whether to gracefully terminate early
    call_back : callable or None
        Optional function to supply progress information during the loading

    Returns
    -------
    Discourse
        Discourse object generated from the TextGrid file
    """

    data = textgrid_to_data(corpus_name, path, annotation_types, call_back=call_back, stop_check=stop_check)
    #textgrid_to_data has side-effects that change annotation_types
    wav_path = find_wav_path(path)
    if support_corpus_path is not None:
        if isinstance(support_corpus_path, Corpus):
            #the corpus is 'preloaded' if this function is called by load_directory_textgrid
            #otherwise the corpus has to be loaded once per file in a directory, which could be slow
            support = support_corpus_path
        else:
            #otherwise, it's a string representing a path to the corpus
            support = load_binary(support_corpus_path)
    else:
        support = None
    discourse = data_to_discourse2(corpus_name, wav_path,
                                   annotation_types=annotation_types, support_corpus = support,
                                   stop_check=stop_check, call_back=call_back)

    if feature_system_path is not None:
        feature_matrix = load_binary(feature_system_path)
        discourse.lexicon.set_feature_matrix(feature_matrix)
        discourse.lexicon.specifier = modernize.modernize_specifier(discourse.lexicon.specifier)
    return discourse
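As a usage sketch (all paths here are hypothetical, the support corpus is assumed to have been pickled earlier with save_binary, and inspect_discourse_textgrid is assumed to take just the file path, as the docstring suggests):

annotation_types = inspect_discourse_textgrid('/path/to/recording.TextGrid')
discourse = load_discourse_textgrid('my_discourse',
                                    '/path/to/recording.TextGrid',
                                    annotation_types,
                                    feature_system_path='/path/to/features.feature',
                                    support_corpus_path='/path/to/support_lexicon.corpus')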
def load_directory_ilg(corpus_name, path, annotation_types,
                        feature_system_path = None,
                        stop_check = None, call_back = None):
    """
    Loads a directory of interlinear gloss text files

    Parameters
    ----------
    corpus_name : str
        Name of corpus
    path : str
        Path to directory of text files
    annotation_types : list of AnnotationType
        List of AnnotationType specifying how to parse the glosses.
        Can be generated through ``inspect_discourse_ilg``.
    feature_system_path : str, optional
        File path of FeatureMatrix binary to specify segments
    stop_check : callable or None
        Optional function to check whether to gracefully terminate early
    call_back : callable or None
        Optional function to supply progress information during the loading

    Returns
    -------
    SpontaneousSpeechCorpus
        Corpus containing Discourses corresponding to the text files
    """
    if call_back is not None:
        call_back('Finding files...')
        call_back(0, 0)
    file_tuples = []
    for root, subdirs, files in os.walk(path):
        for filename in files:
            if not filename.lower().endswith('.txt'):
                continue
            file_tuples.append((root, filename))
    if call_back is not None:
        call_back('Parsing files...')
        call_back(0,len(file_tuples))
        cur = 0
    corpus = SpontaneousSpeechCorpus(corpus_name, path)
    for i, t in enumerate(file_tuples):
        if stop_check is not None and stop_check():
            return
        if call_back is not None:
            call_back('Parsing file {} of {}...'.format(i+1,len(file_tuples)))
            call_back(i)
        root, filename = t
        name = os.path.splitext(filename)[0]
        d = load_discourse_ilg(name, os.path.join(root,filename),
                                    annotation_types, corpus.lexicon,
                                    None,
                                    stop_check, call_back)
        corpus.add_discourse(d)

    if feature_system_path is not None:
        feature_matrix = load_binary(feature_system_path)
        corpus.lexicon.set_feature_matrix(feature_matrix)
        corpus.lexicon.specifier = modernize.modernize_specifier(corpus.lexicon.specifier)
    return corpus
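A sketch of loading a whole directory of .txt gloss files with console progress reporting; the paths are hypothetical and annotation_types is assumed to come from inspect_discourse_ilg run on a representative file:

annotation_types = inspect_discourse_ilg('/path/to/ilg_files/story01.txt')

def print_progress(*args):
    # the loader passes either a status string or progress integers
    print(*args)

corpus = load_directory_ilg('gloss_corpus', '/path/to/ilg_files', annotation_types,
                            call_back=print_progress)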
def test_save(export_test_dir, unspecified_test_corpus):
    save_path = os.path.join(export_test_dir, 'testsave.corpus')
    save_binary(unspecified_test_corpus,save_path)

    c = load_binary(save_path)

    assert(unspecified_test_corpus == c)
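The export_test_dir and unspecified_test_corpus fixtures are defined elsewhere in the test suite; a plausible sketch of the directory fixture (an assumption, not necessarily the project's actual fixture) is just a temporary directory:

import pytest

@pytest.fixture
def export_test_dir(tmp_path):
    # pytest's built-in tmp_path gives a per-test temporary directory
    return str(tmp_path)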
Example #4
def test_save(export_test_dir, unspecified_test_corpus):
    save_path = os.path.join(export_test_dir, 'testsave.corpus')
    save_binary(unspecified_test_corpus, save_path)

    c = load_binary(save_path)

    assert (unspecified_test_corpus == c)
Example #5
def load_directory_ilg(corpus_name, path, annotation_types,
                        feature_system_path = None,
                        stop_check = None, call_back = None):
    """
    Loads a directory of interlinear gloss text files

    Parameters
    ----------
    corpus_name : str
        Name of corpus
    path : str
        Path to directory of text files
    annotation_types : list of AnnotationType
        List of AnnotationType specifying how to parse the glosses.
        Can be generated through ``inspect_discourse_ilg``.
    feature_system_path : str, optional
        File path of FeatureMatrix binary to specify segments
    stop_check : callable or None
        Optional function to check whether to gracefully terminate early
    call_back : callable or None
        Optional function to supply progress information during the loading

    Returns
    -------
    SpontaneousSpeechCorpus
        Corpus containing Discourses corresponding to the text files
    """
    if call_back is not None:
        call_back('Finding files...')
        call_back(0, 0)
    file_tuples = []
    for root, subdirs, files in os.walk(path):
        for filename in files:
            if not filename.lower().endswith('.txt'):
                continue
            file_tuples.append((root, filename))
    if call_back is not None:
        call_back('Parsing files...')
        call_back(0,len(file_tuples))
        cur = 0
    corpus = SpontaneousSpeechCorpus(corpus_name, path)
    for i, t in enumerate(file_tuples):
        if stop_check is not None and stop_check():
            return
        if call_back is not None:
            call_back('Parsing file {} of {}...'.format(i+1,len(file_tuples)))
            call_back(i)
        root, filename = t
        name = os.path.splitext(filename)[0]
        d = load_discourse_ilg(name, os.path.join(root,filename),
                                    annotation_types, corpus.lexicon,
                                    None,
                                    stop_check, call_back)
        corpus.add_discourse(d)

    if feature_system_path is not None:
        feature_matrix = load_binary(feature_system_path)
        corpus.lexicon.set_feature_matrix(feature_matrix)
    return corpus
Example #6
def load_discourse_textgrid(corpus_name,
                            path,
                            annotation_types,
                            feature_system_path=None,
                            stop_check=None,
                            call_back=None):
    """
    Load a discourse from a TextGrid file

    Parameters
    ----------
    corpus_name : str
        Informative identifier to refer to corpus
    path : str
        Full path to TextGrid file
    annotation_types : list of AnnotationType
        List of AnnotationType specifying how to parse the TextGrids.
        Can be generated through ``inspect_discourse_textgrid``.
    feature_system_path : str
        Full path to pickled FeatureMatrix to use with the Corpus
    stop_check : callable or None
        Optional function to check whether to gracefully terminate early
    call_back : callable or None
        Optional function to supply progress information during the loading

    Returns
    -------
    Discourse
        Discourse object generated from the TextGrid file
    """

    data = textgrid_to_data(corpus_name,
                            path,
                            annotation_types,
                            call_back=call_back,
                            stop_check=stop_check)
    #textgrid_to_data has side-effects that change annotation_types
    wav_path = find_wav_path(path)
    discourse = data_to_discourse2(corpus_name,
                                   wav_path,
                                   annotation_types,
                                   stop_check=stop_check,
                                   call_back=call_back)

    # discourse = data_to_discourse(data, lexicon, call_back=call_back, stop_check=stop_check)
    # discourse is a Discourse object, see corpus\classes\spontaneous.py
    if discourse is None:
        return
    if feature_system_path is not None:
        feature_matrix = load_binary(feature_system_path)
        discourse.lexicon.set_feature_matrix(feature_matrix)
        discourse.lexicon.specifier = modernize.modernize_specifier(
            discourse.lexicon.specifier)

    return discourse
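A minimal sketch for this variant, which takes no support corpus; the path is hypothetical and inspect_discourse_textgrid is assumed to return the annotation types directly from the file path:

tg_path = '/path/to/recording.TextGrid'
annotation_types = inspect_discourse_textgrid(tg_path)
discourse = load_discourse_textgrid('my_discourse', tg_path, annotation_types)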
Example #7
def load_discourse_ilg(corpus_name,
                       path,
                       annotation_types,
                       lexicon=None,
                       feature_system_path=None,
                       stop_check=None,
                       call_back=None):
    """
    Load a discourse from a text file containing interlinear glosses

    Parameters
    ----------
    corpus_name : str
        Informative identifier to refer to corpus
    path : str
        Full path to text file
    annotation_types : list of AnnotationType
        List of AnnotationType specifying how to parse the glosses.
        Can be generated through ``inspect_discourse_ilg``.
    lexicon : Corpus, optional
        Corpus to store Discourse word information
    feature_system_path : str
        Full path to pickled FeatureMatrix to use with the Corpus
    stop_check : callable or None
        Optional function to check whether to gracefully terminate early
    call_back : callable or None
        Optional function to supply progress information during the loading

    Returns
    -------
    Discourse
        Discourse object generated from the text file
    """
    data = ilg_to_data(corpus_name, path, annotation_types, stop_check,
                       call_back)
    #discourse = data_to_discourse(data, lexicon, call_back=call_back, stop_check=stop_check)
    discourse = data_to_discourse2(corpus_name=corpus_name,
                                   annotation_types=annotation_types,
                                   stop_check=stop_check,
                                   call_back=call_back)

    if discourse is None:
        return

    if feature_system_path is not None:
        feature_matrix = load_binary(feature_system_path)
        discourse.lexicon.set_feature_matrix(feature_matrix)
        discourse.lexicon.specifier = modernize.modernize_specifier(
            discourse.lexicon.specifier)

    return discourse
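Usage sketch for a single interlinear gloss file (hypothetical paths; inspect_discourse_ilg is assumed to accept just the file path):

ilg_path = '/path/to/story01.txt'
annotation_types = inspect_discourse_ilg(ilg_path)
discourse = load_discourse_ilg('story01', ilg_path, annotation_types,
                               feature_system_path='/path/to/features.feature')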
Example #8
def load_discourse_multiple_files(corpus_name,
                                  word_path,
                                  phone_path,
                                  dialect,
                                  annotation_types=None,
                                  lexicon=None,
                                  feature_system_path=None,
                                  stop_check=None,
                                  call_back=None):
    """
    Load a discourse from paired words and phones text files

    Parameters
    ----------
    corpus_name : str
        Informative identifier to refer to corpus
    word_path : str
        Full path to words text file
    phone_path : str
        Full path to phones text file
    dialect : str
        One of 'buckeye' or 'timit'
    annotation_types : list of AnnotationType, optional
        List of AnnotationType specifying how to parse the glosses.
        Auto-generated based on dialect.
    lexicon : Corpus, optional
        Corpus to store Discourse word information
    feature_system_path : str
        Full path to pickled FeatureMatrix to use with the Corpus
    stop_check : callable or None
        Optional function to check whether to gracefully terminate early
    call_back : callable or None
        Optional function to supply progress information during the loading

    Returns
    -------
    Discourse
        Discourse object generated from the text file
    """
    data = multiple_files_to_data(word_path, phone_path, dialect,
                                  annotation_types, call_back, stop_check)
    data.name = corpus_name
    data.wav_path = find_wav_path(word_path)
    discourse = data_to_discourse(data, lexicon)

    if feature_system_path is not None:
        feature_matrix = load_binary(feature_system_path)
        discourse.lexicon.set_feature_matrix(feature_matrix)
    return discourse
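Usage sketch for a Buckeye-style word/phone file pair; since annotation_types is auto-generated from the dialect, only the paths (hypothetical here) are required:

discourse = load_discourse_multiple_files('s0101a',
                                          '/path/to/s0101a.words',
                                          '/path/to/s0101a.phones',
                                          'buckeye')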
Example #9
def load_discourse_multiple_files(corpus_name, word_path, phone_path, dialect,
                                    annotation_types = None,
                                    lexicon = None,
                                    feature_system_path = None,
                                    stop_check = None, call_back = None):
    """
    Load a discourse from paired words and phones text files

    Parameters
    ----------
    corpus_name : str
        Informative identifier to refer to corpus
    word_path : str
        Full path to words text file
    phone_path : str
        Full path to phones text file
    dialect : str
        One of 'buckeye' or 'timit'
    annotation_types : list of AnnotationType, optional
        List of AnnotationType specifying how to parse the glosses.
        Auto-generated based on dialect.
    lexicon : Corpus, optional
        Corpus to store Discourse word information
    feature_system_path : str
        Full path to pickled FeatureMatrix to use with the Corpus
    stop_check : callable or None
        Optional function to check whether to gracefully terminate early
    call_back : callable or None
        Optional function to supply progress information during the loading

    Returns
    -------
    Discourse
        Discourse object generated from the text file
    """
    data = multiple_files_to_data(word_path,phone_path, dialect,
                                    annotation_types,
                                    call_back, stop_check)
    data.name = corpus_name
    data.wav_path = find_wav_path(word_path)
    discourse = data_to_discourse(data, lexicon)

    if feature_system_path is not None:
        feature_matrix = load_binary(feature_system_path)
        discourse.lexicon.set_feature_matrix(feature_matrix)
    return discourse
Example #10
def load_discourse_ilg(corpus_name, path, annotation_types,
                    lexicon = None,
                    feature_system_path = None,
                    stop_check = None, call_back = None):
    """
    Load a discourse from a text file containing interlinear glosses

    Parameters
    ----------
    corpus_name : str
        Informative identifier to refer to corpus
    path : str
        Full path to text file
    annotation_types : list of AnnotationType
        List of AnnotationType specifying how to parse the glosses.
        Can be generated through ``inspect_discourse_ilg``.
    lexicon : Corpus, optional
        Corpus to store Discourse word information
    feature_system_path : str
        Full path to pickled FeatureMatrix to use with the Corpus
    stop_check : callable or None
        Optional function to check whether to gracefully terminate early
    call_back : callable or None
        Optional function to supply progress information during the loading

    Returns
    -------
    Discourse
        Discourse object generated from the text file
    """
    data = ilg_to_data(corpus_name, path, annotation_types,stop_check, call_back)
    #discourse = data_to_discourse(data, lexicon, call_back=call_back, stop_check=stop_check)
    discourse = data_to_discourse2(corpus_name=corpus_name, annotation_types=annotation_types,
                                   stop_check=stop_check, call_back=call_back)

    if discourse is None:
        return

    if feature_system_path is not None:
        feature_matrix = load_binary(feature_system_path)
        discourse.lexicon.set_feature_matrix(feature_matrix)
        discourse.lexicon.specifier = modernize.modernize_specifier(discourse.lexicon.specifier)

    return discourse
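Every loader in these examples accepts the same optional stop_check and call_back hooks. A sketch of how a caller (for example, a GUI thread) might wire them up before passing them to any of the load_* functions; the names below are illustrative only:

import threading

cancel_event = threading.Event()

def stop_check():
    # return True to ask the loader to stop early
    return cancel_event.is_set()

def call_back(*args):
    # the loaders pass either a status string or progress counts
    print('progress:', *args)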
Example #11
def load_discourse_textgrid(corpus_name,
                            path,
                            annotation_types,
                            lexicon=None,
                            feature_system_path=None,
                            stop_check=None,
                            call_back=None):
    """
    Load a discourse from a TextGrid file

    Parameters
    ----------
    corpus_name : str
        Informative identifier to refer to corpus
    path : str
        Full path to TextGrid file
    annotation_types : list of AnnotationType
        List of AnnotationType specifying how to parse the TextGrids.
        Can be generated through ``inspect_discourse_textgrid``.
    lexicon : Corpus, optional
        Corpus to store Discourse word information
    feature_system_path : str
        Full path to pickled FeatureMatrix to use with the Corpus
    stop_check : callable or None
        Optional function to check whether to gracefully terminate early
    call_back : callable or None
        Optional function to supply progress information during the loading

    Returns
    -------
    Discourse
        Discourse object generated from the TextGrid file
    """
    data = textgrid_to_data(path, annotation_types, call_back, stop_check)
    data.name = corpus_name
    data.wav_path = find_wav_path(path)
    discourse = data_to_discourse(data, lexicon)

    if feature_system_path is not None:
        feature_matrix = load_binary(feature_system_path)
        discourse.lexicon.set_feature_matrix(feature_matrix)
    return discourse
Example #12
def load_discourse_textgrid(corpus_name, path, annotation_types,
                            lexicon = None,
                            feature_system_path = None,
                            stop_check = None, call_back = None):
    """
    Load a discourse from a TextGrid file

    Parameters
    ----------
    corpus_name : str
        Informative identifier to refer to corpus
    path : str
        Full path to TextGrid file
    annotation_types : list of AnnotationType
        List of AnnotationType specifying how to parse the TextGrids.
        Can be generated through ``inspect_discourse_textgrid``.
    lexicon : Corpus, optional
        Corpus to store Discourse word information
    feature_system_path : str
        Full path to pickled FeatureMatrix to use with the Corpus
    stop_check : callable or None
        Optional function to check whether to gracefully terminate early
    call_back : callable or None
        Optional function to supply progress information during the loading

    Returns
    -------
    Discourse
        Discourse object generated from the TextGrid file
    """
    data = textgrid_to_data(path, annotation_types, call_back, stop_check)
    data.name = corpus_name
    data.wav_path = find_wav_path(path)
    discourse = data_to_discourse(data, lexicon)

    if feature_system_path is not None:
        feature_matrix = load_binary(feature_system_path)
        discourse.lexicon.set_feature_matrix(feature_matrix)
    return discourse
Example #13
def load_discourse_multiple_files(corpus_name,
                                  word_path,
                                  phone_path,
                                  dialect,
                                  annotation_types=None,
                                  lexicon=None,
                                  feature_system_path=None,
                                  stop_check=None,
                                  call_back=None):
    """
    Load a discourse from paired words and phones text files

    Parameters
    ----------
    corpus_name : str
        Informative identifier to refer to corpus
    word_path : str
        Full path to words text file
    phone_path : str
        Full path to phones text file
    dialect : str
        Currently, only 'buckeye'
    annotation_types : list of AnnotationType, optional
        List of AnnotationType specifying how to parse the glosses.
        Auto-generated based on dialect.
    lexicon : Corpus, optional
        Corpus to store Discourse word information
    feature_system_path : str
        Full path to pickled FeatureMatrix to use with the Corpus
    stop_check : callable or None
        Optional function to check whether to gracefully terminate early
    call_back : callable or None
        Optional function to supply progress information during the loading

    Returns
    -------
    Discourse
        Discourse object generated from the text file
    """

    name = os.path.splitext(os.path.split(word_path)[1])[0]
    discourse_kwargs = {
        'name': name,
        'wav_path': find_wav_path(word_path),
        'other_attributes': list()
    }
    for at in annotation_types:
        if at.name == 'Orthography (default)':
            discourse_kwargs['spelling_name'] = at.attribute  #.output_name
        elif at.name == 'Transcription (default)':
            discourse_kwargs[
                'transcription_name'] = at.attribute  #.output_name
        elif at.name == 'Other (character)' or at.attribute.att_type in (
                'tier', 'spelling'):
            discourse_kwargs['other_attributes'].append(at.attribute)
    discourse = Discourse(discourse_kwargs)
    words = read_words(word_path, dialect)
    ind = 0
    for w in words:
        word_kwargs = {
            at.output_name: (at.attribute, w[at.output_name])
            for at in annotation_types
        }
        word = Word(**word_kwargs)
        word_token_kwargs = dict()
        for at in annotation_types:
            if at.ignored:
                continue
            word_token_kwargs[at.output_name] = (at.attribute,
                                                 w[at.output_name])
            word_token_kwargs['word'] = word
            if at.attribute.att_type == 'tier':
                if at.attribute.is_default:
                    begin = w['begin']
                    end = w['end']
                    word_token_kwargs[
                        'begin'] = begin if begin is not None else ind
                    word_token_kwargs[
                        'end'] = end if end is not None else ind + 1
                if at.token:
                    word_token_kwargs['_transcription'] = (at.attribute,
                                                           w['transcription'])
        word_token = WordToken(**word_token_kwargs)
        word.wordtokens.append(word_token)
        discourse.lexicon.add_word(word)
        discourse.add_word(word_token)
        ind += 1

    if feature_system_path is not None:
        feature_matrix = load_binary(feature_system_path)
        discourse.lexicon.set_feature_matrix(feature_matrix)
        discourse.lexicon.specifier = modernize.modernize_specifier(
            discourse.lexicon.specifier)

    return discourse
def load_discourse_multiple_files(corpus_name, word_path, phone_path, dialect,
                                    annotation_types = None,
                                    lexicon = None,
                                    feature_system_path = None,
                                    stop_check = None, call_back = None):
    """
    Load a discourse from paired words and phones text files

    Parameters
    ----------
    corpus_name : str
        Informative identifier to refer to corpus
    word_path : str
        Full path to words text file
    phone_path : str
        Full path to phones text file
    dialect : str
        Currently, only 'buckeye'
    annotation_types : list of AnnotationType, optional
        List of AnnotationType specifying how to parse the glosses.
        Auto-generated based on dialect.
    lexicon : Corpus, optional
        Corpus to store Discourse word information
    feature_system_path : str
        Full path to pickled FeatureMatrix to use with the Corpus
    stop_check : callable or None
        Optional function to check whether to gracefully terminate early
    call_back : callable or None
        Optional function to supply progress information during the loading

    Returns
    -------
    Discourse
        Discourse object generated from the text file
    """

    name = os.path.splitext(os.path.split(word_path)[1])[0]
    discourse_kwargs = {'name': name, 'wav_path': find_wav_path(word_path), 'other_attributes': list()}
    for at in annotation_types:
        if at.name == 'Orthography (default)':
            discourse_kwargs['spelling_name'] = at.attribute#.output_name
        elif at.name == 'Transcription (default)':
            discourse_kwargs['transcription_name'] = at.attribute#.output_name
        elif at.name == 'Other (character)' or at.attribute.att_type in ('tier', 'spelling'):
            discourse_kwargs['other_attributes'].append(at.attribute)
    discourse = Discourse(discourse_kwargs)
    words = read_words(word_path, dialect)
    ind = 0
    for w in words:
        word_kwargs = {at.output_name: (at.attribute, w[at.output_name]) for at in annotation_types}
        word = Word(**word_kwargs)
        word_token_kwargs = dict()
        for at in annotation_types:
            if at.ignored:
                continue
            word_token_kwargs[at.output_name] = (at.attribute, w[at.output_name])
            word_token_kwargs['word'] = word
            if at.attribute.att_type == 'tier':
                if at.attribute.is_default:
                    begin = w['begin']
                    end = w['end']
                    word_token_kwargs['begin'] = begin if begin is not None else ind
                    word_token_kwargs['end'] = end if end is not None else ind + 1
                if at.token:
                    word_token_kwargs['_transcription'] = (at.attribute, w['transcription'])
        word_token = WordToken(**word_token_kwargs)
        word.wordtokens.append(word_token)
        discourse.lexicon.add_word(word)
        discourse.add_word(word_token)
        ind += 1

    if feature_system_path is not None:
        feature_matrix = load_binary(feature_system_path)
        discourse.lexicon.set_feature_matrix(feature_matrix)
        discourse.lexicon.specifier = modernize.modernize_specifier(discourse.lexicon.specifier)

    return discourse
Example #15
def load_discourse_textgrid(corpus_name,
                            path,
                            annotation_types,
                            feature_system_path=None,
                            support_corpus_path=None,
                            stop_check=None,
                            call_back=None):
    """
    Load a discourse from a TextGrid file

    Parameters
    ----------
    corpus_name : str
        Informative identifier to refer to corpus
    path : str
        Full path to TextGrid file
    annotation_types : list of AnnotationType
        List of AnnotationType specifying how to parse the TextGrids.
        Can be generated through ``inspect_discourse_textgrid``.
    support_corpus_path : str or Corpus, optional
        Full path to a pickled Corpus (or an already loaded Corpus) used to
        supply word information for the Discourse
    feature_system_path : str
        Full path to pickled FeatureMatrix to use with the Corpus
    stop_check : callable or None
        Optional function to check whether to gracefully terminate early
    call_back : callable or None
        Optional function to supply progress information during the loading

    Returns
    -------
    Discourse
        Discourse object generated from the TextGrid file
    """

    data = textgrid_to_data(corpus_name,
                            path,
                            annotation_types,
                            call_back=call_back,
                            stop_check=stop_check)
    #textgrid_to_data has side-effects that change annotation_types
    wav_path = find_wav_path(path)
    if support_corpus_path is not None:
        if isinstance(support_corpus_path, Corpus):
            #the corpus is 'preloaded' if this function is called by load_directory_textgrid
            #otherwise the corpus has to be loaded once per file in a directory, which could be slow
            support = support_corpus_path
        else:
            #otherwise, it's a string representing a path to the corpus
            support = load_binary(support_corpus_path)
    else:
        support = None
    discourse = data_to_discourse2(corpus_name,
                                   wav_path,
                                   annotation_types=annotation_types,
                                   support_corpus=support,
                                   stop_check=stop_check,
                                   call_back=call_back)

    if feature_system_path is not None:
        feature_matrix = load_binary(feature_system_path)
        discourse.lexicon.set_feature_matrix(feature_matrix)
        discourse.lexicon.specifier = modernize.modernize_specifier(
            discourse.lexicon.specifier)
    return discourse
Example #16
def load_directory_multiple_files(corpus_name, path, dialect,
                                    annotation_types = None,
                                    feature_system_path = None,
                                    stop_check = None, call_back = None):
    """
    Loads a directory of corpus standard files (separated into words files
    and phones files)

    Parameters
    ----------
    corpus_name : str
        Name of corpus
    path : str
        Path to directory of text files
    dialect : str
        One of 'buckeye' or 'timit'
    annotation_types : list of AnnotationType, optional
        List of AnnotationType specifying how to parse the glosses.
        Auto-generated based on dialect.
    feature_system_path : str, optional
        File path of FeatureMatrix binary to specify segments
    stop_check : callable or None
        Optional function to check whether to gracefully terminate early
    call_back : callable or None
        Optional function to supply progress information during the loading

    Returns
    -------
    SpontaneousSpeechCorpus
        Corpus containing Discourses corresponding to the text files
    """
    if call_back is not None:
        call_back('Finding files...')
        call_back(0, 0)
    file_tuples = []
    for root, subdirs, files in os.walk(path):
        for filename in files:
            if stop_check is not None and stop_check():
                return
            if not (filename.lower().endswith('.words') or filename.lower().endswith('.wrd')):
                continue
            file_tuples.append((root, filename))
    if call_back is not None:
        call_back('Parsing files...')
        call_back(0,len(file_tuples))
        cur = 0
    corpus = SpontaneousSpeechCorpus(corpus_name, path)
    for i, t in enumerate(file_tuples):
        if stop_check is not None and stop_check():
            return
        if call_back is not None:
            call_back('Parsing file {} of {}...'.format(i+1, len(file_tuples)))
            call_back(i)
        root, filename = t
        name,ext = os.path.splitext(filename)
        if ext == '.words':
            phone_ext = '.phones'
        else:
            phone_ext = '.phn'
        word_path = os.path.join(root,filename)
        phone_path = os.path.splitext(word_path)[0] + phone_ext
        d = load_discourse_multiple_files(name, word_path, phone_path,
                                            dialect, annotation_types,
                                            corpus.lexicon, None,
                                            stop_check, None)
        corpus.add_discourse(d)

    if feature_system_path is not None:
        feature_matrix = load_binary(feature_system_path)
        corpus.lexicon.set_feature_matrix(feature_matrix)
    return corpus
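Directory-level sketch for Buckeye-style corpora (hypothetical paths); annotation types are generated per file from the dialect:

corpus = load_directory_multiple_files('buckeye_subset',
                                       '/path/to/buckeye_files',
                                       'buckeye',
                                       feature_system_path='/path/to/features.feature')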
Example #17
def check_feature_coverage_csv(corpus_name, path, delimiter, annotation_types=None, feature_system_path=None,
                               stop_check=None, call_back=None):

    feature_matrix = None  # stays None if no usable feature system is given
    if feature_system_path is not None and os.path.exists(feature_system_path):
        feature_matrix = load_binary(feature_system_path)
        feature_matrix = modernize.modernize_specifier(feature_matrix)

    if annotation_types is None:
        annotation_types, delimiter = inspect_csv(path, coldelim=delimiter)

    for a in annotation_types:
        a.reset()

    missing = set()

    with open(path, encoding='utf-8-sig') as f:
        headers = f.readline()
        headers = headers.split(delimiter)
        if len(headers) == 1:
            e = DelimiterError(('Could not parse the corpus.\nCheck that the column delimiter you typed in matches '
                                'the one used in the file.'))
            raise e
        headers = annotation_types

        for line in f.readlines():
            line = line.strip()
            if not line:
                continue

            for k, v in zip(headers, line.split(delimiter)):
                v = v.strip()
                if k.attribute.att_type == 'tier':
                    ignored = k.ignored_characters
                    if ignored is not None:
                        v = ''.join(x for x in v if x not in ignored)

                    sd = k.syllable_delimiter
                    if sd is not None:
                        syllables = v.split(sd)
                    else:
                        syllables = [v]

                    td = k.trans_delimiter
                    stress_spec = set(k.stress_specification.keys())
                    tone_spec = set(k.tone_specification.keys())
                    supra_spec = stress_spec.union(tone_spec)
                    for syllable in syllables:
                        syllable = ''.join(x for x in syllable if x not in supra_spec)

                        if td is None:
                            if k.digraph_pattern is not None:
                                string = k.digraph_pattern.findall(syllable)
                            else:
                                string = [x for x in syllable]
                        else:
                            string = syllable.split(td)

                        for seg in string:
                            if seg == '':
                                continue

                            if feature_matrix is not None and seg not in feature_matrix.segments:
                                missing.add(seg)

    print('In csv.py', missing)
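A sketch of calling the coverage check directly (hypothetical paths); any segment in the file that the pickled feature system does not cover ends up in the printed missing set:

check_feature_coverage_csv('lexicon', '/path/to/lexicon.txt', '\t',
                           feature_system_path='/path/to/features.feature')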
Example #18
def load_corpus_csv(corpus_name, path, delimiter,
                    annotation_types = None,
                    feature_system_path = None,
                    stop_check = None, call_back = None):
    """
    Load a corpus from a column-delimited text file

    Parameters
    ----------
    corpus_name : str
        Informative identifier to refer to corpus
    path : str
        Full path to text file
    delimiter : str
        Character to use for splitting lines into columns
    annotation_types : list of AnnotationType, optional
        List of AnnotationType specifying how to parse text files
    feature_system_path : str
        Full path to pickled FeatureMatrix to use with the Corpus
    stop_check : callable, optional
        Optional function to check whether to gracefully terminate early
    call_back : callable, optional
        Optional function to supply progress information during the function

    Returns
    -------
    Corpus
        Corpus object generated from the text file

    """
    check_feature_coverage_csv(corpus_name, path, delimiter, annotation_types, feature_system_path,
                               stop_check, call_back)

    corpus = Corpus(corpus_name)
    feature_matrix = None  # stays None if no usable feature system is given
    if feature_system_path is not None and os.path.exists(feature_system_path):
        feature_matrix = load_binary(feature_system_path)
        feature_matrix = modernize.modernize_specifier(feature_matrix)
        corpus.set_feature_matrix(feature_matrix)

    if annotation_types is None:
        annotation_types, delimiter = inspect_csv(path, coldelim=delimiter)

    for a in annotation_types:
        a.reset()

    if call_back is not None:
        call_back('Loading...')
        call_back(0, 0)
        cur = 0

    with open(path, encoding='utf-8-sig') as f:
        headers = f.readline()
        headers = headers.split(delimiter)
        if len(headers) == 1:
            e = DelimiterError(('Could not parse the corpus.\nCheck that the column delimiter you typed in matches '
                                'the one used in the file.'))
            raise e
        headers = annotation_types

        for a in headers:
            corpus.add_attribute(a.attribute)

        trans_check = True

        for line in f.readlines():
            if stop_check is not None and stop_check():
                return
            if call_back is not None:
                cur += 1
                call_back(cur)

            line = line.strip()
            if not line:  # blank or just a newline
                continue

            d = {}
            for k, v in zip(headers, line.split(delimiter)):
                v = v.strip()
                if k.attribute.att_type == 'tier':
                    trans = parse_transcription(v, k, feature_matrix=feature_matrix, corpus=corpus)  # trans is a list of BaseAnnotation
                    if not trans_check and len(trans) > 1:
                        trans_check = True
                    d[k.attribute.name] = (k.attribute, trans)
                else:
                    d[k.attribute.name] = (k.attribute, v)
            word = Word(**d)

            if word.transcription:
                #transcriptions can have phonetic symbol delimiters
                if not word.spelling:
                    word.spelling = ''.join(map(str, word.transcription))

            corpus.add_word(word, allow_duplicates=True)

    if corpus.specifier is not None:
        corpus.inventory.update_features(corpus.specifier)

    if corpus.has_transcription and any(len(word.transcription) > 1 for word in corpus):
        if not trans_check:
            e = DelimiterError(('Could not parse transcriptions with that delimiter. '
                            '\nCheck that the transcription delimiter you typed '
                            'in matches the one used in the file.'))
            raise e

    if stop_check is not None and stop_check():
        return

    return corpus
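An end-to-end sketch with a tiny comma-delimited file written on the fly; all names are hypothetical, and the pickled feature system is assumed to exist and cover the segments used:

with open('mini_corpus.txt', 'w', encoding='utf-8') as f:
    f.write('spelling,transcription,frequency\n')
    f.write('mata,m.a.t.a,14\n')
    f.write('nata,n.a.t.a,5\n')

corpus = load_corpus_csv('mini', 'mini_corpus.txt', ',',
                         feature_system_path='/path/to/features.feature')
for word in corpus:
    print(word.spelling, word.transcription)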
Example #19
def load_corpus_csv(corpus_name, path, delimiter,
                    annotation_types = None,
                    feature_system_path = None,
                    stop_check = None, call_back = None):
    """
    Load a corpus from a column-delimited text file

    Parameters
    ----------
    corpus_name : str
        Informative identifier to refer to corpus
    path : str
        Full path to text file
    delimiter : str
        Character to use for splitting lines into columns
    annotation_types : list of AnnotationType, optional
        List of AnnotationType specifying how to parse text files
    feature_system_path : str
        Full path to pickled FeatureMatrix to use with the Corpus
    stop_check : callable, optional
        Optional function to check whether to gracefully terminate early
    call_back : callable, optional
        Optional function to supply progress information during the function

    Returns
    -------
    Corpus
        Corpus object generated from the text file

    """
    #begin = time.time()
    corpus = Corpus(corpus_name)
    if feature_system_path is not None and os.path.exists(feature_system_path):
        feature_matrix = load_binary(feature_system_path)
        corpus.set_feature_matrix(feature_matrix)

    if annotation_types is None:
        annotation_types, _ = inspect_csv(path, coldelim = delimiter)
    else:
        for a in annotation_types:
            if a.attribute.name == 'transcription' and a.attribute.att_type != 'tier':
                raise(CorpusIntegrityError(('The column \'{}\' is currently '
                                            'not being parsed as transcriptions '
                                            'despite its name.  Please ensure correct '
                                            'parsing for this column by changing its '
                                            '\'Annotation type\' in the parsing '
                                            'preview to the right.').format(a.name)))
    for a in annotation_types:
        a.reset()

    with open(path, encoding='utf-8') as f:
        headers = f.readline()
        headers = headers.split(delimiter)
        if len(headers)==1:
            e = DelimiterError(('Could not parse the corpus.\nCheck '
                                'that the delimiter you typed in matches '
                                'the one used in the file.'))
            raise(e)
        headers = annotation_types
        for a in headers:
            corpus.add_attribute(a.attribute)
        trans_check = False

        for line in f.readlines():
            line = line.strip()
            if not line: #blank or just a newline
                continue
            d = {}
            for k,v in zip(headers,line.split(delimiter)):
                v = v.strip()
                if k.attribute.att_type == 'tier':
                    trans = parse_transcription(v, k)
                    if not trans_check and len(trans) > 1:
                        trans_check = True
                    d[k.attribute.name] = (k.attribute, trans)
                else:
                    d[k.attribute.name] = (k.attribute, v)
            word = Word(**d)
            if word.transcription:
                #transcriptions can have phonetic symbol delimiters which is a period
                if not word.spelling:
                    word.spelling = ''.join(map(str,word.transcription))

            corpus.add_word(word)
    if corpus.has_transcription and not trans_check:
        e = DelimiterError(('Could not parse transcriptions with that delimiter. '
                            '\nCheck that the transcription delimiter you typed '
                            'in matches the one used in the file.'))
        raise(e)

    transcription_errors = corpus.check_coverage()
    return corpus
Example #20
def load_corpus_csv(corpus_name, path, delimiter,
                    trans_delimiter,
                    annotation_types = None,
                    feature_system_path = None,
                    stop_check = None, call_back = None):
    """
    Load a corpus from a column-delimited text file

    Parameters
    ----------
    corpus_name : str
        Informative identifier to refer to corpus
    path : str
        Full path to text file
    delimiter : str
        Character to use for splitting lines into columns
    trans_delimiter : str
        Character to use for splitting transcriptions into segments
    annotation_types : list of AnnotationType, optional
        List of AnnotationType specifying how to parse text files
    feature_system_path : str
        Full path to pickled FeatureMatrix to use with the Corpus
    stop_check : callable, optional
        Optional function to check whether to gracefully terminate early
    call_back : callable, optional
        Optional function to supply progress information during the function

    Returns
    -------
    Corpus
        Corpus object generated from the text file

    """
    #begin = time.time()
    corpus = Corpus(corpus_name)
    if feature_system_path is not None and os.path.exists(feature_system_path):
        feature_matrix = load_binary(feature_system_path)
        corpus.set_feature_matrix(feature_matrix)

    if annotation_types is None:
        annotation_types, best_delimiter = inspect_csv(path, coldelim = delimiter, transdelim=trans_delimiter)
    else:
        best_delimiter = delimiter  # keep the supplied column delimiter
        for a in annotation_types:
            if a.attribute.name == 'transcription' and a.attribute.att_type != 'tier':
                raise(CorpusIntegrityError(('The column \'{}\' is currently '
                                            'not being parsed as transcriptions '
                                            'despite its name.  Please ensure correct '
                                            'parsing for this column by changing its '
                                            '\'Annotation type\' in the parsing '
                                            'preview to the right.').format(a.name)))
    for a in annotation_types:
        a.reset()

    with open(path, encoding='utf-8') as f:
        headers = f.readline()
        headers = headers.split(best_delimiter)
        if len(headers)==1:
            e = DelimiterError(('Could not parse the corpus.\nCheck '
                                'that the delimiter you typed in matches '
                                'the one used in the file.'))
            raise(e)
        headers = annotation_types
        for a in headers:
            corpus.add_attribute(a.attribute)
        trans_check = False

        for line in f.readlines():
            line = line.strip()
            if not line: #blank or just a newline
                continue
            d = {}
            for k,v in zip(headers,line.split(best_delimiter)):
                v = v.strip()
                if k.attribute.att_type == 'tier':
                    trans = parse_transcription(v, k)
                    if not trans_check and len(trans) > 1:
                        trans_check = True
                    d[k.attribute.name] = (k.attribute, trans)
                else:
                    d[k.attribute.name] = (k.attribute, v)
            word = Word(**d)
            if word.transcription:
                #transcriptions can have phonetic symbol delimiters which is a period
                if not word.spelling:
                    word.spelling = ''.join(map(str,word.transcription))

            corpus.add_word(word)
    if corpus.has_transcription and not trans_check:
        e = DelimiterError(('Could not parse transcriptions with that delimiter. '
                            '\nCheck that the transcription delimiter you typed '
                            'in matches the one used in the file.'))
        raise(e)

    transcription_errors = corpus.check_coverage()
    return corpus
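This older variant takes the transcription delimiter explicitly rather than inferring it; a short usage sketch with hypothetical paths and '.' separating segments:

corpus = load_corpus_csv('lexicon', '/path/to/lexicon.txt',
                         delimiter='\t', trans_delimiter='.')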
Example #21
def load_directory_multiple_files(corpus_name,
                                  path,
                                  dialect,
                                  annotation_types=None,
                                  feature_system_path=None,
                                  stop_check=None,
                                  call_back=None):
    """
    Loads a directory of corpus standard files (separated into words files
    and phones files)

    Parameters
    ----------
    corpus_name : str
        Name of corpus
    path : str
        Path to directory of text files
    dialect : str
        Currently only 'buckeye'
    annotation_types : list of AnnotationType, optional
        List of AnnotationType specifying how to parse the glosses.
        Auto-generated based on dialect.
    feature_system_path : str, optional
        File path of FeatureMatrix binary to specify segments
    stop_check : callable or None
        Optional function to check whether to gracefully terminate early
    call_back : callable or None
        Optional function to supply progress information during the loading

    Returns
    -------
    SpontaneousSpeechCorpus
        Corpus containing Discourses corresponding to the text files
    """
    if call_back is not None:
        call_back('Finding files...')
        call_back(0, 0)
    file_tuples = []

    for root, subdirs, files in os.walk(path):
        for filename in files:
            if stop_check is not None and stop_check():
                return
            if not (filename.lower().endswith('.words')
                    or filename.lower().endswith('.wrd')):
                continue
            file_tuples.append((root, filename))
    if call_back is not None:
        call_back('Parsing files...')
        call_back(0, len(file_tuples))
        cur = 0
    corpus = SpontaneousSpeechCorpus(corpus_name, path)
    for i, t in enumerate(file_tuples):
        if stop_check is not None and stop_check():
            return
        if call_back is not None:
            call_back('Parsing file {} of {}...'.format(
                i + 1, len(file_tuples)))
            call_back(i)
        root, filename = t
        name, ext = os.path.splitext(filename)
        if ext == '.words':
            phone_ext = '.phones'
        else:
            phone_ext = '.phn'
        word_path = os.path.join(root, filename)
        phone_path = os.path.splitext(word_path)[0] + phone_ext
        try:
            d = load_discourse_multiple_files(name, word_path, phone_path,
                                              dialect, annotation_types,
                                              corpus.lexicon,
                                              feature_system_path, stop_check,
                                              None)
            corpus.add_discourse(d)
        except ValueError:
            print('Error importing for participant ' + name)

    if feature_system_path is not None:
        feature_matrix = load_binary(feature_system_path)
        corpus.lexicon.set_feature_matrix(feature_matrix)
        corpus.lexicon.specifier = modernize.modernize_specifier(
            corpus.lexicon.specifier)

    return corpus
Example #22
def main():

    #### Parse command-line arguments
    parser = argparse.ArgumentParser(description = \
             'Phonological CorpusTools: neighborhood density CL interface')
    parser.add_argument('corpus_file_name', help='Name of corpus file')
    parser.add_argument('query', help='Word to query, or name of file including a list of words')
    parser.add_argument('-c', '--context_type', type=str, default='Canonical', help="How to deal with variable pronunciations. Options are 'Canonical', 'MostFrequent', 'SeparatedTokens', or 'Weighted'. See documentation for details.")
    parser.add_argument('-a', '--algorithm', default= 'edit_distance', help="The algorithm used to determine distance")
    parser.add_argument('-d', '--max_distance', type=int, default = 1, help="Maximum edit distance from the queried word to consider a word a neighbor.")
    parser.add_argument('-s', '--sequence_type', default = 'transcription', help="The name of the tier on which to calculate distance")
    parser.add_argument('-w', '--count_what', default ='type', help="If 'type', count neighbors in terms of their type frequency. If 'token', count neighbors in terms of their token frequency.")
    parser.add_argument('-e', '--trans_delimiter', default='', help="If not empty string, splits the query by this str to make a transcription/spelling list for the query's Word object.")
    parser.add_argument('-m', '--find_mutation_minpairs', action='store_true', help='This flag causes the script not to calculate neighborhood density, but rather to find minimal pairs---see documentation.')
    parser.add_argument('-q', '--force_quadratic_algorithm', action='store_true', help='This flag prevents PCT from using the more efficient linear-time algorithm for edit distance of 1 neighborhoods.')
    parser.add_argument('-o', '--outfile', help='Name of output file')

    args = parser.parse_args()

    ####

    try:
        home = os.path.expanduser('~')
        corpus = load_binary(os.path.join(home, 'Documents', 'PCT', 'CorpusTools', 'CORPUS', args.corpus_file_name))
    except FileNotFoundError:
        corpus = load_binary(args.corpus_file_name)
        
    if args.context_type == 'Canonical':
        corpus = CanonicalVariantContext(corpus, args.sequence_type, type_or_token=args.count_what)
    elif args.context_type == 'MostFrequent':
        corpus = MostFrequentVariantContext(corpus, args.sequence_type, type_or_token=args.count_what)
    elif args.context_type == 'SeparatedTokens':
        corpus = SeparatedTokensVariantContext(corpus, args.sequence_type, type_or_token=args.count_what)
    elif args.context_type == 'Weighted':
        corpus = WeightedVariantContext(corpus, args.sequence_type, type_or_token=args.count_what)

    if args.find_mutation_minpairs:
        query = ensure_query_is_word(args.query, corpus, args.sequence_type, args.trans_delimiter)
        matches = find_mutation_minpairs(corpus, query)
        for match in matches[1]:
            print(match)
        print('Total number of matches: {}'.format(str(matches[0])))
    else:
        try: # read query as a file name
            with open(args.query) as queryfile:
                queries = [line[0] for line in csv.reader(queryfile, delimiter='\t') if len(line) > 0]
                queries = [ensure_query_is_word(q, corpus, args.sequence_type, args.trans_delimiter) for q in queries]
            results = [neighborhood_density(corpus, q, algorithm = args.algorithm, max_distance = args.max_distance,
                                            force_quadratic=args.force_quadratic_algorithm) for q in queries]
            if args.outfile:
                with open(args.outfile, 'w') as outfile:
                    for q, r in zip(queries, results):
                        outfile.write('{}\t{}'.format(q, str(r[0])) + ''.join(['\t{}'.format(str(n)) for n in r[1]]) + '\n')
            else:
                raise Exception('In order to use a file of queries as input, you must provide an output file name using the option -o.')


        except FileNotFoundError: # read query as a single word
            query = ensure_query_is_word(args.query, corpus, args.sequence_type, args.trans_delimiter)
            result = neighborhood_density(corpus, query, algorithm = args.algorithm, max_distance = args.max_distance,
                                          force_quadratic=args.force_quadratic_algorithm)

            if args.outfile:
                with open(args.outfile, 'w') as outfile:
                    outfile.write('{}\t{}'.format(query, str(result[0])) + ''.join(['\t{}'.format(str(n)) for n in result[1]]))
            else:
                print('No output file name provided.')
                print('The neighborhood density of the given form is {}. For a list of neighbors, please provide an output file name.'.format(str(result[0])))
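The script above is meant to be run from the command line; as a sketch, it can also be driven programmatically by populating sys.argv before calling main() (file names below are hypothetical):

import sys

sys.argv = ['neighborhood_density.py', 'my_corpus.corpus', 'mata',
            '-a', 'edit_distance', '-d', '1', '-o', 'neighbors.txt']
main()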