def main():

    parser = argparse.ArgumentParser(
        description='Process arguments to create Celex dictionary.')

    parser.add_argument(
        '-C',
        '--Celex_dir',
        required=True,
        dest='celex_dir',
        help='Specify the directory containing the CELEX files.')
    parser.add_argument(
        '-r',
        '--reduced',
        action='store_true',
        dest='reduced',
        help='Specify if the function considers reduced phonological forms.')

    args = parser.parse_args()

    if not os.path.isdir(args.celex_dir):
        raise ValueError(
            "The folder you provided does not exist. Please, provide the path to an existing folder."
        )
    else:
        get_celex_dictionary(args.celex_dir, reduced=args.reduced)
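# A hypothetical command-line invocation of the script above (the module name is an
# assumption, not taken from the source):
#
#     python make_celex_dict.py --Celex_dir /data/celex --reduced
#
# which is equivalent to calling get_celex_dictionary('/data/celex', reduced=True).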
def get_cue_and_outcome_measures(associations,
                                 row_ids,
                                 col_ids,
                                 celex_dir,
                                 plot_path,
                                 uniphones,
                                 diphones,
                                 triphones,
                                 syllable,
                                 stress_marker,
                                 reduced=False):
    """
    :param associations:
    :param row_ids:
    :param col_ids:
    :param celex_dir:
    :param plot_path:
    :param uniphones:
    :param diphones:
    :param triphones:
    :param syllable:
    :param stress_marker:
    :return:
    """

    celex_dict = get_celex_dictionary(celex_dir, reduced)
    jaccard_values = jaccard(associations,
                             row_ids,
                             col_ids,
                             celex_dict,
                             plots_folder=plot_path,
                             stress_marker=stress_marker,
                             uniphone=uniphones,
                             diphone=diphones,
                             triphone=triphones,
                             syllable=syllable)
    outcome_values = outcome_measures(associations, col_ids, plot_path)
    cue_values = cue_measures(associations, row_ids, col_ids, plot_path)

    return cue_values, outcome_values, jaccard_values
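# A hedged usage sketch for get_cue_and_outcome_measures: judging from how jaccard()
# and cue_measures() are called, the associations matrix is assumed to have
# phonological cues on the rows and lexical outcomes on the columns, with row_ids and
# col_ids mapping each cue and outcome to its index. The toy matrix, the DISC-like cue
# labels, and all paths below are invented for illustration:
#
#     import numpy as np
#     associations = np.random.rand(3, 2)                # 3 cues x 2 outcomes
#     row_ids = {'#dO': 0, 'dOg': 1, 'Og#': 2}
#     col_ids = {'dog': 0, 'cat': 1}
#     cue_values, outcome_values, jaccard_values = get_cue_and_outcome_measures(
#         associations, row_ids, col_ids, '/data/celex', '/tmp/plots',
#         uniphones=False, diphones=False, triphones=True,
#         syllable=False, stress_marker=True)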
def make_test_set(corpus_file, celex_dir, pos_mapping, output_file,
                  ambiguous=False, new=False, reduced=False, stress_marker=False):

    """
    :param corpus_file:     the path to a .json file consisting of two lists, each of which in turn consists of
                            several lists of strings (words)
    :param celex_dir:       the path to the Celex directory
    :param pos_mapping:     a .txt file mapping CHILDES PoS tags to CELEX tags
    :param output_file:     the file where the test items will be printed
    :param ambiguous:       if True, words are returned that are tagged with multiple PoS tags in CELEX and all have
                            the same pronunciation
    :param new:             if True, words are returned that are listed in the CELEX dictionary but never appear in the
                            input corpus
    :param reduced:         a boolean specifying whether reduced phonological forms should be extracted from Celex
                            whenever possible (if set to True) or if standard phonological forms should be preserved
                            (if False)
    :param stress_marker:   a boolean indicating whether stress information from the test items should be preserved or
                            not. It is assumed that test items are all encoded for stress: setting this argument to
                            False causes the algorithm to discard the stress information. Secondary stress, marked with
                            ("), is always deleted. It is assumed that stress is encoded as (')
    :return test_set:       a set containing tuples (orthographic form, phonological form): orthographic forms - and
                            corresponding phonological forms - are added based on the input arguments ambiguous and new
    """

    corpus = json.load(open(corpus_file, 'r'))
    celex_dict = get_celex_dictionary(celex_dir, reduced=reduced)
    pos_tags = get_pos_mapping(pos_mapping)

    corpus_words = get_words_from_corpus(corpus)
    filtered_words = align_tag_set(corpus_words, pos_tags)
    celex_lemmas = lemmas2phon(celex_dict)
    lemmas_with_pos = {'|'.join([k, p]) for k in celex_lemmas for p in celex_lemmas[k]}

    if new:
        # only look at words that are in CELEX but not in the corpus, without considering PoS tags
        candidates = lemmas_with_pos - (lemmas_with_pos.intersection(filtered_words))
    else:
        # only look at words that are both in CELEX and in the corpus, without considering PoS tags
        candidates = lemmas_with_pos.intersection(filtered_words)

    ambiguous_words = set()
    unambiguous_words = set()

    candidates_copy = {el.split('|')[0] for el in candidates}
    total_words = len(candidates_copy) - 1
    check_points = {int(np.floor(total_words / 100 * n)): n for n in np.linspace(20, 100, 5)}

    for idx, word in enumerate(candidates_copy):
        pos_tags = set(celex_lemmas[word].keys())
        if len(pos_tags) > 1:
            phonetic_forms = defaultdict(set)
            for tag in pos_tags:
                # when the same orthographic form corresponds to two different phonetic forms (that differ depending on
                # the PoS tag), this dictionary has multiple keys, each paired to the PoS tag matching the phonetic form
                phonetic_forms[celex_lemmas[word][tag]].add(tag)

            if len(phonetic_forms) == 1:
                # if the set of phonetic forms has cardinality 1, then the word is pronounced the same regardless of the
                # PoS tag; add the word with all its PoS tags to the set of ambiguous words
                for tag in pos_tags:
                    test_item = derive_orthography_and_phonology(word, tag, celex_lemmas)
                    ambiguous_words.add(test_item)

            else:
                # if the cardinality of the set of phonetic forms is not 1, then there are pronunciation differences
                # for the same orthographic form depending on the PoS tag; if this is the case, find the pairs that
                # are ambiguous and the forms that are unambiguous, and add each to the test set according to the
                # parameter choice
                if len(pos_tags) == len(phonetic_forms):
                    # if there are as many different phonetic forms as there are PoS tags, there is no ambiguity
                    # once the PoS tag is considered, and the phonetic form of a word maps uniquely to a given
                    # combination of orthographic form and PoS tag; thus, add each word-PoS combination together
                    # with the corresponding phonological form to the set of unambiguous words
                    for tag in pos_tags:
                        test_item = derive_orthography_and_phonology(word, tag, celex_lemmas)
                        unambiguous_words.add(test_item)
                else:
                    # if there are fewer phonetic forms than PoS tags, then there is some ambiguity, meaning that two
                    # or more combinations of orthographic form and PoS tag are pronounced the same, making them
                    # indistinguishable from a purely phonetic perspective. There might be unambiguous forms, but it
                    # is not guaranteed, as there could be pairs of ambiguous orthographic forms: phonetic forms that
                    # pair with more than one PoS tag are ambiguous, those that pair with a single PoS tag are not.
                    for form, tags in phonetic_forms.items():
                        if len(tags) == 1:
                            test_item = derive_orthography_and_phonology(word, list(tags)[0], celex_lemmas)
                            unambiguous_words.add(test_item)
                        else:
                            for tag in tags:
                                test_item = derive_orthography_and_phonology(word, tag, celex_lemmas)
                                ambiguous_words.add(test_item)

        else:
            test_item = derive_orthography_and_phonology(word, list(pos_tags)[0], celex_lemmas)
            unambiguous_words.add(test_item)

        if idx in check_points:
            print(strftime("%Y-%m-%d %H:%M:%S") + ": %d%% of target words have been evaluated." % check_points[idx])

    test_set = set()
    words_to_evaluate = ambiguous_words if ambiguous else unambiguous_words
    print()
    print(strftime("%Y-%m-%d %H:%M:%S") + ": Started writing to file...")
    with open(output_file, "a+") as o:
        for word in words_to_evaluate:
            if word[0] in candidates:
                phon = word[1].replace("\"", "")
                if not stress_marker:
                    phon = phon.replace("'", "")
                test_set.add(phon)
                o.write("\t".join([word[0], phon]))
                o.write("\n")
    print(strftime("%Y-%m-%d %H:%M:%S") + ": ...finished writing to file.")

    return test_set
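# A minimal sketch (with invented words and tags) of the corpus .json layout that
# make_test_set and corpus_encoder below expect: two parallel lists of utterances,
# the first containing surface tokens, the second lemma~PoS strings joined by the
# separator character:
#
#     import json
#     toy_corpus = [
#         [['you', 'want', 'it'], ['look', 'here']],                  # tokens
#         [['you~pro', 'want~v', 'it~pro'], ['look~v', 'here~adv']]   # lemma~PoS
#     ]
#     json.dump(toy_corpus, open('toy_corpus.json', 'w'))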
def corpus_encoder(corpus_name,
                   celex_dir,
                   pos_mapping,
                   separator='~',
                   reduced=True,
                   outcomes='tokens',
                   uniphones=True,
                   diphones=False,
                   triphones=False,
                   syllables=False,
                   stress_marker=False,
                   boundaries=False):
    """
    :param corpus_name:     the path to a .json file containing transcripts of child-caregiver interactions extracted
                            from the CHILDES database. The json file consists of two lists of lists of the same length,
                            both containing words but encoded differently. The first is a list of tokens, i.e. surface
                            forms as they appear in the transcriptions; the second is a list of lemmas, i.e. words in
                            their dictionary form, without any inflectional morphology, together with their
                            Part-of-Speech tags, joined by a specific character (which can be specified with the
                            parameter 'separator')
    :param celex_dir:       a string indicating the path to the Celex directory containing files with phonological and
                            morphological information for words and lemmas ('epw.cd', 'epl.cd', 'emw.cd', 'eml.cd').
                            The function also checks whether this directory already contains the Celex dictionary: if
                            it is found, the dictionary is loaded and the function proceeds, otherwise the dictionary
                            is created.
    :param pos_mapping:     the path to a .txt file indicating the mapping between CHILDES and Celex PoS tags; it must
                            consist of two space-separated columns.
    :param separator:       the character that separates the word baseform from its PoS tag in the input corpus
    :param reduced:         a boolean specifying whether reduced phonological forms should be extracted from Celex
                            whenever possible (if set to True) or if standard phonological forms should be preserved
                            (if False)
    :param outcomes:        a string indicating which outcomes to use, whether 'tokens' (default) or 'lemmas'
    :param uniphones:       a boolean indicating whether single phonemes are to be considered while encoding input
                            utterances
    :param diphones:        a boolean indicating whether sequences of two phonemes are to be considered while
                            encoding input utterances
    :param triphones:       a boolean indicating whether sequences of three phonemes are to be considered while
                            encoding input utterances
    :param syllables:       a boolean indicating whether syllables are to be considered while encoding input
                            utterances
    :param stress_marker:   a boolean indicating whether stress markers from the phonological representations of Celex
                            need to be preserved or can be discarded
    :param boundaries:      a boolean indicating whether to preserve or discard word boundaries
    :return output_file:    the path to the file where the encoded version of the input corpus has been printed

    This function runs in linear time on the length of the input (if it takes 1 minute to process 1k utterances,
    it takes 2 minutes to process 2k utterances). It processes ~550k utterances in ~10 seconds on a machine with
    two 6-core Intel Xeon E5-2603v3 processors and 2x128 GB of RAM.
    """

    # get the Celex dictionary; create a dictionary where token surface forms are keys and values are sets
    # containing all the token IDs that match a given surface form; then get the mapping between CHILDES
    # and Celex PoS tags
    celex_dict = get_celex_dictionary(celex_dir, reduced=reduced)
    tokens2identifiers = tokens2ids(celex_dict)
    pos_dict = get_pos_mapping(pos_mapping)

    # use the path of the input file to generate the path of the output file, adding encoding information to the
    # input filename; print to standard output a summary of all the encoding parameters
    input_filename, extension = os.path.splitext(corpus_name)
    encoding_string = encoding_features(corpus_name,
                                        reduced=reduced,
                                        uniphones=uniphones,
                                        diphones=diphones,
                                        triphones=triphones,
                                        syllables=syllables,
                                        stress_marker=stress_marker,
                                        outcomes=outcomes,
                                        boundaries=boundaries)
    output_folder = "_".join([input_filename, encoding_string])
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)
    output_file = os.path.join(
        output_folder, ".".join([os.path.basename(output_folder), 'json']))

    # check whether the output file corresponding to the desired parameters already exists and stop if it does
    if os.path.isfile(output_file):
        print()
        print(
            "The desired encoded version of the input corpus '%s' already exists at file '%s'."
            % (os.path.basename(corpus_name), os.path.basename(output_file)))
        return output_file
    else:

        print(
            strftime("%Y-%m-%d %H:%M:%S") +
            ": Started encoding utterances from input corpus '%s'" %
            corpus_name)

        # get the corpus recoded into phonological cues and lexical outcomes, plus the percentage of utterances that
        # could not be recoded because one or more words do not have a corresponding entry in Celex
        encoded_corpus, missed = encode_corpus(corpus_name,
                                               celex_dict,
                                               tokens2identifiers,
                                               pos_dict,
                                               separator=separator,
                                               uniphones=uniphones,
                                               diphones=diphones,
                                               triphones=triphones,
                                               syllables=syllables,
                                               stress_marker=stress_marker,
                                               outcomes=outcomes,
                                               boundaries=boundaries)
        print()
        print(
            strftime("%Y-%m-%d %H:%M:%S") +
            ": Finished encoding utterances from input corpus '%s'" %
            corpus_name)
        print()

        perc_covered = 100 - missed
        print()
        if os.path.exists(output_file):
            print("The file %s already exists." % output_file)
        else:
            json.dump(encoded_corpus, open(output_file, 'w'))
            print("The file %s has been created:" % output_file)
            print()
            print("%0.4f%% of the utterances could be entirely encoded." %
                  perc_covered)
            print(
                "The remaining %0.4f%% contain at least one word that could not be retrieved in CELEX and "
                % missed)
            print(
                "for which no phonological and morphological representation could be obtained."
            )

    return output_file
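# A hedged usage sketch of corpus_encoder (all paths are hypothetical); the call
# returns the path to the encoded .json corpus, creating it only if a file for the
# same parametrization does not already exist:
#
#     encoded = corpus_encoder('corpora/childes.json', '/data/celex',
#                              'mappings/childes2celex.txt', separator='~',
#                              reduced=False, outcomes='tokens', uniphones=False,
#                              diphones=False, triphones=True, syllables=False,
#                              stress_marker=True, boundaries=True)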
def main():

    parser = argparse.ArgumentParser(
        description=
        "Compute the variance of each phonological cue and token, as a proxy "
        "to identify the amount of information they carry")

    parser.add_argument(
        "-c",
        "--corpus",
        required=True,
        dest="corpus",
        help="Specify the path to the training corpus (encoded as .json).")
    parser.add_argument(
        "-C",
        "--celex_folder",
        required=True,
        dest="celex_folder",
        help="Specify the folder where the Celex data are located.")
    parser.add_argument(
        "-O",
        "--output_folder",
        required=True,
        dest="output_folder",
        help=
        "Specify the path of the folder where the logfiles will be stored together with"
        "the summary tables.")
    parser.add_argument(
        "-M",
        "--pos_mapping",
        required=True,
        dest="pos_mapping",
        help=
        "Specify the path of the file containing the mapping from CHILDES to Celex PoS tags."
    )
    parser.add_argument(
        "-p",
        "--precision",
        dest="precision",
        type=int,
        default=5,
        help=
        "Specify the number of outcomes to consider when computing discrimination's precision."
    )
    parser.add_argument(
        "-l",
        "--longitudinal",
        action="store_true",
        dest="longitudinal",
        help="Specify whether to use a longitudinal design (default: False).")

    args = parser.parse_args()

    cues = ['triphones']
    outcomes = ['tokens']
    stress_marker = [True]
    boundaries = [True]
    reduced_vowels = [False]
    number_of_cues = [100, 1000]

    if not os.path.exists(args.output_folder):
        os.makedirs(args.output_folder)

    cues_variance_file = os.path.join(args.output_folder,
                                      "words_cues_variance.csv")
    tokens_variance_file = os.path.join(args.output_folder,
                                        "words_tokens_variance.csv")

    parametrizations = it.product(cues, outcomes, stress_marker, boundaries,
                                  reduced_vowels)

    time_points = np.linspace(10, 100, 10) if args.longitudinal else [100]

    cues_table = pd.DataFrame(
        index=[],
        columns=[
            "Corpus", "Cues", "Outcomes", "Stress", "Boundaries", "Vowels",
            "Precision", "Time", "Phonological_cue", "Variance", "Frequency",
            "Lexical_diversity", "Phonological_diversity",
            "Cue|Cues_predictability", "Cue|Tokens_predictability",
            "Cues|Cue_predictability", "Tokens|Cue_predictability"
        ])

    tokens_table = pd.DataFrame(
        index=[],
        columns=[
            "Corpus", "Cues", "Outcomes", "Stress", "Boundaries", "Vowels",
            "Precision", "Time", "numCues", "Token", "Variance", "Frequency",
            "Lexical_diversity", "Phonological_diversity",
            "Token|Tokens_predictability", "Token|Cues_predictability",
            "Tokens|Token_predictability", "Cues|Token_predictability"
        ])

    ii = 0
    jj = 0

    for parametrization in parametrizations:

        print(parametrization)

        cue_type, outcome, stress, boundary, reduced = parametrization

        uniphones = cue_type == 'uniphones'
        diphones = cue_type == 'diphones'
        triphones = cue_type == 'triphones'
        syllables = cue_type == 'syllables'
        vowels = 'reduced' if reduced else 'full'
        sm = 'stress' if stress else 'no-stress'
        bound = 'yes' if boundary else 'no'
        training = os.path.splitext(os.path.basename(args.corpus))[0]

        encoded_corpus = corpus_encoder(args.corpus,
                                        args.celex_folder,
                                        args.pos_mapping,
                                        separator='~',
                                        stress_marker=stress,
                                        reduced=reduced,
                                        uniphones=uniphones,
                                        diphones=diphones,
                                        triphones=triphones,
                                        syllables=syllables,
                                        outcomes=outcome,
                                        boundaries=boundary)

        print(
            strftime("%Y-%m-%d %H:%M:%S") +
            ": Started computing distributional statistics from the corpus...")
        token_statistics, cue_statistics = usf.compute_distributional_predictors(
            encoded_corpus, time_points)
        print(
            strftime("%Y-%m-%d %H:%M:%S") +
            ": ...finished computing distributional statistics from the corpus."
        )
        print()

        corpus_dir = os.path.dirname(encoded_corpus)

        a, b = (0.001, 0.001) if training == 'aggregate_utterances' else (0.01, 0.01)
        file_paths = ndl(encoded_corpus,
                         alpha=a,
                         beta=b,
                         lam=1,
                         longitudinal=args.longitudinal)

        celex_dict = get_celex_dictionary(args.celex_folder, reduced=reduced)

        for idx, file_path in file_paths.items():

            idx = int(idx)
            matrix, cues2ids, outcomes2ids = load(file_path)

            # get the column ids of all perfectly discriminated outcomes at the current time point
            # perfectly discriminated outcomes are considered to be those whose jaccard coefficient
            # between true phonetic cues and most active phonetic cues for the outcome is 1
            discriminated_file = os.path.join(
                corpus_dir, '.'.join([
                    'discriminatedOutcomes',
                    str(idx), ''.join(['at', str(args.precision)]), 'json'
                ]))
            if not os.path.exists(discriminated_file):
                discriminated = find_discriminated(matrix,
                                                   cues2ids,
                                                   outcomes2ids,
                                                   celex_dict,
                                                   stress_marker=stress,
                                                   uniphones=uniphones,
                                                   diphones=diphones,
                                                   triphones=triphones,
                                                   syllables=syllables,
                                                   boundaries=boundary,
                                                   at=int(args.precision))
                json.dump(discriminated, open(discriminated_file, 'w'))
            else:
                discriminated = json.load(open(discriminated_file, 'r'))

            print()
            print(
                strftime("%Y-%m-%d %H:%M:%S") +
                ": The discriminated outcomes have been identified (file: %s)."
                % discriminated_file)
            print()

            row_variances, matrix, discriminated = usf.get_cue_variances(
                matrix, discriminated)
            cue_variances = {cue: row_variances[cue_id] for cue, cue_id in cues2ids.items()}

            print(
                strftime("%Y-%m-%d %H:%M:%S") +
                ": Started storing cue variances...")
            for cue in cue_variances:
                if len(cue_statistics[cue]['freq']) == 10:

                    frequency = cue_statistics[cue]['freq'][idx]
                    lexical_diversity = cue_statistics[cue]['lexdiv'][idx]
                    phonological_diversity = cue_statistics[cue]['phondiv'][idx]

                    # average conditional probability of a cue given the co-occurring cues
                    cue_cues_predictability = cue_statistics[cue]['p_cue_cues'][idx]

                    # average predictive power of a cue with respect to all the co-occurring cues
                    cues_cue_predictability = cue_statistics[cue]['p_cues_cue'][idx]

                    # average conditional probability of a cue given the co-occurring tokens
                    cue_tokens_predictability = cue_statistics[cue]['p_cue_tokens'][idx]

                    # average predictive power of a cue with respect to all the co-occurring tokens
                    tokens_cue_predictability = cue_statistics[cue]['p_tokens_cue'][idx]

                    cues_table.loc[ii] = pd.Series({
                        "Corpus": training,
                        "Cues": cue_type,
                        "Outcomes": outcome,
                        "Stress": sm,
                        "Boundaries": bound,
                        "Vowels": vowels,
                        "Time": idx,
                        "Precision": int(args.precision),
                        "Phonological_cue": cue,
                        "Variance": cue_variances[cue],
                        "Frequency": frequency,
                        "Lexical_diversity": lexical_diversity,
                        "Phonological_diversity": phonological_diversity,
                        "Cue|Cues_predictability": cue_cues_predictability,
                        "Cues|Cue_predictability": cues_cue_predictability,
                        "Cue|Tokens_predictability": cue_tokens_predictability,
                        "Tokens|Cue_predictability": tokens_cue_predictability
                    })
                ii += 1

            print(
                strftime("%Y-%m-%d %H:%M:%S") +
                ": ...finished storing cue variances.")
            print()

            for how_many_cues in number_of_cues:

                print("Number of cues: ", how_many_cues)

                token_variances = usf.get_token_variances(
                    matrix,
                    discriminated,
                    row_variances,
                    how_many_cues=how_many_cues)

                print(
                    strftime("%Y-%m-%d %H:%M:%S") +
                    ": Started storing token variances...")
                for token in token_variances:
                    if len(token_statistics[token]['freq']) == 10:

                        frequency = token_statistics[token]['freq'][idx]
                        lexical_diversity = token_statistics[token]['lexdiv'][idx]
                        phonological_diversity = token_statistics[token]['phondiv'][idx]

                        # average conditional probability of a token given the co-occurring tokens
                        token_tokens_predictability = token_statistics[token]['p_token_tokens'][idx]

                        # average predictive power of a token with respect to the co-occurring tokens
                        tokens_token_predictability = token_statistics[token]['p_tokens_token'][idx]

                        # average conditional probability of a token given the co-occurring phonological cues
                        token_cues_predictability = token_statistics[token]['p_token_cues'][idx]

                        # average predictive power of a token with respect to the co-occurring phonological cues
                        cues_token_predictability = token_statistics[token]['p_cues_token'][idx]

                        tokens_table.loc[jj] = pd.Series({
                            "Corpus": training,
                            "Cues": cue_type,
                            "Outcomes": outcome,
                            "Stress": sm,
                            "Boundaries": bound,
                            "Vowels": vowels,
                            "Time": idx,
                            "numCues": how_many_cues,
                            "Precision": int(args.precision),
                            "Token": token,
                            "Variance": token_variances[token],
                            "Frequency": frequency,
                            "Lexical_diversity": lexical_diversity,
                            "Phonological_diversity": phonological_diversity,
                            "Token|Tokens_predictability": token_tokens_predictability,
                            "Tokens|Token_predictability": tokens_token_predictability,
                            "Token|Cues_predictability": token_cues_predictability,
                            "Cues|Token_predictability": cues_token_predictability
                        })
                    jj += 1
                print(
                    strftime("%Y-%m-%d %H:%M:%S") +
                    ": ...finished storing token variances.")

                print()
                print('-' * 100)
                print()

            print()
            print()
            print('=' * 100)
            print('=' * 100)
            print()
            print()

        print()
        print()
        print()
        print('#' * 100)
        print('#' * 100)
        print('#' * 100)
        print()
        print()
        print()

    if os.path.exists(cues_variance_file):
        cues_table.to_csv(cues_variance_file,
                          sep='\t',
                          index=False,
                          mode="a",
                          header=False)
    else:
        cues_table.to_csv(cues_variance_file, sep='\t', index=False)

    if os.path.exists(tokens_variance_file):
        tokens_table.to_csv(tokens_variance_file,
                            sep='\t',
                            index=False,
                            mode="a",
                            header=False)
    else:
        tokens_table.to_csv(tokens_variance_file, sep='\t', index=False)
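# A hypothetical command line for the variance script above (the module name is an
# assumption):
#
#     python cue_token_variance.py -c corpora/childes.json -C /data/celex \
#         -O results/variance -M mappings/childes2celex.txt -p 5 -l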
def map_phonology(corpus_file, mapping_file, output_file, celex_dir, compounds=True, reduced=False, minimalist=True):

    """
    :param corpus_file:         the path to a .txt file containing one utterance per line, with all words in the
                                utterance separated by a comma and each word being a tuple consisting of four
                                pipe-separated elements, token|lemma|PoS1|PoS2 where PoS1 is the coarse Celex tag and
                                PoS2 is the tag provided by the TreeTagger
    :param mapping_file:        the path to a .txt file where the mapping from word tuples to triphones will be
                                written
    :param output_file:         the path to a .txt file where the lines from the input will be rewritten as
                                comma-separated sequences of pipe-separated 5-tuples consisting of
                                token, lemma, pos1, pos2, 3phones
    :param celex_dir:           the directory where the Celex dictionary is to be found
    :param compounds:           a boolean. If True, all entries in Celex are considered; if False, entries which
                                contain spaces are discarded
    :param reduced:             a boolean specifying whether reduced phonological forms should be extracted from Celex
                                whenever possible (if set to True) or standard phonological forms should be preserved
                                (if False)
    :param minimalist:          a boolean. It specifies whether lemmas in the output file should be differentiated when
                                their phonetic realization changes depending on the part of speech: if minimalist is
                                True, lemmas are not differentiated (default); if it is False, lemmas are differentiated
                                by appending pos1 to the lemma, separated by a colon
    :return mapping:            a dictionary mapping 4-tuples token|lemma|pos1|pos2 to the matching triphones
    """

    celex_dict = get_celex_dictionary(celex_dir, reduced=reduced, compounds=compounds)
    tokens2identifiers = tokens2ids(celex_dict)
    mapping = {}
    lemma2phon = defaultdict(dict)

    new_corpus = []

    with open(corpus_file, 'r') as fr:
        for line in fr:
            words = line.strip().split(',')
            new_line = []
            for word in words:
                if word:
                    try:
                        token, lemma, pos1, pos2 = word.split('|')
                    except ValueError:
                        token, lemma, pos1 = word.split('|')
                        pos2 = 'NN' if pos1 == 'N' else pos1

                    new_token, new_lemma = adjust_apostrophes(token, lemma)
                    new_token = new_token.replace('=', '_')
                    new_lemma = new_lemma.replace('=', '_')
                    token_phonological_form = get_phonetic_encoding([(new_token, pos1, new_lemma)],
                                                                    celex_dict, tokens2identifiers)
                    lemma_phonology = get_phonetic_encoding([(new_lemma, pos1, new_lemma)],
                                                            celex_dict, tokens2identifiers)
                    lemma_phonological_form = ''.join(lemma_phonology) if isinstance(lemma_phonology, list) else \
                        ''.join(token_phonological_form)

                    if isinstance(token_phonological_form, list):
                        triphones = encode_item(token_phonological_form[0], triphones=True, stress_marker=True,
                                                uniphones=False, diphones=False, syllables=False)
                        deriv = code_derivational_morphology(pos2)
                        output_token = token.replace('_', '=')
                        output_lemma = lemma.replace('_', '=')

                        morpho = 'COMPOUND' if '=' in output_token else 'MONO'
                        key = '|'.join([output_token, output_lemma, pos1, pos2, deriv, morpho,
                                        ':'.join([output_token, pos1])])
                        output_triphones = ';'.join(triphones)

                        mapping[key] = output_triphones
                        if lemma_phonological_form in lemma2phon[output_lemma]:
                            lemma2phon[output_lemma][lemma_phonological_form].add(pos1)
                        else:
                            lemma2phon[output_lemma][lemma_phonological_form] = {pos1}

                        new_line.append((output_token, ':'.join([output_lemma, pos1]), pos1, pos2, output_triphones))
            new_corpus.append(new_line)

    write_mapping_file(mapping, mapping_file)
    write_output_corpus(new_corpus, output_file, lemma2phon, minimalist=minimalist)

    return mapping
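# A hypothetical input line for map_phonology, following the format described in the
# docstring (comma-separated words, each a pipe-separated token|lemma|PoS1|PoS2 tuple;
# the tags are invented):
#
#     the|the|D|DT,dog|dog|N|NN,barked|bark|V|VBD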
def main():

    parser = argparse.ArgumentParser(
        description=
        "Assess whether words from the same category cluster together, "
        "first considering their sound patterns and then how they correlate "
        "with each other based on their contexts of occurrence")

    parser.add_argument(
        "-c",
        "--corpus",
        required=True,
        dest="corpus",
        help="Specify the path to the training corpus (encoded as .json).")
    parser.add_argument(
        "-C",
        "--celex_folder",
        required=True,
        dest="celex_folder",
        help="Specify the folder where the Celex data are located.")
    parser.add_argument(
        "-O",
        "--output_folder",
        required=True,
        dest="output_folder",
        help=
        "Specify the path of the folder where the logfiles will be stored together with"
        "the summary tables.")
    parser.add_argument(
        "-M",
        "--pos_mapping",
        required=True,
        dest="pos_mapping",
        help=
        "Specify the path of the file containing the mapping from CHILDES to Celex PoS tags."
    )
    parser.add_argument(
        "-p",
        "--precision",
        dest="precision",
        type=int,
        default=5,
        help=
        "Specify the number of outcomes to consider when computing discrimination's precision."
    )
    parser.add_argument(
        "--cue_threshold",
        dest="cue_threshold",
        default='high',
        help=
        "Specify whether to choose a 'high' (i.e. strict) or a 'low' (i.e. lax) "
        "threshold on relevant cues.")
    parser.add_argument(
        "--token_threshold",
        dest="token_threshold",
        default='low',
        help=
        "Specify whether to choose a 'high' (i.e. strict) or a 'low' (i.e. lax) "
        "threshold on relevant tokens.")
    parser.add_argument(
        "-l",
        "--longitudinal",
        action="store_true",
        dest="longitudinal",
        help="Specify whether to use a longitudinal design (default: False).")

    args = parser.parse_args()

    # thresholds have been determined manually according to the following criteria:
    # - high thresholds were set to yield around 100 dimensions at time t100
    # - low thresholds were set to yield around 100 dimensions at time t10
    # Whenever a threshold value didn't yield any dimension because it was too stringent, the threshold was
    # lowered until at least 1 dimension was available at all time points. First, I adjusted the threshold on
    # phonological cues' variance, then on tokens' variance. Practically, it was always possible to set thresholds to
    # yield around 100 dimensions at the specified time points, except for tokens' variance in the low variance
    # setting, where the threshold yielding around 100 dimensions at t10 quickly left the model without any dimension.
    # Therefore, these models start with considerably high dimensionalities and finish with almost no dimensions.

    thresholds = {
        'aggregate_utterances_at5_c_low': 0.00000001,    # 99 cues at t10 (799 at t100)
        'aggregate_utterances_at5_c_high': 0.00000075,   # 105 cues at t100 (2 cues at t10)
        'aggregate_utterances_at5_t_low': 0.02,          # 741 tokens at t10 (3 tokens at t100)
        'aggregate_utterances_at5_t_high': 0.04,         # 125 tokens at t100 (843 tokens at t10)
        'aggregate_utterances_at25_c_low': 0.000000005,  # 98 cues at t10 (802 cues at t100)
        'aggregate_utterances_at25_c_high': 0.00000025,  # 156 cues at t100 (2 cues at t10)
        'aggregate_utterances_at25_t_low': 0.015,        # 1632 tokens at t10 (19 tokens at t100)
        'aggregate_utterances_at25_t_high': 0.033,       # 140 tokens at t100 (1731 at t10)
        'aggregate_words_at5_c_low': 0.000025,           # 101 cues at t10 (318 cues at t100)
        'aggregate_words_at5_c_high': 0.00005,           # 131 cues at t100 (40 cues at t10)
        'aggregate_words_at5_t_low': 0.0325,             # 834 tokens at t10 (3 tokens at t100)
        'aggregate_words_at5_t_high': 0.07,              # 95 tokens at t100 (1004 tokens at t10)
        'aggregate_words_at25_c_low': 0.0000075,         # 117 cues at t10 (416 cues at t100)
        'aggregate_words_at25_c_high': 0.00002,          # 124 cues at t100 (33 cues at t10)
        'aggregate_words_at25_t_low': 0.0295,            # 1850 tokens at t10 (7 tokens at t100)
        'aggregate_words_at25_t_high': 0.085             # 113 tokens at t100 (2581)
    }

    cues = ['triphones']
    outcomes = ['tokens']
    stress_marker = [True]
    boundaries = [True]
    reduced_vowels = [False]
    distances = ['correlation']

    if not os.path.exists(args.output_folder):
        os.makedirs(args.output_folder)

    summary_file = os.path.join(args.output_folder, "LDAt_summary.csv")
    # error_file = os.path.join(args.output_folder, "PoStagging_errors.csv")

    parametrizations = it.product(cues, outcomes, stress_marker, boundaries,
                                  reduced_vowels, distances)

    time_points = np.linspace(10, 100, 10) if args.longitudinal else [100]
    rows = int(
        np.prod([
            len(cues),
            len(outcomes),
            len(stress_marker),
            len(reduced_vowels),
            len(boundaries),
            len(time_points),
            len(distances)
        ]))
    summary_table = pd.DataFrame(
        index=np.arange(0, rows),
        columns=[
            "Corpus", "Cues", "Outcomes", "Stress", "Boundaries", "Vowels",
            "Precision", "Time", "Distance", "tCues", "numCues", "tTokens",
            "numTokens", "Phon_acc", "Phon_acc_subset", "Phon_baseline",
            "Distr_acc", "Distr_acc_subset", "Distr_baseline"
        ])

    ii = 0
    for parametrization in parametrizations:

        cue, outcome, stress, boundary, reduced, distance = parametrization

        uniphones = cue == 'uniphones'
        diphones = cue == 'diphones'
        triphones = cue == 'triphones'
        syllables = cue == 'syllables'
        vowels = 'reduced' if reduced else 'full'
        sm = 'stress' if stress else 'no-stress'
        bound = 'yes' if boundary else 'no'
        training = os.path.splitext(os.path.basename(args.corpus))[0]

        encoded_corpus = corpus_encoder(args.corpus,
                                        args.celex_folder,
                                        args.pos_mapping,
                                        separator='~',
                                        stress_marker=stress,
                                        reduced=reduced,
                                        uniphones=uniphones,
                                        diphones=diphones,
                                        triphones=triphones,
                                        syllables=syllables,
                                        outcomes=outcome,
                                        boundaries=boundary)

        corpus_dir = os.path.dirname(encoded_corpus)

        # set the learning-rate parameters and look up the variance thresholds for the current parametrization
        a, b = (0.001, 0.001) if training == 'aggregate_utterances' else (0.01, 0.01)
        c = thresholds['_'.join([
            training, ''.join(['at', str(args.precision)]), 'c', args.cue_threshold
        ])]
        t = thresholds['_'.join([
            training, ''.join(['at', str(args.precision)]), 't', args.token_threshold
        ])]

        file_paths = ndl(encoded_corpus,
                         alpha=a,
                         beta=b,
                         lam=1,
                         longitudinal=args.longitudinal)

        celex_dict = get_celex_dictionary(args.celex_folder, reduced=reduced)

        for idx, file_path in sorted(file_paths.items(),
                                     key=operator.itemgetter(0)):

            matrix, cues2ids, outcomes2ids = load(file_path)

            # get the column ids of all perfectly discriminated outcomes at the current time point
            # perfectly discriminated outcomes are considered to be those whose jaccard coefficient
            # between true phonetic cues and most active phonetic cues for the outcome is 1
            discriminated_file = os.path.join(
                corpus_dir, '.'.join([
                    'discriminatedOutcomes',
                    str(int(idx)), ''.join(['at', str(args.precision)]), 'json'
                ]))
            if not os.path.exists(discriminated_file):
                discriminated = find_discriminated(matrix,
                                                   cues2ids,
                                                   outcomes2ids,
                                                   celex_dict,
                                                   stress_marker=stress,
                                                   uniphones=uniphones,
                                                   diphones=diphones,
                                                   triphones=triphones,
                                                   syllables=syllables,
                                                   boundaries=boundary,
                                                   at=int(args.precision))
                json.dump(discriminated, open(discriminated_file, 'w'))
            else:
                discriminated = json.load(open(discriminated_file, 'r'))

            print()
            print(
                "The discriminated outcomes have been identified (file: %s)." %
                discriminated_file)

            accuracies = threshold_experiment(matrix,
                                              discriminated,
                                              cues_threshold=c,
                                              tokens_threshold=t)

            summary_table.loc[ii] = pd.Series({
                "Corpus": training,
                "Cues": cue,
                "Outcomes": outcome,
                "Stress": sm,
                "Boundaries": bound,
                "Vowels": vowels,
                "Time": int(idx),
                "Distance": distance,
                "Precision": args.precision,
                "tCues": args.cue_threshold,
                "numCues": accuracies[3],
                "tTokens": args.token_threshold,
                "numTokens": accuracies[7],
                "Phon_acc": accuracies[0],
                "Phon_acc_subset": accuracies[1],
                "Distr_acc": accuracies[4],
                "Distr_acc_subset": accuracies[5],
                "Phon_baseline": accuracies[2],
                "Distr_baseline": accuracies[6]
            })
            ii += 1

    if os.path.exists(summary_file):
        summary_table.to_csv(summary_file,
                             sep='\t',
                             index=False,
                             mode="a",
                             header=False)
    else:
        summary_table.to_csv(summary_file, sep='\t', index=False)
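# A hypothetical invocation of the threshold experiment above (the module name is an
# assumption):
#
#     python lda_thresholds.py -c corpora/childes.json -C /data/celex \
#         -O results/LDAt -M mappings/childes2celex.txt -p 5 \
#         --cue_threshold high --token_threshold low -l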
def cluster_words(corpus,
                  output_folder,
                  celex_folder,
                  pos_mapping,
                  distance='cosine',
                  reduced=False,
                  outcomes='tokens',
                  uniphones=False,
                  diphones=False,
                  triphones=True,
                  syllables=False,
                  stress_marker=True,
                  boundaries=True,
                  at=5,
                  nn=25,
                  a=0.01,
                  b=0.01,
                  longitudinal=False):
    """
    :param corpus:          the corpus to be used for training the model
    :param output_folder:   the folder where the logfile of the clustering experiment will be saved
    :param celex_folder:    the folder containing the data from the Celex database
    :param pos_mapping:     the path to the file mapping CHILDES pos tags to Celex tags
    :param distance:        a string (either 'correlation' or 'cosine') indicating which distance metric to use
    :param reduced:         a boolean indicating whether to use reduced or full phonetic transcriptions from Celex
    :param outcomes:        a string (either 'tokens' or 'lemmas') indicating which outcomes to consider for learning
    :param uniphones:       a boolean indicating whether to consider uniphones as cues
    :param diphones:        a boolean indicating whether to consider diphones as cues
    :param triphones:       a boolean indicating whether to consider triphones as cues
    :param syllables:       a boolean indicating whether to consider syllables as cues
    :param stress_marker:   a boolean indicating whether to consider or discard stress information
    :param boundaries:      a boolean indicating whether to consider or discard word boundaries
    :param at:              an integer indicating how many outcomes to consider when computing discrimination's precision
    :param nn:              an integer indicating how many nearest neighbors to consider when evaluating clustering
    :param a:               the alpha parameter from the Rescorla-Wagner model
    :param b:               the beta parameter from the Rescorla-Wagner model
    :param longitudinal:    a boolean indicating whether to adopt a longitudinal design or not
    :return accuracies:     a dictionary mapping time indices to the clustering accuracy obtained at that time point
    """

    if not os.path.exists(output_folder):
        os.makedirs(output_folder)
    plot_folder = os.path.join(output_folder, 'plots')
    if not os.path.exists(plot_folder):
        os.makedirs(plot_folder)

    encoded_corpus = corpus_encoder(corpus,
                                    celex_folder,
                                    pos_mapping,
                                    separator='~',
                                    stress_marker=stress_marker,
                                    reduced=reduced,
                                    uniphones=uniphones,
                                    diphones=diphones,
                                    triphones=triphones,
                                    syllables=syllables,
                                    outcomes=outcomes,
                                    boundaries=boundaries)

    corpus_dir = os.path.dirname(encoded_corpus)

    file_paths = ndl(encoded_corpus,
                     alpha=a,
                     beta=b,
                     lam=1,
                     longitudinal=longitudinal)

    celex_dict = get_celex_dictionary(celex_folder, reduced=reduced)

    accuracies = {}
    for idx, file_path in file_paths.items():

        logfile = make_log_file(corpus,
                                output_folder,
                                'json',
                                dist=distance,
                                nn=nn,
                                at=at,
                                time=idx,
                                outcomes=outcomes,
                                reduced=reduced,
                                stress_marker=stress_marker,
                                boundaries=boundaries,
                                syllables=syllables,
                                uniphones=uniphones,
                                diphones=diphones,
                                triphones=triphones)
        plotfile = make_log_file(corpus,
                                 plot_folder,
                                 'pdf',
                                 dist=distance,
                                 nn=nn,
                                 at=at,
                                 time=idx,
                                 outcomes=outcomes,
                                 reduced=reduced,
                                 stress_marker=stress_marker,
                                 boundaries=boundaries,
                                 syllables=syllables,
                                 uniphones=uniphones,
                                 diphones=diphones,
                                 triphones=triphones)

        if os.path.exists(logfile):
            print()
            print(
                "The file %s already exists, statistics for the corresponding "
                "parametrization are loaded from it" % logfile)
            clusters = json.load(open(logfile, "r"))

        else:
            print()
            matrix, cues2ids, outcomes2ids = load(file_path)

            # get the column ids of all perfectly discriminated outcomes at the current time point
            # perfectly discriminated outcomes are considered to be those whose jaccard coefficient
            # between true phonetic cues and most active phonetic cues for the outcome is 1
            discriminated_file = os.path.join(
                corpus_dir,
                '.'.join(['discriminatedOutcomes',
                          str(int(idx)), 'json']))
            if not os.path.exists(discriminated_file):
                discriminated = find_discriminated(matrix,
                                                   cues2ids,
                                                   outcomes2ids,
                                                   celex_dict,
                                                   stress_marker=stress_marker,
                                                   uniphones=uniphones,
                                                   diphones=diphones,
                                                   triphones=triphones,
                                                   syllables=syllables,
                                                   boundaries=boundaries,
                                                   at=at)
                json.dump(discriminated, open(discriminated_file, 'w'))
            else:
                discriminated = json.load(open(discriminated_file, 'r'))

            print()
            print(
                strftime("%Y-%m-%d %H:%M:%S") +
                ": Start clustering, using %s as weight matrix..." %
                (os.path.basename(file_path)))
            if distance == 'cosine':
                similarities, discriminated = sim.pairwise_cos(
                    matrix, discriminated, plot_path=plotfile)
            else:
                similarities, discriminated = sim.pairwise_corr(
                    matrix, discriminated, plot_path=plotfile)

            df = sim.sim2df(similarities, discriminated)
            similarities_file = os.path.join(
                corpus_dir,
                '.'.join(['similarities', distance,
                          str(int(idx)), 'csv']))
            df.to_csv(similarities_file, sep='\t')

            clusters = sim.neighborhood(discriminated, similarities, nn=nn)
            json.dump(clusters, open(logfile, 'w'))
            print(strftime("%Y-%m-%d %H:%M:%S") + ": ...completed test phase.")

        accuracy, baseline_acc, h, baseline_h = sim.clustering_precision(
            clusters)
        accuracies[idx] = {
            'accuracy': accuracy,
            'baseline_acc': baseline_acc,
            'entropy': h,
            'baseline_entr': baseline_h
        }

    return accuracies
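# A hedged usage sketch of cluster_words (paths are hypothetical); the returned
# dictionary maps each time index to the clustering accuracy and entropy obtained
# at that point, together with their baselines:
#
#     accuracies = cluster_words('corpora/childes.json', 'results/clustering',
#                                '/data/celex', 'mappings/childes2celex.txt',
#                                distance='cosine', nn=25, at=5, a=0.01, b=0.01,
#                                longitudinal=False)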
def main():

    parser = argparse.ArgumentParser(
        description=
        "Assess whether words from the same category cluster together, "
        "first considering their sound patterns and then how they correlate "
        "with each other based on their contexts of occurrence")

    parser.add_argument(
        "-c",
        "--corpus",
        required=True,
        dest="corpus",
        help="Specify the path to the training corpus (encoded as .json).")
    parser.add_argument(
        "-C",
        "--celex_folder",
        required=True,
        dest="celex_folder",
        help="Specify the folder where the Celex data are located.")
    parser.add_argument(
        "-O",
        "--output_folder",
        required=True,
        dest="output_folder",
        help=
        "Specify the path of the folder where the logfiles will be stored together with"
        "the summary tables.")
    parser.add_argument(
        "-M",
        "--pos_mapping",
        required=True,
        dest="pos_mapping",
        help=
        "Specify the path of the file containing the mapping from CHILDES to Celex PoS tags."
    )
    parser.add_argument(
        "-p",
        "--precision",
        dest="precision",
        type=int,
        default=5,
        help=
        "Specify the number of outcomes to consider when computing discrimination's precision."
    )
    parser.add_argument(
        "-l",
        "--longitudinal",
        action="store_true",
        dest="longitudinal",
        help="Specify whether to use a longitudinal design (default: False).")

    args = parser.parse_args()

    cues = ['triphones']
    outcomes = ['tokens']
    stress_marker = [True]
    boundaries = [True]
    reduced_vowels = [False]
    distances = ['correlation']
    number_of_cues = [100, 500, 1000]
    number_of_tokens = [50, 250, 500]

    if not os.path.exists(args.output_folder):
        os.makedirs(args.output_folder)

    summary_file = os.path.join(args.output_folder, "LDAk_summary.csv")
    # error_file = os.path.join(args.output_folder, "PoStagging_errors.csv")

    parametrizations = it.product(cues, outcomes, stress_marker, boundaries,
                                  reduced_vowels, distances, number_of_cues,
                                  number_of_tokens)

    time_points = np.linspace(10, 100, 10) if args.longitudinal else [100]
    rows = int(
        np.prod([
            len(cues),
            len(outcomes),
            len(stress_marker),
            len(reduced_vowels),
            len(boundaries),
            len(time_points),
            len(distances),
            len(number_of_cues),
            len(number_of_tokens)
        ]))
    summary_table = pd.DataFrame(index=np.arange(0, rows),
                                 columns=[
                                     "Corpus", "Cues", "Outcomes", "Stress",
                                     "Boundaries", "Vowels", "Precision",
                                     "Time", "Distance", "numCues",
                                     "numTokens", "Phon_acc",
                                     "Phon_acc_subset", "Phon_baseline",
                                     "Distr_acc", "Distr_acc_subset",
                                     "Distr_baseline"
                                 ])

    ii = 0
    for parametrization in parametrizations:

        print(parametrization)

        cue, outcome, stress, boundary, reduced, distance, how_many_cues, how_many_tokens = parametrization

        uniphones = cue == 'uniphones'
        diphones = cue == 'diphones'
        triphones = cue == 'triphones'
        syllables = cue == 'syllables'
        vowels = 'reduced' if reduced else 'full'
        sm = "stress" if stress else 'no-stress'
        bound = 'yes' if boundary else 'no'
        training = os.path.splitext(os.path.basename(args.corpus))[0]

        encoded_corpus = corpus_encoder(args.corpus,
                                        args.celex_folder,
                                        args.pos_mapping,
                                        separator='~',
                                        stress_marker=stress,
                                        reduced=reduced,
                                        uniphones=uniphones,
                                        diphones=diphones,
                                        triphones=triphones,
                                        syllables=syllables,
                                        outcomes=outcome,
                                        boundaries=boundary)

        corpus_dir = os.path.dirname(encoded_corpus)

        a, b = (0.001, 0.001) if training == 'aggregate_utterances' else (0.01, 0.01)
        file_paths = ndl(encoded_corpus,
                         alpha=a,
                         beta=b,
                         lam=1,
                         longitudinal=args.longitudinal)

        celex_dict = get_celex_dictionary(args.celex_folder, reduced=reduced)

        for idx, file_path in file_paths.items():

            matrix, cues2ids, outcomes2ids = load(file_path)

            # get the column ids of all perfectly discriminated outcomes at the current time point
            # perfectly discriminated outcomes are considered to be those whose jaccard coefficient
            # between true phonetic cues and most active phonetic cues for the outcome is 1
            discriminated_file = os.path.join(
                corpus_dir, '.'.join([
                    'discriminatedOutcomes',
                    str(int(idx)), ''.join(['at', str(args.precision)]), 'json'
                ]))

            if not os.path.exists(discriminated_file):
                discriminated = find_discriminated(matrix,
                                                   cues2ids,
                                                   outcomes2ids,
                                                   celex_dict,
                                                   stress_marker=stress,
                                                   uniphones=uniphones,
                                                   diphones=diphones,
                                                   triphones=triphones,
                                                   syllables=syllables,
                                                   boundaries=boundary,
                                                   at=args.precision)
                json.dump(discriminated, open(discriminated_file, 'w'))
            else:
                discriminated = json.load(open(discriminated_file, 'r'))

            print()
            print(
                "The discriminated outcomes have been identified (file: %s)." %
                discriminated_file)

            accuracies = subset_experiment(matrix,
                                           discriminated,
                                           how_many_cues=how_many_cues,
                                           how_many_tokens=how_many_tokens)

            summary_table.loc[ii] = pd.Series({
                "Corpus": training,
                "Cues": cue,
                "Outcomes": outcome,
                "Stress": sm,
                "Boundaries": bound,
                "Vowels": vowels,
                "Time": int(idx),
                "Distance": distance,
                "Precision": args.precision,
                "numCues": how_many_cues,
                "numTokens": how_many_tokens,
                "Phon_acc": accuracies[0],
                "Phon_acc_subset": accuracies[1],
                "Distr_acc": accuracies[3],
                "Distr_acc_subset": accuracies[4],
                "Phon_baseline": accuracies[2],
                "Distr_baseline": accuracies[5]
            })
            ii += 1

    if os.path.exists(summary_file):
        summary_table.to_csv(summary_file,
                             sep='\t',
                             index=False,
                             mode="a",
                             header=False)
    else:
        summary_table.to_csv(summary_file, sep='\t', index=False)
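
# The `ndl` call above estimates cue-outcome associations with Rescorla-Wagner
# updates: alpha is cue salience, beta the learning rate, and lam the maximum
# association an outcome can receive (these parameters are documented in the
# tag_words docstring further below). A minimal sketch of a single update,
# assuming binary cue presence and a dict-of-dicts weight store; the cue and
# outcome strings in the usage note are made up for illustration.

def rw_update(weights, cues, outcomes, all_outcomes,
              alpha=0.01, beta=0.01, lam=1.0):
    """Apply one Rescorla-Wagner learning event in place.

    weights[c][o] is the association from cue c to outcome o.
    """
    for o in all_outcomes:
        # total support for outcome o from the cues present in this event
        v_total = sum(weights.get(c, {}).get(o, 0.0) for c in cues)
        # outcomes that occur are pushed toward lam, absent ones toward 0
        target = lam if o in outcomes else 0.0
        delta = alpha * beta * (target - v_total)
        for c in cues:
            w_c = weights.setdefault(c, {})
            w_c[o] = w_c.get(o, 0.0) + delta

# Usage (hypothetical cue/outcome names):
# weights = {}
# rw_update(weights, cues={'+ka', 'kat', 'at+'}, outcomes={'cat'},
#           all_outcomes={'cat', 'dog'})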
def main():

    parser = argparse.ArgumentParser(description="Assess whether words from the same category cluster together "
                                                 "on the basis of the sound sequences they consist of.")

    parser.add_argument("-c", "--corpus", required=True, dest="corpus",
                        help="Specify the path to the training corpus (encoded as .json).")
    parser.add_argument("-C", "--Celex_folder", required=True, dest="celex_folder",
                        help="Specify the folder where the Celex data are located.")
    parser.add_argument("-O", "--output_folder", required=True, dest="output_folder",
                        help="Specify the path of the folder where the logfiles will be stored together with"
                             "the summary tables.")
    parser.add_argument("-M", "--pos_mapping", required=True, dest="pos_mapping",
                        help="Specify the path of the file containing the mapping from CHILDES to Celex PoS tags.")
    parser.add_argument("-p", "--precision", dest="precision", default=5,
                        help="Specify the number of outcomes to consider when computing discrimination's precision.")
    parser.add_argument("-l", "--longitudinal", action="store_true", dest="longitudinal",
                        help="Specify whether to use a longitudinal design (default: False).")

    args = parser.parse_args()
    at = args.precision

    if not os.path.exists(args.output_folder):
        os.makedirs(args.output_folder)
    summary_file = os.path.join(args.output_folder, "lexicalDevelopment_summary.csv")

    a, b = [0.01, 0.01]
    reduced_vowels = [False]
    boundaries = [True]
    outcomes = ['tokens']
    cues = ['triphones', 'syllables']
    stress_marker = [True]

    time_points = np.linspace(10, 100, 10) if args.longitudinal else [100]
    rows = int(np.prod([len(cues), len(outcomes), len(stress_marker), len(reduced_vowels),
                        len(boundaries), len(time_points)]))
    summary_table = pd.DataFrame(index=np.arange(0, rows),
                                 columns=["Corpus", "Boundaries", "Cues", "Outcomes", "Stress", "Vowels", "Time", "At",
                                          "Discriminated", "@".join(["Precision", str(at)]), "Jaccard@1", "Total"])

    row_id = 0
    parametrizations = it.product(reduced_vowels, boundaries, outcomes, cues, stress_marker)
    for parametrization in parametrizations:

        r, boundary, outcome, cue, marker = parametrization
        uniphones = cue == 'uniphones'
        diphones = cue == 'diphones'
        triphones = cue == 'triphones'
        syllables = cue == 'syllables'
        vowels = 'reduced' if r else 'full'
        sm = "stress" if marker else 'no-stress'
        bound = 'yes' if boundary else 'no'
        training = os.path.splitext(os.path.basename(args.corpus))[0]
        celex_dict = get_celex_dictionary(args.celex_folder, reduced=r)

        encoded_corpus = corpus_encoder(args.corpus, args.celex_folder, args.pos_mapping, separator='~',
                                        stress_marker=marker, reduced=r, outcomes=outcome, boundaries=boundary,
                                        uniphones=uniphones, diphones=diphones, triphones=triphones,
                                        syllables=syllables)

        cumulative_vocabulary = get_cumulative_vocabulary(encoded_corpus, time_points)
        print()
        print("The cumulative vocabulary for the file %s has been estimated" % encoded_corpus)
        print()

        file_paths = ndl(encoded_corpus, alpha=a, beta=b, lam=1, longitudinal=args.longitudinal)

        for idx, file_path in file_paths.items():
            matrix, cues2ids, outcomes2ids = load(file_path)

            # get the Jaccard coefficient for each outcome and select those with a coefficient of 1, meaning that the
            # model would choose all and only the correct cues when expressing the outcome; get the number of such
            # outcomes (a sketch of this measure follows this example)
            print()
            jaccard_coefficients = jaccard(matrix, cues2ids, outcomes2ids, celex_dict, stress_marker=marker,
                                           uniphone=uniphones, diphone=diphones, triphone=triphones, syllable=syllables,
                                           boundaries=boundary)
            jaccard_one = {}
            for token in outcomes2ids:
                if jaccard_coefficients[token] == 1:
                    jaccard_one[token] = outcomes2ids[token]
            n_jaccard = len(jaccard_one)
            print()

            # get the outcomes that are correctly discriminated given the cues they consist of: take an outcome,
            # encode it into its phonetic cues, check which outcomes are most active given those cues, and check
            # whether the correct one is among the top ones (how many is indicated by the parameter 'at'); store
            # all outcomes where the correct one is among the top active ones given the cues in it (see the sketch
            # after this example)
            print()
            precise = precision_at(matrix, outcomes2ids, cues2ids, celex_dict, stress_marker=marker,
                                   uniphone=uniphones, diphone=diphones, triphone=triphones, syllable=syllables,
                                   boundaries=boundary, at=at)
            n_precise = len(precise)
            print()

            # repeat, but only for the outcomes with a Jaccard coefficient of 1, to quantify two-way discrimination
            print()
            discriminated = precision_at(matrix, jaccard_one, cues2ids, celex_dict, stress_marker=marker,
                                         uniphone=uniphones, diphone=diphones, triphone=triphones, syllable=syllables,
                                         boundaries=boundary, at=at)
            n_discriminated = len(discriminated)
            print()

            vocabulary_estimate = cumulative_vocabulary[int(idx)]

            summary_table.loc[row_id] = pd.Series({"Corpus": training, "Cues": cue, "Outcomes": outcome, "Stress": sm,
                                                   "Boundaries": bound, "Vowels": vowels, "Time": int(idx), "At": at,
                                                   "Discriminated": n_discriminated, "Total": vocabulary_estimate,
                                                   "@".join(["Precision", str(at)]): n_precise, "Jaccard@1": n_jaccard})
            row_id += 1

    if os.path.exists(summary_file):
        summary_table.to_csv(summary_file, sep='\t', index=False, mode="a",
                             header=False)
    else:
        summary_table.to_csv(summary_file, sep='\t', index=False)
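
# The two measures computed in the loop above can be sketched directly against
# a cue-by-outcome weight matrix. Assumptions, for illustration only: `matrix`
# has one row per cue and one column per outcome, and `true_cues[o]` is the
# set of row indices of the cues that outcome o truly consists of.

import numpy as np

def jaccard_at_one(matrix, true_cues, outcome_id):
    """True iff the n most active cues for the outcome (n = number of its true
    cues) are exactly its true cues, i.e. the Jaccard coefficient is 1."""
    truth = true_cues[outcome_id]
    top = np.argsort(matrix[:, outcome_id])[::-1][:len(truth)]
    top = {int(i) for i in top}
    return len(top & truth) == len(top | truth)

def precision_at_k(matrix, true_cues, outcome_id, at=5):
    """True iff the outcome is among the `at` most active outcomes given the
    cues it truly consists of (activation = summed association weights)."""
    cue_ids = sorted(true_cues[outcome_id])
    activations = matrix[cue_ids, :].sum(axis=0)
    top = np.argsort(activations)[::-1][:at]
    return outcome_id in {int(i) for i in top}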
def tag_words(input_file,
              test_set,
              celex_dir,
              pos_mapping,
              output_folder,
              method='freq',
              evaluation='count',
              k=50,
              flush=0,
              threshold=0,
              separator='~',
              reduced=False,
              outcomes='tokens',
              boundaries=True,
              uniphones=True,
              diphones=False,
              triphones=False,
              syllable=False,
              stress_marker=False,
              alpha=0.01,
              beta=0.01,
              lam=1.0,
              longitudinal=False,
              at=5):
    """
    :param input_file:          a .json file containing transcripts of child-caregiver interactions extracted from the
                                CHILDES database. The json file consists of two lists of lists of the same length,
                                both containing utterances, encoded differently: the first encodes each utterance as a
                                list of tokens; the second encodes each utterance as a list of lemmas and
                                Part-of-Speech tags, joined by a vertical bar ('|')
    :param test_set:            a dictionary mapping the file name to:
                                - test_set['filename']: the basename of the file
                                - test_set['items']: the set of phonological forms to be categorized, complete with the
                                   target PoS tag (phonological form and PoS tag are separated by a vertical bar ('|'))
    :param celex_dir:           a string specifying the path to the Celex directory
    :param pos_mapping:         a .txt file mapping CHILDES PoS tags to CELEX tags
    :param output_folder:       the path to the folder where the logfiles will be saved
    :param method:              a string indicating the way in which the function looks at top active outcomes; two
                                options are available:
                                - 'freq' makes the function compute the distribution of PoS tags over the k top active
                                    nodes (see the explanation of the parameter k) and rank PoS tags according to their
                                    frequency among the top active outcomes
                                - 'sum' makes the function compute the sum of activation from all outcomes belonging to
                                    a given PoS tag within the k top active outcomes given the input cues, and rank PoS
                                    tags according to their total activation among the top active outcomes (a sketch of
                                    both rankings follows this function)
    :param evaluation:          a string indicating how to compare baseline activations to item-triggered ones; two
                                options are available:
                                - 'count', simply tag the test item with the PoS tag that either was most frequent or
                                    had the highest summed activation within the top active outcomes; the frequency or
                                    activation is returned and can be correlated to reaction times
                                - 'distr', compare the frequency counts or summed activations generated by a specific
                                    test item to the frequency counts or summed activations at baseline and tag the
                                    test item with the PoS tag receiving the highest support from the change in the
                                    distribution of frequencies or summed activations (a statistic is returned,
                                    Chi-squared for frequency distributions and a t-test for summed activations, whose
                                    value can be correlated to reaction times)
    :param k:                   an integer specifying how many elements to consider from the baseline activations and
                                the activations triggered by a specific test item. By default, the top 50 outcomes are
                                considered, and compared according to the chosen combination of method and evaluation
    :param flush:               an integer specifying whether (and how many) top active outcomes at baseline to flush
                                away from subsequent computations. It may be the case that, whatever the input cues,
                                the same high-frequency outcomes come out as the most active. It may then make sense
                                not to consider them when evaluating the distribution of lexical categories over the
                                most active outcomes given an input item
    :param threshold:           the minimum activation of an outcome to be considered in the list of top activated
                                neighbors, default is 0 and shouldn't be lowered, but can be increased.
    :param separator:           the character that separates the word baseform from its PoS tag in the input corpus
    :param reduced:             a boolean specifying whether reduced phonological forms should be extracted from Celex
                                whenever possible (if set to True) or if standard phonological forms should be
                                preserved (if False)
    :param outcomes:            a string indicating which outcomes to use, whether 'tokens' (default) or 'lemmas'
    :param boundaries:          a boolean specifying whether word boundaries are to be considered when training on full
                                utterances
    :param uniphones:           a boolean indicating whether single phonemes are to be considered while encoding input
                                utterances
    :param diphones:            a boolean indicating whether sequences of two phonemes are to be considered while
                                encoding input utterances
    :param triphones:           a boolean indicating whether sequences of three phonemes are to be considered while
                                encoding input utterances
    :param syllable:            a boolean indicating whether syllables are to be considered while encoding input
                                utterances
    :param stress_marker:       a boolean indicating whether stress markers from the phonological representations of
                                Celex need to be preserved or can be discarded
    :param alpha:               a number indicating cue salience. For simplicity, we assume that every cue has the same
                                salience, so changing the value of this parameter does not affect the relative strength
                                of cue-outcome associations but only their absolute magnitude
    :param beta:                a number indicating the learning rate for positive and negative situations. Again, we
                                make the simplifying assumption that our simulated learners are equally affected by
                                positive and negative feedback. Changing the beta value can have a significant impact
                                on the learning outcome, but 0.1 is a standard choice for this model. If the number of
                                learning trials or the number of different cues in a learning trial are very large,
                                both beta and alpha need to be lowered considerably
    :param lam:                 the maximum amount of association that an outcome can receive from all the cues. It
                                simply acts as a scaling factor, so changing its value has the same effect as changing
                                alpha
    :param longitudinal:        a boolean specifying whether to work in a longitudinal setting or not
    :param at:                  the number of top active outcomes to consider to compute precision
    :return accuracies:         a dictionary mapping the categorization accuracy on the PoS tagging experiment to each
                                time index (1 if the longitudinal parameter is set to False, 10 if it's set to True)
    :return entropies:          a dictionary mapping the normalized entropy of the distribution of the PoS tags
                                assigned by the model to each time index (1 if the longitudinal parameter is set to
                                False, 10 if it's set to True)
    :return most_frequents:     a dictionary mapping the PoS tag that was applied the most by the model to each time
                                index (1 if the longitudinal parameter is set to False, 10 if it's set to True)
    :return frequencies:        a dictionary mapping the frequency count of the most frequent PoS tag applied by the
                                model, to each time index (1 if the longitudinal parameter is set to False, 10 if it's
                                set to True)
    """

    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    encoded_corpus = corpus_encoder(input_file,
                                    celex_dir,
                                    pos_mapping,
                                    separator=separator,
                                    uniphones=uniphones,
                                    diphones=diphones,
                                    triphones=triphones,
                                    syllables=syllable,
                                    stress_marker=stress_marker,
                                    reduced=reduced,
                                    outcomes=outcomes,
                                    boundaries=boundaries)

    file_paths = ndl(encoded_corpus,
                     alpha=alpha,
                     beta=beta,
                     lam=lam,
                     longitudinal=longitudinal)

    # for each test item, compute the items from the matrix of weights that are most activated given the cues in the
    # item, get the PoS tag that is most present among the most active lexical nodes and check whether the predicted
    # PoS tag matches the gold-standard one provided along the test item. Return a global score indicating the accuracy
    # on the test set
    accuracies = {}
    entropies = {}
    most_frequents = {}
    frequencies = {}
    log_dicts = {}

    celex_dict = get_celex_dictionary(celex_dir, reduced=reduced)

    for idx, file_path in file_paths.items():

        logfile = make_log_file(input_file,
                                test_set['filename'],
                                output_folder,
                                method,
                                evaluation,
                                flush,
                                k,
                                at,
                                idx,
                                reduced=reduced,
                                uniphones=uniphones,
                                diphones=diphones,
                                triphones=triphones,
                                syllables=syllable,
                                stress_marker=stress_marker,
                                outcomes=outcomes,
                                boundaries=boundaries)

        if os.path.exists(logfile):
            print()
            print(
                "The file %s already exists, statistics for the corresponding parametrization are loaded from it"
                % logfile)
            log_dict = json.load(open(logfile, "r"))

        else:
            print()
            matrix, cues2ids, outcomes2ids = load(file_path)

            # get the column ids of all perfectly discriminated outcomes at the current time point
            # perfectly discriminated outcomes are considered to be those:
            # - whose Jaccard coefficient between true phonetic cues and most active phonetic cues for the outcome is 1
            # - and that appear among the top active outcomes given the cues they consist of
            corpus_folder = os.path.dirname(encoded_corpus)
            discriminated_file = os.path.join(
                corpus_folder,
                '.'.join(['discriminatedOutcomes',
                          str(int(idx)), 'json']))
            if not os.path.exists(discriminated_file):
                discriminated = find_discriminated(matrix,
                                                   cues2ids,
                                                   outcomes2ids,
                                                   celex_dict,
                                                   stress_marker=stress_marker,
                                                   uniphones=uniphones,
                                                   diphones=diphones,
                                                   triphones=triphones,
                                                   syllables=syllable,
                                                   boundaries=boundaries,
                                                   at=at)
                # cache the discriminated outcomes so later runs can reload them, as in the examples above
                json.dump(discriminated, open(discriminated_file, 'w'))
            else:
                discriminated = json.load(open(discriminated_file, 'r'))

            print()
            print(
                strftime("%Y-%m-%d %H:%M:%S") +
                ": Start test phase, using %s as weight matrix and %s as test set..."
                % (os.path.basename(file_path),
                   os.path.basename(test_set['filename'])))

            log_dict = categorize(test_set['items'],
                                  matrix,
                                  cues2ids,
                                  discriminated,
                                  method=method,
                                  evaluation=evaluation,
                                  flush=flush,
                                  k=k,
                                  threshold=threshold,
                                  stress_marker=stress_marker,
                                  syllables=syllable,
                                  uniphones=uniphones,
                                  diphones=diphones,
                                  triphones=triphones,
                                  boundaries=boundaries)
            json.dump(log_dict, open(logfile, 'w'))

            print(strftime("%Y-%m-%d %H:%M:%S") + ": ...completed test phase.")

        f1, h, pos, freq = compute_summary_statistics(log_dict)

        accuracies[idx] = f1
        entropies[idx] = h
        most_frequents[idx] = pos
        frequencies[idx] = freq
        log_dicts[idx] = log_dict

        print("Accuracy: %0.5f" % f1)
        print()

    return log_dicts, accuracies, entropies, most_frequents, frequencies
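
# The 'freq' and 'sum' methods described in the docstring above rank PoS tags
# over the k most active outcomes. A minimal sketch, assuming outcomes are
# strings of the form 'word|TAG' and that `activations` maps each outcome to
# its summed activation given the cues of the test item:

from collections import Counter, defaultdict

def rank_pos_tags(activations, k=50, method='freq'):
    """Rank PoS tags over the k top active outcomes.

    method='freq': score a tag by how often it occurs among the top k.
    method='sum':  score a tag by its total activation within the top k.
    """
    top_k = sorted(activations, key=activations.get, reverse=True)[:k]
    if method == 'freq':
        scores = Counter(outcome.split('|')[1] for outcome in top_k)
    else:
        scores = defaultdict(float)
        for outcome in top_k:
            scores[outcome.split('|')[1]] += activations[outcome]
    return sorted(scores.items(), key=lambda kv: kv[1], reverse=True)

# With evaluation='count' the test item would simply receive the first tag in
# this ranking; with evaluation='distr' the same scores would be compared to
# the scores obtained at baseline, as described in the docstring.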
def write_learning_events(corpus_file,
                          output_folder,
                          celex_dir,
                          pos_dict,
                          separator='~',
                          uni_phones=False,
                          di_phones=False,
                          tri_phones=True,
                          syllable=False,
                          stress_marker=True,
                          boundaries=False):
    """
    :param corpus_file:         a path pointing to the .json object to be used as input corpus, consisting of two
                                aligned lists of lists, meaning that the second-order lists at the same position in
                                the two first-order lists refer to the same utterance; the first list contains
                                utterances encoded as lists of tokens, the second list contains utterances encoded as
                                lists of lemmas and PoS tags
    :param output_folder:       the path to a folder where the output files for cues and outcomes will be written to
    :param celex_dir:           the path to the directory where the Celex dictionary is to be found (if no dictionary
                                is found at the given location, one is built on the fly)
    :param pos_dict:            a dictionary mapping CHILDES PoS tags to corresponding Celex PoS tags
    :param separator:           a string indicating the character separating lemmas from PoS tags in the input corpus
    :param uni_phones:          a boolean indicating whether uni-phones are relevant phonetic cues
    :param di_phones:           a boolean indicating whether di-phones are relevant phonetic cues
    :param tri_phones:          a boolean indicating whether tri-phones are relevant phonetic cues
    :param syllable:            a boolean indicating whether syllables are relevant phonetic cues
    :param stress_marker:       a boolean indicating whether the stress marker from the Celex phonetic transcriptions
                                is to be preserved or discarded
    :param boundaries:          a boolean specifying whether word boundaries are to be considered when encoding the
                                utterances
    """

    celex_dict = get_celex_dictionary(celex_dir, reduced=False)
    tokens2identifiers = tokens2ids(celex_dict)
    pos_dict = get_pos_mapping(pos_dict)
    corpus = json.load(open(corpus_file, 'r+'))

    # derive the paths of the two output files (phonological cues and lexical outcomes) from the output folder

    if not os.path.exists(output_folder):
        os.makedirs(output_folder)
    cue_file = os.path.join(output_folder, "cues.txt")
    outcome_file = os.path.join(output_folder, "outcomes.txt")

    # check whether the output files corresponding to the desired parameters already exist and stop if they do
    if os.path.isfile(cue_file) and os.path.isfile(outcome_file):
        print()
        print(
            "The desired encoded version of the input corpus '%s' already exists at files '%s' and '%s'."
            % (os.path.basename(corpus_file), os.path.basename(cue_file),
               os.path.basename(outcome_file)))
        return cue_file, outcome_file
    else:

        print(
            strftime("%Y-%m-%d %H:%M:%S") +
            ": Started encoding utterances from input corpus '%s'" %
            corpus_file)

        # get the corpus recoded into phonological cues and lexical outcomes
        cues, outcomes = encode_corpus(corpus,
                                       celex_dict,
                                       tokens2identifiers,
                                       pos_dict,
                                       separator=separator,
                                       uni_phones=uni_phones,
                                       di_phones=di_phones,
                                       tri_phones=tri_phones,
                                       syllable=syllable,
                                       stress_marker=stress_marker,
                                       boundaries=boundaries)
        print()
        print(
            strftime("%Y-%m-%d %H:%M:%S") +
            ": Finished encoding utterances from input corpus '%s'" %
            corpus_file)
        print()

        corpus2txt(cues, cue_file)
        print()
        print(
            strftime("%Y-%m-%d %H:%M:%S") +
            ": Written encoded cues to '%s'" % cue_file)
        print()

        corpus2txt(outcomes, outcome_file)
        print()
        print(
            strftime("%Y-%m-%d %H:%M:%S") +
            ": Written encoded outcomes to '%s'" % outcome_file)
        print()
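
# The encoders used throughout these examples slice each phonological word
# into n-phone cues. A minimal sketch, assuming one character per phoneme
# (as in CELEX DISC transcriptions), '+' as the word-boundary symbol when
# boundaries are encoded, and stress markers already handled upstream:

def nphone_cues(phonemes, uni=False, di=False, tri=True, boundaries=True):
    """Return the set of uni-, di- and/or tri-phone cues for one word."""
    s = '+' + phonemes + '+' if boundaries else phonemes
    cues = set()
    if uni:
        cues.update(s)                                      # single symbols (includes '+' in this simplification)
    if di:
        cues.update(s[i:i + 2] for i in range(len(s) - 1))  # overlapping 2-grams
    if tri:
        cues.update(s[i:i + 3] for i in range(len(s) - 2))  # overlapping 3-grams
    return cues

# nphone_cues('k{t') with the defaults yields {'+k{', 'k{t', '{t+'}
# ('k{t' is the DISC transcription of 'cat').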