def main():

    parser = argparse.ArgumentParser(
        description='Process arguments to create Celex dictionary.')

    parser.add_argument(
        "-I",
        "--input_file",
        required=True,
        dest="in_file",
        help="Specify the corpus to be used as input (encoded as .json).")
    parser.add_argument("-C",
                        "--Celex_dir",
                        required=True,
                        dest="celex_dir",
                        help="Specify the path to the Celex directory.")
    parser.add_argument(
        "-M",
        "--pos_mapping",
        required=True,
        dest="pos_mapping",
        help=
        "Specify the path to the file containing the mapping between CHILDES and Celex PoS tags."
    )
    parser.add_argument(
        "-S",
        "--separator",
        dest="sep",
        default='~',
        help=
        "Specify the character separating lemma and PoS tag in the input corpus."
    )
    parser.add_argument(
        "-o",
        "--outcomes",
        dest="outcomes",
        default='tokens',
        help=
        "Specify whethere to use 'lemmas' or 'tokens' (default) as lexical outcomes."
    )
    parser.add_argument("-u",
                        "--uniphones",
                        action="store_true",
                        dest="uni",
                        help="Specify if uniphones need to be encoded.")
    parser.add_argument("-d",
                        "--diphones",
                        action="store_true",
                        dest="di",
                        help="Specify if diphones need to be encoded.")
    parser.add_argument("-t",
                        "--triphones",
                        action="store_true",
                        dest="tri",
                        help="Specify if triphones need to be encoded.")
    parser.add_argument("-s",
                        "--syllables",
                        action="store_true",
                        dest="syl",
                        help="Specify if syllables need to be encoded.")
    parser.add_argument("-m",
                        "--stress_marker",
                        action="store_true",
                        dest="stress",
                        help="Specify if stress need to be encoded.")
    parser.add_argument(
        "-r",
        "--reduced",
        action="store_true",
        dest="reduced",
        help=
        "Specify if reduced vowels are to be considered when extracting CELEX phonetic forms."
    )
    parser.add_argument(
        "-b",
        "--boundaries",
        action="store_true",
        dest="boundaries",
        help=
        "Specify whether word boundaries are to be considered when training on utterances."
    )

    args = parser.parse_args()

    check_arguments(args, parser)

    corpus_encoder(args.in_file,
                   args.celex_dir,
                   args.pos_mapping,
                   separator=args.sep,
                   uniphones=args.uni,
                   diphones=args.di,
                   triphones=args.tri,
                   syllables=args.syl,
                   stress_marker=args.stress,
                   reduced=args.reduced,
                   outcomes=args.outcomes,
                   boundaries=args.boundaries)
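
# A hypothetical invocation of the script above (the script file name
# "encode_corpus.py" is only an assumed placeholder), requesting triphone and
# syllable cues with stress markers and word boundaries:
#
#   python encode_corpus.py -I corpus.json -C celex/ -M pos_mapping.txt -t -s -m -b
#
# which, after argument checking, amounts to the following direct call:
#
#   corpus_encoder('corpus.json', 'celex/', 'pos_mapping.txt', separator='~',
#                  uniphones=False, diphones=False, triphones=True,
#                  syllables=True, stress_marker=True, reduced=False,
#                  outcomes='tokens', boundaries=True)
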
def main():

    parser = argparse.ArgumentParser(
        description=
        "Compute the variance of each phonological cue and token, as a proxy "
        "to identify the amount of information they carry")

    parser.add_argument(
        "-c",
        "--corpus",
        required=True,
        dest="corpus",
        help="Specify the path to the training corpus (encoded as .json).")
    parser.add_argument(
        "-C",
        "--celex_folder",
        required=True,
        dest="celex_folder",
        help="Specify the folder where the Celex data are located.")
    parser.add_argument(
        "-O",
        "--output_folder",
        required=True,
        dest="output_folder",
        help=
        "Specify the path of the folder where the logfiles will be stored together with"
        "the summary tables.")
    parser.add_argument(
        "-M",
        "--pos_mapping",
        required=True,
        dest="pos_mapping",
        help=
        "Specify the path of the file containing the mapping from CHILDES to Celex PoS tags."
    )
    parser.add_argument(
        "-p",
        "--precision",
        dest="precision",
        default=5,
        help=
        "Specify the number of outcomes to consider when computing discrimination's precision."
    )
    parser.add_argument(
        "-l",
        "--longitudinal",
        action="store_true",
        dest="longitudinal",
        help="Specify whether to use a longitudinal design (default: False).")

    args = parser.parse_args()

    cues = ['triphones']
    outcomes = ['tokens']
    stress_marker = [True]
    boundaries = [True]
    reduced_vowels = [False]
    number_of_cues = [100, 1000]

    if not os.path.exists(args.output_folder):
        os.makedirs(args.output_folder)

    cues_variance_file = os.path.join(args.output_folder,
                                      "words_cues_variance.csv")
    tokens_variance_file = os.path.join(args.output_folder,
                                        "words_tokens_variance.csv")

    parametrizations = it.product(cues, outcomes, stress_marker, boundaries,
                                  reduced_vowels)

    time_points = np.linspace(10, 100, 10) if args.longitudinal else [100]

    cues_table = pd.DataFrame(
        index=[],
        columns=[
            "Corpus", "Cues", "Outcomes", "Stress", "Boundaries", "Vowels",
            "Precision", "Time", "Phonological_cue", "Variance", "Frequency",
            "Lexical_diversity", "Phonological_diversity",
            "Cue|Cues_predictability", "Cue|Tokens_predictability",
            "Cues|Cue_predictability", "Tokens|Cue_predictability"
        ])

    tokens_table = pd.DataFrame(
        index=[],
        columns=[
            "Corpus", "Cues", "Outcomes", "Stress", "Boundaries", "Vowels",
            "Precision", "Time", "numCues", "Token", "Variance", "Frequency",
            "Lexical_diversity", "Phonological_diversity",
            "Token|Tokens_predictability", "Token|Cues_predictability",
            "Tokens|Token_predictability", "Cues|Token_predictability"
        ])

    ii = 0
    jj = 0

    for parametrization in parametrizations:

        print(parametrization)

        cue_type, outcome, stress, boundary, reduced = parametrization

        uniphones = True if cue_type == 'uniphones' else False
        diphones = True if cue_type == 'diphones' else False
        triphones = True if cue_type == 'triphones' else False
        syllables = True if cue_type == 'syllables' else False
        vowels = 'reduced' if reduced else 'full'
        sm = "stress" if stress else 'no-stress'
        bound = 'yes' if boundary else 'no'
        training = os.path.splitext(os.path.basename(args.corpus))[0]

        encoded_corpus = corpus_encoder(args.corpus,
                                        args.celex_folder,
                                        args.pos_mapping,
                                        separator='~',
                                        stress_marker=stress,
                                        reduced=reduced,
                                        uniphones=uniphones,
                                        diphones=diphones,
                                        triphones=triphones,
                                        syllables=syllables,
                                        outcomes=outcome,
                                        boundaries=boundary)

        print(
            strftime("%Y-%m-%d %H:%M:%S") +
            ": Started computing distributional statistics from the corpus...")
        token_statistics, cue_statistics = usf.compute_distributional_predictors(
            encoded_corpus, time_points)
        print(
            strftime("%Y-%m-%d %H:%M:%S") +
            ": ...finished computing distributional statistics from the corpus."
        )
        print()

        corpus_dir = os.path.dirname(encoded_corpus)

        if training == 'aggregate_utterances':
            a, b = 0.001, 0.001
        else:
            a, b = 0.01, 0.01
        file_paths = ndl(encoded_corpus,
                         alpha=a,
                         beta=b,
                         lam=1,
                         longitudinal=args.longitudinal)

        celex_dict = get_celex_dictionary(args.celex_folder, reduced=reduced)

        for idx, file_path in file_paths.items():

            idx = int(idx)
            matrix, cues2ids, outcomes2ids = load(file_path)

            # get the column ids of all perfectly discriminated outcomes at the current time point
            # perfectly discriminated outcomes are considered to be those whose jaccard coefficient
            # between true phonetic cues and most active phonetic cues for the outcome is 1
            discriminated_file = os.path.join(
                corpus_dir, '.'.join([
                    'discriminatedOutcomes',
                    str(idx), ''.join(['at', str(args.precision)]), 'json'
                ]))
            if not os.path.exists(discriminated_file):
                discriminated = find_discriminated(matrix,
                                                   cues2ids,
                                                   outcomes2ids,
                                                   celex_dict,
                                                   stress_marker=stress,
                                                   uniphones=uniphones,
                                                   diphones=diphones,
                                                   triphones=triphones,
                                                   syllables=syllables,
                                                   boundaries=boundary,
                                                   at=int(args.precision))
                json.dump(discriminated, open(discriminated_file, 'w'))
            else:
                discriminated = json.load(open(discriminated_file, 'r'))

            print()
            print(
                strftime("%Y-%m-%d %H:%M:%S") +
                ": The discriminated outcomes have been identified (file: %s)."
                % discriminated_file)
            print()

            row_variances, matrix, discriminated = usf.get_cue_variances(
                matrix, discriminated)
            cue_variances = {}
            for cue in cues2ids:
                cue_idx = cues2ids[cue]
                cue_variances[cue] = row_variances[cue_idx]

            print(
                strftime("%Y-%m-%d %H:%M:%S") +
                ": Started storing cue variances...")
            for cue in cue_variances:
                if len(cue_statistics[cue]['freq']) == 10:

                    frequency = cue_statistics[cue]['freq'][idx]
                    lexical_diversity = cue_statistics[cue]['lexdiv'][idx]
                    phonological_diversity = cue_statistics[cue]['phondiv'][idx]

                    # average conditional probability of a cue given the co-occurring cues
                    cue_cues_predictability = cue_statistics[cue]['p_cue_cues'][idx]

                    # average predictive power of a cue with respect to all the co-occurring cues
                    cues_cue_predictability = cue_statistics[cue]['p_cues_cue'][idx]

                    # average conditional probability of a cue given the co-occurring tokens
                    cue_tokens_predictability = cue_statistics[cue]['p_cue_tokens'][idx]

                    # average predictive power of a cue with respect to all the co-occurring tokens
                    tokens_cue_predictability = cue_statistics[cue]['p_tokens_cue'][idx]

                    cues_table.loc[ii] = pd.Series({
                        "Corpus": training,
                        "Cues": cue_type,
                        "Outcomes": outcome,
                        "Stress": sm,
                        "Boundaries": bound,
                        "Vowels": vowels,
                        "Time": idx,
                        "Precision": int(args.precision),
                        "Phonological_cue": cue,
                        "Variance": cue_variances[cue],
                        "Frequency": frequency,
                        "Lexical_diversity": lexical_diversity,
                        "Phonological_diversity": phonological_diversity,
                        "Cue|Cues_predictability": cue_cues_predictability,
                        "Cues|Cue_predictability": cues_cue_predictability,
                        "Cue|Tokens_predictability": cue_tokens_predictability,
                        "Tokens|Cue_predictability": tokens_cue_predictability
                    })
                ii += 1

            print(
                strftime("%Y-%m-%d %H:%M:%S") +
                ": ...finished storing cue variances.")
            print()

            for how_many_cues in number_of_cues:

                print("Number of cues: ", how_many_cues)

                token_variances = usf.get_token_variances(
                    matrix,
                    discriminated,
                    row_variances,
                    how_many_cues=how_many_cues)

                print(
                    strftime("%Y-%m-%d %H:%M:%S") +
                    ": Started storing token variances...")
                for token in token_variances:
                    if len(token_statistics[token]['freq']) == 10:

                        frequency = token_statistics[token]['freq'][idx]
                        lexical_diversity = token_statistics[token]['lexdiv'][idx]
                        phonological_diversity = token_statistics[token]['phondiv'][idx]

                        # average conditional probability of a token given the co-occurring tokens
                        token_tokens_predictability = token_statistics[token]['p_token_tokens'][idx]

                        # average predictive power of a token with respect to the co-occurring tokens
                        tokens_token_predictability = token_statistics[token]['p_tokens_token'][idx]

                        # average conditional probability of a token given the co-occurring phonological cues
                        token_cues_predictability = token_statistics[token]['p_token_cues'][idx]

                        # average predictive power of a token with respect to the co-occurring phonological cues
                        cues_token_predictability = token_statistics[token]['p_cues_token'][idx]

                        tokens_table.loc[jj] = pd.Series({
                            "Corpus": training,
                            "Cues": cue_type,
                            "Outcomes": outcome,
                            "Stress": sm,
                            "Boundaries": bound,
                            "Vowels": vowels,
                            "Time": idx,
                            "numCues": how_many_cues,
                            "Precision": int(args.precision),
                            "Token": token,
                            "Variance": token_variances[token],
                            "Frequency": frequency,
                            "Lexical_diversity": lexical_diversity,
                            "Phonological_diversity": phonological_diversity,
                            "Token|Tokens_predictability": token_tokens_predictability,
                            "Tokens|Token_predictability": tokens_token_predictability,
                            "Token|Cues_predictability": token_cues_predictability,
                            "Cues|Token_predictability": cues_token_predictability
                        })
                    jj += 1
                print(
                    strftime("%Y-%m-%d %H:%M:%S") +
                    ": ...finished storing token variances.")

                print()
                print('-' * 100)
                print()

            print()
            print()
            print('=' * 100)
            print('=' * 100)
            print()
            print()

        print()
        print()
        print()
        print('#' * 100)
        print('#' * 100)
        print('#' * 100)
        print()
        print()
        print()

    if os.path.exists(cues_variance_file):
        cues_table.to_csv(cues_variance_file,
                          sep='\t',
                          index=False,
                          mode="a",
                          header=False)
    else:
        cues_table.to_csv(cues_variance_file, sep='\t', index=False)

    if os.path.exists(tokens_variance_file):
        tokens_table.to_csv(tokens_variance_file,
                            sep='\t',
                            index=False,
                            mode="a",
                            header=False)
    else:
        tokens_table.to_csv(tokens_variance_file, sep='\t', index=False)
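
# A minimal sketch, assuming the cue variance used above is the variance of
# each phonological cue's association weights over the discriminated outcome
# columns; the project's usf.get_cue_variances presumably also filters the
# matrix and the discriminated set, so this only illustrates the core idea.
import numpy as np


def row_variances_sketch(weight_matrix, discriminated_columns):
    # Restrict the cue-by-outcome weight matrix to the discriminated outcome
    # columns and return one variance per cue (i.e. per row).
    submatrix = weight_matrix[:, sorted(discriminated_columns)]
    return np.var(submatrix, axis=1)
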
Example #3
def main():

    parser = argparse.ArgumentParser(
        description=
        "Assess whether words from the same category cluster together"
        "first considering their sound patterns and then how they correlate"
        "to each other based on their contexts of occurrence")

    parser.add_argument(
        "-c",
        "--corpus",
        required=True,
        dest="corpus",
        help="Specify the path to the training corpus (encoded as .json).")
    parser.add_argument(
        "-C",
        "--celex_folder",
        required=True,
        dest="celex_folder",
        help="Specify the folder where the Celex data are located.")
    parser.add_argument(
        "-O",
        "--output_folder",
        required=True,
        dest="output_folder",
        help=
        "Specify the path of the folder where the logfiles will be stored together with"
        "the summary tables.")
    parser.add_argument(
        "-M",
        "--pos_mapping",
        required=True,
        dest="pos_mapping",
        help=
        "Specify the path of the file containing the mapping from CHILDES to Celex PoS tags."
    )
    parser.add_argument(
        "-p",
        "--precision",
        dest="precision",
        default=5,
        help=
        "Specify the number of outcomes to consider when computing discrimination's precision."
    )
    parser.add_argument(
        "--cue_threshold",
        dest="cue_threshold",
        default='high',
        help=
        "Specify whether to choose a 'high' (i.e. strict) the threshold on relevant cues or"
        "a low (i.e. lax) one.")
    parser.add_argument(
        "--token_threshold",
        dest="token_threshold",
        default='low',
        help=
        "Specify whether to choose a 'high' (i.e. strict) the threshold on relevant tokens or"
        "a low (i.e. lax) one.")
    parser.add_argument(
        "-l",
        "--longitudinal",
        action="store_true",
        dest="longitudinal",
        help="Specify whether to use a longitudinal design (default: False).")

    args = parser.parse_args()

    # thresholds have been determined manually according to the following criteria:
    # - high thresholds were set to yield around 100 dimensions at time t100
    # - low thresholds were set to yield around 100 dimensions at time t10
    # Whenever a threshold value didn't yield any dimension because the value was too stringent, the threshold was
    # lowered until at least 1 dimension was available at all time points. First, I adjusted the threshold on
    # phonological cues' variance, then on tokens' variance. Practically, it was always possible to set thresholds to
    # yield around 100 dimensions at the specified time points, except for tokens' variance in the low variance setting,
    # where the threshold yielding around 100 dimensions at t10 quickly left the model without any dimension.
    # Therefore, these models start with considerably high dimensionalities and finish with almost no dimensions.

    thresholds = {
        'aggregate_utterances_at5_c_low':
        0.00000001,  # 99 cues at t10 (799 at t100)
        'aggregate_utterances_at5_c_high':
        0.00000075,  # 105 cues at t100 (2 cues at t10)
        'aggregate_utterances_at5_t_low':
        0.02,  # 741 tokens at t10 (3 tokens at t100)
        'aggregate_utterances_at5_t_high':
        0.04,  # 125 tokens at 100 (843 tokens at t10)
        'aggregate_utterances_at25_c_low':
        0.000000005,  # 98 cues at t10 (802 cues at t100)
        'aggregate_utterances_at25_c_high':
        0.00000025,  # 156 cues at t100 (2 cues at t10)
        'aggregate_utterances_at25_t_low':
        0.015,  # 1632 tokens at t10 (19 tokens at t100)
        'aggregate_utterances_at25_t_high':
        0.033,  # 140 tokens at t100 (1731 at t10)
        'aggregate_words_at5_c_low':
        0.000025,  # 101 cues at t10 (318 cues at t100)
        'aggregate_words_at5_c_high':
        0.00005,  # 131 cues at t100 (40 cues at t10)
        'aggregate_words_at5_t_low':
        0.0325,  # 834 tokens at t10 (3 tokens at t100)
        'aggregate_words_at5_t_high':
        0.07,  # 95 tokens at t100 (1004 tokens at t10)
        'aggregate_words_at25_c_low':
        0.0000075,  # 117 cues at t10 (416 cues at t100)
        'aggregate_words_at25_c_high':
        0.00002,  # 124 cues at t100 (33 cues at t10)
        'aggregate_words_at25_t_low':
        0.0295,  # 1850 tokens at t10 (7 tokens at t100)
        'aggregate_words_at25_t_high': 0.085  # 113 tokens at t100 (2581)
    }

    cues = ['triphones']
    outcomes = ['tokens']
    stress_marker = [True]
    boundaries = [True]
    reduced_vowels = [False]
    distances = ['correlation']

    if not os.path.exists(args.output_folder):
        os.makedirs(args.output_folder)

    summary_file = os.path.join(args.output_folder, "LDAt_summary.csv")
    # error_file = os.path.join(args.output_folder, "PoStagging_errors.csv")

    parametrizations = it.product(cues, outcomes, stress_marker, boundaries,
                                  reduced_vowels, distances)

    time_points = np.linspace(10, 100, 10) if args.longitudinal else [100]
    rows = int(
        np.prod([
            len(cues),
            len(outcomes),
            len(stress_marker),
            len(reduced_vowels),
            len(boundaries),
            len(time_points),
            len(distances)
        ]))
    summary_table = pd.DataFrame(
        index=np.arange(0, rows),
        columns=[
            "Corpus", "Cues", "Outcomes", "Stress", "Boundaries", "Vowels",
            "Precision", "Time", "Distance", "tCues", "numCues", "tTokens",
            "numTokens", "Phon_acc", "Phon_acc_subset", "Phon_baseline",
            "Distr_acc", "Distr_acc_subset", "Distr_baseline"
        ])

    ii = 0
    for parametrization in parametrizations:

        cue, outcome, stress, boundary, reduced, distance = parametrization

        uniphones = True if cue == 'uniphones' else False
        diphones = True if cue == 'diphones' else False
        triphones = True if cue == 'triphones' else False
        syllables = True if cue == 'syllables' else False
        vowels = 'reduced' if reduced else 'full'
        sm = "stress" if stress else 'no-stress'
        bound = 'yes' if boundary else 'no'
        training = os.path.splitext(os.path.basename(args.corpus))[0]

        encoded_corpus = corpus_encoder(args.corpus,
                                        args.celex_folder,
                                        args.pos_mapping,
                                        separator='~',
                                        stress_marker=stress,
                                        reduced=reduced,
                                        uniphones=uniphones,
                                        diphones=diphones,
                                        triphones=triphones,
                                        syllables=syllables,
                                        outcomes=outcome,
                                        boundaries=boundary)

        corpus_dir = os.path.dirname(encoded_corpus)

        # precision at 25
        if training == 'aggregate_utterances':
            a, b = 0.001, 0.001
        else:
            a, b = 0.01, 0.01
        c = thresholds['_'.join([
            training, ''.join(['at', str(args.precision)]), 'c',
            args.cue_threshold
        ])]
        t = thresholds['_'.join([
            training, ''.join(['at', str(args.precision)]), 't',
            args.token_threshold
        ])]

        file_paths = ndl(encoded_corpus,
                         alpha=a,
                         beta=b,
                         lam=1,
                         longitudinal=args.longitudinal)

        celex_dict = get_celex_dictionary(args.celex_folder, reduced=reduced)

        for idx, file_path in sorted(file_paths.items(),
                                     key=operator.itemgetter(0)):

            matrix, cues2ids, outcomes2ids = load(file_path)

            # get the column ids of all perfectly discriminated outcomes at the current time point
            # perfectly discriminated outcomes are considered to be those whose jaccard coefficient
            # between true phonetic cues and most active phonetic cues for the outcome is 1
            discriminated_file = os.path.join(
                corpus_dir, '.'.join([
                    'discriminatedOutcomes',
                    str(int(idx)), ''.join(['at', str(args.precision)]), 'json'
                ]))
            if not os.path.exists(discriminated_file):
                discriminated = find_discriminated(matrix,
                                                   cues2ids,
                                                   outcomes2ids,
                                                   celex_dict,
                                                   stress_marker=stress,
                                                   uniphones=uniphones,
                                                   diphones=diphones,
                                                   triphones=triphones,
                                                   syllables=syllables,
                                                   boundaries=boundary,
                                                   at=int(args.precision))
                json.dump(discriminated, open(discriminated_file, 'w'))
            else:
                discriminated = json.load(open(discriminated_file, 'r'))

            print()
            print(
                "The discriminated outcomes have been identified (file: %s)." %
                discriminated_file)

            accuracies = threshold_experiment(matrix,
                                              discriminated,
                                              cues_threshold=c,
                                              tokens_threshold=t)

            summary_table.loc[ii] = pd.Series({
                "Corpus": training,
                "Cues": cue,
                "Outcomes": outcome,
                "Stress": sm,
                "Boundaries": bound,
                "Vowels": vowels,
                "Time": int(idx),
                "Distance": distance,
                "Precision": args.precision,
                "tCues": args.cue_threshold,
                "numCues": accuracies[3],
                "tTokens": args.token_threshold,
                "numTokens": accuracies[7],
                "Phon_acc": accuracies[0],
                "Phon_acc_subset": accuracies[1],
                "Distr_acc": accuracies[4],
                "Distr_acc_subset": accuracies[5],
                "Phon_baseline": accuracies[2],
                "Distr_baseline": accuracies[6]
            })
            ii += 1

    if os.path.exists(summary_file):
        summary_table.to_csv(summary_file,
                             sep='\t',
                             index=False,
                             mode="a",
                             header=False)
    else:
        summary_table.to_csv(summary_file, sep='\t', index=False)
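
# A minimal sketch of the thresholding idea documented in the comment block
# above: keep as dimensions only the cues (rows) and tokens (columns) whose
# variance exceeds the chosen thresholds. threshold_experiment itself is not
# shown in this file, so this is only an assumption about the selection step.
import numpy as np


def select_by_variance_sketch(weight_matrix, cue_threshold, token_threshold):
    # Variance of every cue across outcomes and of every token across cues.
    cue_variances = np.var(weight_matrix, axis=1)
    token_variances = np.var(weight_matrix, axis=0)
    # Indices of the rows/columns that survive the respective thresholds.
    kept_cues = np.where(cue_variances > cue_threshold)[0]
    kept_tokens = np.where(token_variances > token_threshold)[0]
    return kept_cues, kept_tokens
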
Example #4
def cluster_words(corpus,
                  output_folder,
                  celex_folder,
                  pos_mapping,
                  distance='cosine',
                  reduced=False,
                  outcomes='tokens',
                  uniphones=False,
                  diphones=False,
                  triphones=True,
                  syllables=False,
                  stress_marker=True,
                  boundaries=True,
                  at=5,
                  nn=25,
                  a=0.01,
                  b=0.01,
                  longitudinal=False):
    """
    :param corpus:          the corpus to be used for training the model
    :param output_folder:   the folder where the logfile of the clustering experiment will be saved
    :param celex_folder:    the folder containing the data from the Celex database
    :param pos_mapping:     the path to the file mapping CHILDES pos tags to Celex tags
    :param distance:        a string (either 'correlation' or 'cosine') indicating which distance metric to use
    :param reduced:         a boolean indicating whether to use reduced or full phonetic transcriptions from Celex
    :param outcomes:        a string (either 'tokens' or 'lemmas') indicating which outcomes to consider for learning
    :param uniphones:       a boolean indicating whether to consider uniphones as cues
    :param diphones:        a boolean indicating whether to consider diphones as cues
    :param triphones:       a boolean indicating whether to consider triphones as cues
    :param syllables:       a boolean indicating whether to consider syllables as cues
    :param stress_marker:   a boolean indicating whether to consider or discard stress information
    :param boundaries:      a boolean indicating whether to consider or discard word boundaries
    :param at:              an integer indicating how many outcomes to consider when computing discrimination's precision
    :param nn:              an integer indicating how many nearest neighbors to consider when evaluating clustering
    :param a:               the alpha parameter from the Rescorla-Wagner model
    :param b:               the beta parameter from the Rescorla-Wagner model
    :param longitudinal:    a boolean indicating whether to adopt a longitudinal design or not
    :return accuracies:     a dictionary mapping time indices to the clustering accuracy obtained at that time point
    """

    if not os.path.exists(output_folder):
        os.makedirs(output_folder)
    plot_folder = os.path.join(output_folder, 'plots')
    if not os.path.exists(plot_folder):
        os.makedirs(plot_folder)

    encoded_corpus = corpus_encoder(corpus,
                                    celex_folder,
                                    pos_mapping,
                                    separator='~',
                                    stress_marker=stress_marker,
                                    reduced=reduced,
                                    uniphones=uniphones,
                                    diphones=diphones,
                                    triphones=triphones,
                                    syllables=syllables,
                                    outcomes=outcomes,
                                    boundaries=boundaries)

    corpus_dir = os.path.dirname(encoded_corpus)

    file_paths = ndl(encoded_corpus,
                     alpha=a,
                     beta=b,
                     lam=1,
                     longitudinal=longitudinal)

    celex_dict = get_celex_dictionary(celex_folder, reduced=reduced)

    accuracies = {}
    for idx, file_path in file_paths.items():

        logfile = make_log_file(corpus,
                                output_folder,
                                'json',
                                dist=distance,
                                nn=nn,
                                at=at,
                                time=idx,
                                outcomes=outcomes,
                                reduced=reduced,
                                stress_marker=stress_marker,
                                boundaries=boundaries,
                                syllables=syllables,
                                uniphones=uniphones,
                                diphones=diphones,
                                triphones=triphones)
        plotfile = make_log_file(corpus,
                                 plot_folder,
                                 'pdf',
                                 dist=distance,
                                 nn=nn,
                                 at=at,
                                 time=idx,
                                 outcomes=outcomes,
                                 reduced=reduced,
                                 stress_marker=stress_marker,
                                 boundaries=boundaries,
                                 syllables=syllables,
                                 uniphones=uniphones,
                                 diphones=diphones,
                                 triphones=triphones)

        if os.path.exists(logfile):
            print()
            print(
                "The file %s already exists, statistics for the corresponding "
                "parametrization are loaded from it" % logfile)
            clusters = json.load(open(logfile, "r"))

        else:
            print()
            matrix, cues2ids, outcomes2ids = load(file_path)

            # get the column ids of all perfectly discriminated outcomes at the current time point
            # perfectly discriminated outcomes are considered to be those whose jaccard coefficient
            # between true phonetic cues and most active phonetic cues for the outcome is 1
            discriminated_file = os.path.join(
                corpus_dir,
                '.'.join(['discriminatedOutcomes',
                          str(int(idx)), 'json']))
            if not os.path.exists(discriminated_file):
                discriminated = find_discriminated(matrix,
                                                   cues2ids,
                                                   outcomes2ids,
                                                   celex_dict,
                                                   stress_marker=stress_marker,
                                                   uniphones=uniphones,
                                                   diphones=diphones,
                                                   triphones=triphones,
                                                   syllables=syllables,
                                                   boundaries=boundaries,
                                                   at=at)
                json.dump(discriminated, open(discriminated_file, 'w'))
            else:
                discriminated = json.load(open(discriminated_file, 'r'))

            print()
            print(
                strftime("%Y-%m-%d %H:%M:%S") +
                ": Start clustering, using %s as weight matrix..." %
                (os.path.basename(file_path)))
            if distance == 'cosine':
                similarities, discriminated = sim.pairwise_cos(
                    matrix, discriminated, plot_path=plotfile)
            else:
                similarities, discriminated = sim.pairwise_corr(
                    matrix, discriminated, plot_path=plotfile)

            df = sim.sim2df(similarities, discriminated)
            similarities_file = os.path.join(
                corpus_dir,
                '.'.join(['similarities', distance,
                          str(int(idx)), 'csv']))
            df.to_csv(similarities_file, sep='\t')

            clusters = sim.neighborhood(discriminated, similarities, nn=nn)
            json.dump(clusters, open(logfile, 'w'))
            print(strftime("%Y-%m-%d %H:%M:%S") + ": ...completed test phase.")

        accuracy, baseline_acc, h, baseline_h = sim.clustering_precision(
            clusters)
        accuracies[idx] = {
            'accuracy': accuracy,
            'baseline_acc': baseline_acc,
            'entropy': h,
            'baseline_entr': baseline_h
        }

    return accuracies
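
# A hypothetical call to cluster_words (all paths are placeholders, not taken
# from the source), running a longitudinal clustering experiment with the
# default triphone cues and cosine distances:
#
#   accuracies = cluster_words('corpora/aggregate_words.json',
#                              'output/clustering/',
#                              'celex/',
#                              'mappings/pos_mapping.txt',
#                              distance='cosine',
#                              nn=25,
#                              at=5,
#                              longitudinal=True)
#
# The returned dictionary maps each time index to its clustering accuracy,
# baseline accuracy, entropy, and baseline entropy.
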
Example #5
def main():

    parser = argparse.ArgumentParser(
        description=
        "Assess whether words from the same category cluster together"
        "first considering their sound patterns and then how they correlate"
        "to each other based on their contexts of occurrence")

    parser.add_argument(
        "-c",
        "--corpus",
        required=True,
        dest="corpus",
        help="Specify the path to the training corpus (encoded as .json).")
    parser.add_argument(
        "-C",
        "--celex_folder",
        required=True,
        dest="celex_folder",
        help="Specify the folder where the Celex data are located.")
    parser.add_argument(
        "-O",
        "--output_folder",
        required=True,
        dest="output_folder",
        help=
        "Specify the path of the folder where the logfiles will be stored together with"
        "the summary tables.")
    parser.add_argument(
        "-M",
        "--pos_mapping",
        required=True,
        dest="pos_mapping",
        help=
        "Specify the path of the file containing the mapping from CHILDES to Celex PoS tags."
    )
    parser.add_argument(
        "-p",
        "--precision",
        dest="precision",
        default=5,
        help=
        "Specify the number of outcomes to consider when computing discrimination's precision."
    )
    parser.add_argument(
        "-l",
        "--longitudinal",
        action="store_true",
        dest="longitudinal",
        help="Specify whether to use a longitudinal design (default: False).")

    args = parser.parse_args()

    cues = ['triphones']
    outcomes = ['tokens']
    stress_marker = [True]
    boundaries = [True]
    reduced_vowels = [False]
    distances = ['correlation']
    number_of_cues = [100, 500, 1000]
    number_of_tokens = [50, 250, 500]

    if not os.path.exists(args.output_folder):
        os.makedirs(args.output_folder)

    summary_file = os.path.join(args.output_folder, "LDAk_summary.csv")
    # error_file = os.path.join(args.output_folder, "PoStagging_errors.csv")

    parametrizations = it.product(cues, outcomes, stress_marker, boundaries,
                                  reduced_vowels, distances, number_of_cues,
                                  number_of_tokens)

    time_points = np.linspace(10, 100, 10) if args.longitudinal else [100]
    rows = int(
        np.prod([
            len(cues),
            len(outcomes),
            len(stress_marker),
            len(reduced_vowels),
            len(boundaries),
            len(time_points),
            len(distances),
            len(number_of_cues),
            len(number_of_tokens)
        ]))
    summary_table = pd.DataFrame(index=np.arange(0, rows),
                                 columns=[
                                     "Corpus", "Cues", "Outcomes", "Stress",
                                     "Boundaries", "Vowels", "Precision",
                                     "Time", "Distance", "numCues",
                                     "numTokens", "Phon_acc",
                                     "Phon_acc_subset", "Phon_baseline",
                                     "Distr_acc", "Distr_acc_subset",
                                     "Distr_baseline"
                                 ])

    ii = 0
    for parametrization in parametrizations:

        print(parametrization)

        cue, outcome, stress, boundary, reduced, distance, how_many_cues, how_many_tokens = parametrization

        uniphones = True if cue == 'uniphones' else False
        diphones = True if cue == 'diphones' else False
        triphones = True if cue == 'triphones' else False
        syllables = True if cue == 'syllables' else False
        vowels = 'reduced' if reduced else 'full'
        sm = "stress" if stress else 'no-stress'
        bound = 'yes' if boundary else 'no'
        training = os.path.splitext(os.path.basename(args.corpus))[0]

        encoded_corpus = corpus_encoder(args.corpus,
                                        args.celex_folder,
                                        args.pos_mapping,
                                        separator='~',
                                        stress_marker=stress,
                                        reduced=reduced,
                                        uniphones=uniphones,
                                        diphones=diphones,
                                        triphones=triphones,
                                        syllables=syllables,
                                        outcomes=outcome,
                                        boundaries=boundary)

        corpus_dir = os.path.dirname(encoded_corpus)

        if training == 'aggregate_utterances':
            a, b = 0.001, 0.001
        else:
            a, b = 0.01, 0.01
        file_paths = ndl(encoded_corpus,
                         alpha=a,
                         beta=b,
                         lam=1,
                         longitudinal=args.longitudinal)

        celex_dict = get_celex_dictionary(args.celex_folder, reduced=reduced)

        for idx, file_path in file_paths.items():

            matrix, cues2ids, outcomes2ids = load(file_path)

            # get the column ids of all perfectly discriminated outcomes at the current time point
            # perfectly discriminated outcomes are considered to be those whose jaccard coefficient
            # between true phonetic cues and most active phonetic cues for the outcome is 1
            discriminated_file = os.path.join(
                corpus_dir, '.'.join([
                    'discriminatedOutcomes',
                    str(int(idx)), ''.join(['at', str(args.precision)]), 'json'
                ]))

            if not os.path.exists(discriminated_file):
                discriminated = find_discriminated(matrix,
                                                   cues2ids,
                                                   outcomes2ids,
                                                   celex_dict,
                                                   stress_marker=stress,
                                                   uniphones=uniphones,
                                                   diphones=diphones,
                                                   triphones=triphones,
                                                   syllables=syllables,
                                                   boundaries=boundary,
                                                   at=int(args.precision))
                json.dump(discriminated, open(discriminated_file, 'w'))
            else:
                discriminated = json.load(open(discriminated_file, 'r'))

            print()
            print(
                "The discriminated outcomes have been identified (file: %s)." %
                discriminated_file)

            accuracies = subset_experiment(matrix,
                                           discriminated,
                                           how_many_cues=how_many_cues,
                                           how_many_tokens=how_many_tokens)

            summary_table.loc[ii] = pd.Series({
                "Corpus": training,
                "Cues": cue,
                "Outcomes": outcome,
                "Stress": sm,
                "Boundaries": bound,
                "Vowels": vowels,
                "Time": int(idx),
                "Distance": distance,
                "Precision": args.precision,
                "numCues": how_many_cues,
                "numTokens": how_many_tokens,
                "Phon_acc": accuracies[0],
                "Phon_acc_subset": accuracies[1],
                "Distr_acc": accuracies[3],
                "Distr_acc_subset": accuracies[4],
                "Phon_baseline": accuracies[2],
                "Distr_baseline": accuracies[5]
            })
            ii += 1

    if os.path.exists(summary_file):
        summary_table.to_csv(summary_file,
                             sep='\t',
                             index=False,
                             mode="a",
                             header=False)
    else:
        summary_table.to_csv(summary_file, sep='\t', index=False)
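
# A minimal sketch of the subsetting assumed to happen inside subset_experiment:
# keep the how_many_cues rows and how_many_tokens columns with the largest
# variance and work on the reduced matrix. The real function also computes the
# clustering accuracies and baselines reported above, which are omitted here.
import numpy as np


def top_k_subset_sketch(weight_matrix, how_many_cues, how_many_tokens):
    # Rank cues (rows) and tokens (columns) by the variance of their weights
    # and keep the top-k of each, preserving the original ordering.
    cue_order = np.argsort(np.var(weight_matrix, axis=1))[::-1]
    token_order = np.argsort(np.var(weight_matrix, axis=0))[::-1]
    rows = np.sort(cue_order[:how_many_cues])
    cols = np.sort(token_order[:how_many_tokens])
    return weight_matrix[np.ix_(rows, cols)]
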
def main():

    parser = argparse.ArgumentParser(description="Assess whether words from the same category cluster together "
                                                 "on the basis of the sound sequences they consist of.")

    parser.add_argument("-c", "--corpus", required=True, dest="corpus",
                        help="Specify the path to the training corpus (encoded as .json).")
    parser.add_argument("-C", "--Celex_folder", required=True, dest="celex_folder",
                        help="Specify the folder where the Celex data are located.")
    parser.add_argument("-O", "--output_folder", required=True, dest="output_folder",
                        help="Specify the path of the folder where the logfiles will be stored together with"
                             "the summary tables.")
    parser.add_argument("-M", "--pos_mapping", required=True, dest="pos_mapping",
                        help="Specify the path of the file containing the mapping from CHILDES to Celex PoS tags.")
    parser.add_argument("-p", "--precision", dest="precision", default=5,
                        help="Specify the number of outcomes to consider when computing discrimination's precision.")
    parser.add_argument("-l", "--longitudinal", action="store_true", dest="longitudinal",
                        help="Specify whether to use a longitudinal design (default: False).")

    args = parser.parse_args()
    at = int(args.precision)

    if not os.path.exists(args.output_folder):
        os.makedirs(args.output_folder)
    summary_file = os.path.join(args.output_folder, "lexicalDevelopment_summary.csv")

    a, b = [0.01, 0.01]
    reduced_vowels = [False]
    boundaries = [True]
    outcomes = ['tokens']
    cues = ['triphones', 'syllables']
    stress_marker = [True]

    time_points = np.linspace(10, 100, 10) if args.longitudinal else [100]
    rows = int(np.prod([len(cues), len(outcomes), len(stress_marker), len(reduced_vowels),
                        len(boundaries), len(time_points)]))
    summary_table = pd.DataFrame(index=np.arange(0, rows),
                                 columns=["Corpus", "Boundaries", "Cues", "Outcomes", "Stress", "Vowels", "Time", "At",
                                          "Discriminated", "@".join(["Precision", str(at)]), "Jaccard@1", "Total"])

    row_id = 0
    parametrizations = it.product(reduced_vowels, boundaries, outcomes, cues, stress_marker)
    for parametrization in parametrizations:

        r, boundary, outcome, cue, marker = parametrization
        uniphones = True if cue == 'uniphones' else False
        diphones = True if cue == 'diphones' else False
        triphones = True if cue == 'triphones' else False
        syllables = True if cue == 'syllables' else False
        vowels = 'reduced' if r else 'full'
        sm = "stress" if marker else 'no-stress'
        bound = 'yes' if boundary else 'no'
        training = os.path.splitext(os.path.basename(args.corpus))[0]
        celex_dict = get_celex_dictionary(args.celex_folder, reduced=r)

        encoded_corpus = corpus_encoder(args.corpus, args.celex_folder, args.pos_mapping, separator='~',
                                        stress_marker=marker, reduced=r, outcomes=outcome, boundaries=boundary,
                                        uniphones=uniphones, diphones=diphones, triphones=triphones,
                                        syllables=syllables)

        cumulative_vocabulary = get_cumulative_vocabulary(encoded_corpus, time_points)
        print()
        print("The cumulative vocabulary for the file %s has been estimated" % encoded_corpus)
        print()

        file_paths = ndl(encoded_corpus, alpha=a, beta=b, lam=1, longitudinal=args.longitudinal)

        for idx, file_path in file_paths.items():
            matrix, cues2ids, outcomes2ids = load(file_path)

            # get the Jaccard coefficient for each outcome and select those with a coefficient of 1, meaning that the
            # model would choose all and only the correct cues when expressing the outcome; get the number of such
            # outcomes
            print()
            jaccard_coefficients = jaccard(matrix, cues2ids, outcomes2ids, celex_dict, stress_marker=marker,
                                           uniphone=uniphones, diphone=diphones, triphone=triphones, syllable=syllables,
                                           boundaries=boundary)
            jaccard_one = {}
            for token in outcomes2ids:
                if jaccard_coefficients[token] == 1:
                    jaccard_one[token] = outcomes2ids[token]
            n_jaccard = len(jaccard_one)
            print()

            # get the outcomes that are correctly discriminated given the cues they consist of: in detail, take an
            # outcome, encode it in its phonetic cues, check which outcomes are most active given such cues, check
            # whether the correct one is among the top ones (how many is indicated by the parameter 'at'); store all
            # outcomes where the correct one is among the top active ones given the cues in it
            print()
            precise = precision_at(matrix, outcomes2ids, cues2ids, celex_dict, stress_marker=marker,
                                   uniphone=uniphones, diphone=diphones, triphone=triphones, syllable=syllables,
                                   boundaries=boundary, at=at)
            n_precise = len(precise)
            print()

            # repeat but only for the outcomes with a Jaccard coefficient of 1, to quantify two-way discrimination
            print()
            discriminated = precision_at(matrix, jaccard_one, cues2ids, celex_dict, stress_marker=marker,
                                         uniphone=uniphones, diphone=diphones, triphone=triphones, syllable=syllables,
                                         boundaries=boundary, at=at)
            n_discriminated = len(discriminated)
            print()

            vocabulary_estimate = cumulative_vocabulary[int(idx)]

            summary_table.loc[row_id] = pd.Series({"Corpus": training, "Cues": cue, "Outcomes": outcome, "Stress": sm,
                                                   "Boundaries": bound, "Vowels": vowels, "Time": int(idx), "At": at,
                                                   "Discriminated": n_discriminated, "Total": vocabulary_estimate,
                                                   "@".join(["Precision", str(at)]): n_precise, "Jaccard@1": n_jaccard})
            row_id += 1

    if os.path.exists(summary_file):
        summary_table.to_csv(summary_file, sep='\t', index=False, mode="a",
                             header=False)
    else:
        summary_table.to_csv(summary_file, sep='\t', index=False)
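
# A minimal sketch of the Jaccard coefficient used above to decide whether an
# outcome is perfectly discriminated: the overlap between the set of phonetic
# cues an outcome truly consists of and the set of cues most active for it.
# The project's jaccard function computes this from the full association
# matrix; this helper only shows the set comparison for a single outcome.
def jaccard_coefficient_sketch(true_cues, most_active_cues):
    true_cues, most_active_cues = set(true_cues), set(most_active_cues)
    union = true_cues | most_active_cues
    return len(true_cues & most_active_cues) / len(union) if union else 0.0
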
def tag_words(input_file,
              test_set,
              celex_dir,
              pos_mapping,
              output_folder,
              method='freq',
              evaluation='count',
              k=50,
              flush=0,
              threshold=0,
              separator='~',
              reduced=False,
              outcomes='tokens',
              boundaries=True,
              uniphones=True,
              diphones=False,
              triphones=False,
              syllable=False,
              stress_marker=False,
              alpha=0.01,
              beta=0.01,
              lam=1.0,
              longitudinal=False,
              at=5):
    """
    :param input_file:          a .json file containing transcripts of child-caregiver interactions extracted from the
                                CHILDES database. The json file consists of two lists of lists, of the same length,
                                both contain utterances but encoded differently. The first encodes each utterance as a
                                list of tokens; the second encodes each utterance as a list of lemmas and
                                Part-of-Speech tags, joined by a vertical bar ('|')
    :param test_set:            a dictionary mapping the file name to:
                                - 'test_set['filename']: the basename of the file
                                - 'test_set['items']: the set of phonological forms to be categorized, complete with the
                                   target PoS tag (phonological form and PoS tag are separated by a vertical bar ('|'))
    :param celex_dir:           a string specifying the path to the Celex directory
    :param pos_mapping:         a .txt file mapping CHILDES PoS tags to CELEX tags
    :param output_folder:       the path to the folder where the logfiles will be saved
    :param method:              a string indicating the way in which the function looks at top active outcomes; two
                                options are available:
                                - 'freq' makes the function compute the distribution of PoS tags over the k top active
                                    nodes (see the explanation of the parameter k) and rank PoS tags according to their
                                    frequency among the top active cues
                                - 'sum' makes the function compute the sum of activation from all outcomes belonging to
                                    a given PoS tag within the k top active outcomes given the input cues, and rank PoS
                                     tags according to their total activation among the top active cues
    :param evaluation:          a string indicating how to compare baseline activations to item-triggered ones; two
                                options are available:
                                - 'count', simply tag the test item with the PoS tag that either was more frequent or
                                    had highest summed activation within the top active outcomes; frequency or
                                    activation are returned and can be correlated to reaction times
                                - 'distr', compare the frequency counts or summed activations generated by a specific
                                    test item to the frequency counts or summed activations at baseline and tag the
                                    test item with the PoS tag receiving highest support by the change in the
                                    distribution of frequencies or summed activations (a statistic is returned,
                                    Chi-squared for frequency distributions and t-test for summed activations, whose
                                    value can be correlated to reaction times)
    :param k:                   an integer specifying how many elements to consider from the baseline activations and
                                the activations triggered by a specific test item. By default, the top 50 outcomes are
                                considered, and compared according to the chosen combination of method and evaluation
    :param flush:               specify whether (and how many) top active outcomes at baseline to flush away from
                                subsequent computations. It may be the case that whatever the input cues, the same high
                                frequency outcomes come out as being the most active. It may then make sense to not
                                consider them when evaluating the distribution of lexical categories over the most
                                active outcomes given an input item
    :param threshold:           the minimum activation an outcome must have to be considered in the list of top
                                activated neighbors; the default is 0, which shouldn't be lowered but can be
                                increased
    :param separator:           the character that separates the word baseform from its PoS tag in the input corpus
    :param reduced:             a boolean specifying whether reduced phonological forms should be extracted from Celex
                                whenever possible (if set to True) or if standard phonological forms should be
                                preserved (if False)
    :param outcomes:            a string indicating which outcomes to use, whether 'tokens' (default) or 'lemmas'
    :param boundaries:          a boolean specifying whether word boundaries are to be considered when training on full
                                utterances
    :param uniphones:           a boolean indicating whether single phonemes are to be considered while
                                encoding input utterances
    :param diphones:            a boolean indicating whether sequences of two phonemes are to be considered
                                while encoding input utterances
    :param triphones:           a boolean indicating whether sequences of three phonemes are to be considered
                                while encoding input utterances
    :param syllable:            a boolean indicating whether syllables are to be considered while encoding input
                                utterances
    :param stress_marker:       a boolean indicating whether stress markers from the phonological representations of
                                Celex need to be preserved or can be discarded
    :param alpha:               a number indicating cue salience. For simplicity, we assume that every cue has
                                the same salience, so changing the value of this parameter does not affect the
                                relative strength of cue-outcome associations but only their absolute magnitude
    :param beta:                a number indicating the learning rate for positive and negative situations. Again, we
                                make the simplifying assumption that our simulated learners are equally affected by
                                positive and negative feedback. Changing the beta value can have a significant impact
                                on the learning outcome, but 0.1 is a standard choice for this model. If the number of
                                learning trials or the number of different cues in a learning trial are very large,
                                both beta and alpha need to be lowered considerably
    :param lam:                 the maximum amount of association that an outcome can receive from all the
                                cues. It simply acts as a scaling factor, so changing its value has the same
                                effect as changing alpha
    :param longitudinal:        a boolean specifying whether to work in a longitudinal setting or not
    :param at:                  the number of top active outcomes to consider to compute precision
    :return log_dicts:          a dictionary mapping the detailed categorization log produced for the test set
                                to each time index (1 if the longitudinal parameter is set to False, 10 if it's
                                set to True)
    :return accuracies:         a dictionary mapping the categorization accuracy on the PoS tagging experiment
                                to each time index (1 if the longitudinal parameter is set to False, 10 if it's
                                set to True)
    :return entropies:          a dictionary mapping the normalized entropy of the distribution of the PoS tags
                                assigned by the model to each time index (1 if the longitudinal parameter is set to
                                False, 10 if it's set to True)
    :return most_frequents:     a dictionary mapping the PoS tag that was applied the most by the model to each time
                                index (1 if the longitudinal parameter is set to False, 10 if it's set to True)
    :return frequencies:        a dictionary mapping the frequency count of the most frequent PoS tag applied by the
                                model, to each time index (1 if the longitudinal parameter is set to False, 10 if it's
                                set to True)
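
    Example (all values below are hypothetical and purely illustrative):

        test_set = {'filename': 'test_items.txt',
                    'items': {'d6g|N', 'kQt|N', 'rVn|V'}}

        Suppose that, for a given test item with k=3, the three most active outcomes are dog|N
        (activation 0.8), dot|N (0.5) and dig|V (0.4): with method='freq' the tag N wins because
        two of the three outcomes are nouns, while with method='sum' N wins because 0.8 + 0.5 > 0.4.
        With evaluation='count' the item is simply tagged N; with evaluation='distr' the counts (or
        summed activations) are first compared to the baseline distribution before the winning tag
        is chosen.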
    """

    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    encoded_corpus = corpus_encoder(input_file,
                                    celex_dir,
                                    pos_mapping,
                                    separator=separator,
                                    uniphones=uniphones,
                                    diphones=diphones,
                                    triphones=triphones,
                                    syllables=syllable,
                                    stress_marker=stress_marker,
                                    reduced=reduced,
                                    outcomes=outcomes,
                                    boundaries=boundaries)

    file_paths = ndl(encoded_corpus,
                     alpha=alpha,
                     beta=beta,
                     lam=lam,
                     longitudinal=longitudinal)

    # for each test item, retrieve the outcomes from the weight matrix that are most activated given the cues
    # in the item, get the PoS tag that is most represented among the most active lexical nodes, and check
    # whether the predicted PoS tag matches the gold-standard one provided with the test item. Return a global
    # score indicating the accuracy on the test set.
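
    # A hedged sketch of the per-item computation that categorize() carries out (variable names below are
    # purely illustrative, not the ones used in the implementation); cues index the rows of the weight matrix
    # and outcomes index its columns:
    #   cue_ids = [cues2ids[c] for c in cues_of(test_item) if c in cues2ids]
    #   activations = matrix[cue_ids, :].sum(axis=0)     # one activation value per outcome
    #   top_k = activations.argsort()[::-1][:k]          # column ids of the k most active outcomes
    #   the winning PoS tag is then chosen among these outcomes, via 'freq' or 'sum' (see the docstring)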
    accuracies = {}
    entropies = {}
    most_frequents = {}
    frequencies = {}
    log_dicts = {}

    celex_dict = get_celex_dictionary(celex_dir, reduced=reduced)

    for idx, file_path in file_paths.items():

        logfile = make_log_file(input_file,
                                test_set['filename'],
                                output_folder,
                                method,
                                evaluation,
                                flush,
                                k,
                                at,
                                idx,
                                reduced=reduced,
                                uniphones=uniphones,
                                diphones=diphones,
                                triphones=triphones,
                                syllables=syllable,
                                stress_marker=stress_marker,
                                outcomes=outcomes,
                                boundaries=boundaries)

        if os.path.exists(logfile):
            print()
            print(
                "The file %s already exists: statistics for the corresponding parametrization are loaded from it"
                % logfile)
            with open(logfile, "r") as f:
                log_dict = json.load(f)

        else:
            print()
            matrix, cues2ids, outcomes2ids = load(file_path)

            # get the column ids of all perfectly discriminated outcomes at the current time point;
            # perfectly discriminated outcomes are considered to be those:
            # - whose Jaccard coefficient between true phonetic cues and the most active phonetic cues for the
            #   outcome is 1
            # - and that appear in the top active outcomes given the cues they consist of
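            # (the Jaccard coefficient between two cue sets A and B is |A & B| / |A | B|; in Python terms,
            # roughly len(true_cues & active_cues) / len(true_cues | active_cues), so a value of 1 means the
            # two sets coincide exactly; true_cues and active_cues are illustrative names)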
            corpus_folder = os.path.dirname(encoded_corpus)
            discriminated_file = os.path.join(
                corpus_folder,
                '.'.join(['discriminatedOutcomes',
                          str(int(idx)), 'json']))
            if not os.path.exists(discriminated_file):
                discriminated = find_discriminated(matrix,
                                                   cues2ids,
                                                   outcomes2ids,
                                                   celex_dict,
                                                   stress_marker=stress_marker,
                                                   uniphones=uniphones,
                                                   diphones=diphones,
                                                   triphones=triphones,
                                                   syllables=syllable,
                                                   boundaries=boundaries,
                                                   at=at)
            else:
                with open(discriminated_file, 'r') as f:
                    discriminated = json.load(f)

            print()
            print(
                strftime("%Y-%m-%d %H:%M:%S") +
                ": Start test phase, using %s as weight matrix and %s as test set..."
                % (os.path.basename(file_path),
                   os.path.basename(test_set['filename'])))

            log_dict = categorize(test_set['items'],
                                  matrix,
                                  cues2ids,
                                  discriminated,
                                  method=method,
                                  evaluation=evaluation,
                                  flush=flush,
                                  k=k,
                                  threshold=threshold,
                                  stress_marker=stress_marker,
                                  syllables=syllable,
                                  uniphones=uniphones,
                                  diphones=diphones,
                                  triphones=triphones,
                                  boundaries=boundaries)
            with open(logfile, 'w') as f:
                json.dump(log_dict, f)

            print(strftime("%Y-%m-%d %H:%M:%S") + ": ...completed test phase.")

        f1, h, pos, freq = compute_summary_statistics(log_dict)

        accuracies[idx] = f1
        entropies[idx] = h
        most_frequents[idx] = pos
        frequencies[idx] = freq
        log_dicts[idx] = log_dict

        print("Accuracy: %0.5f" % f1)
        print()

    return log_dicts, accuracies, entropies, most_frequents, frequencies
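

# A hedged sketch of how the returned dictionaries might be inspected after the experiment (the names below
# simply mirror the return values documented in the docstring; keys are time indices, a single one in the
# non-longitudinal setting and ten in the longitudinal one):
#
#   log_dicts, accuracies, entropies, most_frequents, frequencies = ...
#   for idx in sorted(accuracies):
#       print(idx, accuracies[idx], entropies[idx], most_frequents[idx], frequencies[idx])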