def main():
    """
    Command-line entry point: compute the variance of every phonological cue and of every token.

    For each parametrization the corpus is encoded with ``corpus_encoder``, distributional
    statistics are computed with ``usf.compute_distributional_predictors`` and the weight matrices
    returned by ``ndl`` are loaded one time point at a time. For every matrix the discriminated
    outcomes are looked up (or computed with ``find_discriminated`` and cached as a .json file next
    to the encoded corpus); cue and token variances are then obtained from ``usf.get_cue_variances``
    and ``usf.get_token_variances`` and stored, together with the distributional statistics, in two
    tables. The tables are written as tab-separated files in the output folder, appending (without
    header) when the files already exist.
    """

    parser = argparse.ArgumentParser(
        description="Compute the variance of each phonological cue and token, as a proxy "
                    "to identify the amount of information they carry")
    parser.add_argument("-c", "--corpus", required=True, dest="corpus",
                        help="Specify the path to the training corpus (encoded as .json).")
    parser.add_argument("-C", "--celex_folder", required=True, dest="celex_folder",
                        help="Specify the folder where the Celex data are located.")
    parser.add_argument("-O", "--output_folder", required=True, dest="output_folder",
                        help="Specify the path of the folder where the logfiles will be stored together with "
                             "the summary tables.")
    parser.add_argument("-M", "--pos_mapping", required=True, dest="pos_mapping",
                        help="Specify the path of the file containing the mapping from CHILDES to Celex PoS tags.")
    # type=int makes the default (5) and a user-supplied value share one type: without it the
    # default is an int while a command-line value is a str, and building file names from the
    # int default raised a TypeError
    parser.add_argument("-p", "--precision", dest="precision", default=5, type=int,
                        help="Specify the number of outcomes to consider when computing discrimination's precision.")
    parser.add_argument("-l", "--longitudinal", action="store_true", dest="longitudinal",
                        help="Specify whether to use a longitudinal design (default: False).")

    args = parser.parse_args()
    precision_tag = 'at' + str(args.precision)

    cues = ['triphones']
    outcomes = ['tokens']
    stress_marker = [True]
    boundaries = [True]
    reduced_vowels = [False]
    number_of_cues = [100, 1000]

    if not os.path.exists(args.output_folder):
        os.makedirs(args.output_folder)

    cues_variance_file = os.path.join(args.output_folder, "words_cues_variance.csv")
    tokens_variance_file = os.path.join(args.output_folder, "words_tokens_variance.csv")

    parametrizations = it.product(cues, outcomes, stress_marker, boundaries, reduced_vowels)
    time_points = np.linspace(10, 100, 10) if args.longitudinal else [100]

    # NOTE(review): the rows stored below also carry a "Boundaries" entry, which is not among the
    # columns of either table; pandas presumably drops it when aligning the row to the columns -
    # confirm whether a "Boundaries" column should be added (doing so would change the layout of
    # files that are appended to).
    cues_table = pd.DataFrame(index=[], columns=[
        "Corpus", "Cues", "Outcomes", "Stress", "Vowels", "Precision", "Time",
        "Phonological_cue", "Variance", "Frequency", "Lexical_diversity", "Phonological_diversity",
        "Cue|Cues_predictability", "Cue|Tokens_predictability",
        "Cues|Cue_predictability", "Tokens|Cue_predictability"
    ])
    tokens_table = pd.DataFrame(index=[], columns=[
        "Corpus", "Cues", "Outcomes", "Stress", "Vowels", "Precision", "Time", "numCues",
        'Token', "Variance", "Frequency", "Lexical_diversity", "Phonological_diversity",
        "Token|Tokens_predictability", "Token|Cues_predictability",
        "Tokens|Token_predictability", "Cues|Token_predictability"
    ])

    ii = 0  # next row of cues_table
    jj = 0  # next row of tokens_table
    for parametrization in parametrizations:

        print(parametrization)

        cue_type, outcome, stress, boundary, reduced = parametrization
        uniphones = cue_type == 'uniphones'
        diphones = cue_type == 'diphones'
        triphones = cue_type == 'triphones'
        syllables = cue_type == 'syllables'
        vowels = 'reduced' if reduced else 'full'
        sm = "stress" if stress else 'no-stress'
        bound = 'yes' if boundary else 'no'
        training = os.path.splitext(os.path.basename(args.corpus))[0]

        encoded_corpus = corpus_encoder(args.corpus, args.celex_folder, args.pos_mapping, separator='~',
                                        stress_marker=stress, reduced=reduced, uniphones=uniphones,
                                        diphones=diphones, triphones=triphones, syllables=syllables,
                                        outcomes=outcome, boundaries=boundary)

        print(strftime("%Y-%m-%d %H:%M:%S") + ": Started computing distributional statistics from the corpus...")
        token_statistics, cue_statistics = usf.compute_distributional_predictors(encoded_corpus, time_points)
        print(strftime("%Y-%m-%d %H:%M:%S") + ": ...finished computing distributional statistics from the corpus.")
        print()

        corpus_dir = os.path.dirname(encoded_corpus)

        a, b = [0.001, 0.001] if training == 'aggregate_utterances' else [0.01, 0.01]
        file_paths = ndl(encoded_corpus, alpha=a, beta=b, lam=1, longitudinal=args.longitudinal)

        celex_dict = get_celex_dictionary(args.celex_folder, reduced=reduced)

        for idx, file_path in file_paths.items():

            idx = int(idx)
            matrix, cues2ids, outcomes2ids = load(file_path)

            # get the column ids of all perfectly discriminated outcomes at the current time point
            # perfectly discriminated outcomes are considered to be those whose jaccard coefficient
            # between true phonetic cues and most active phonetic cues for the outcome is 1
            discriminated_file = os.path.join(
                corpus_dir, '.'.join(['discriminatedOutcomes', str(idx), precision_tag, 'json']))

            if not os.path.exists(discriminated_file):
                # pass the values of the current parametrization (stress, boundary), not the
                # lists of candidate values they were drawn from
                discriminated = find_discriminated(matrix, cues2ids, outcomes2ids, celex_dict,
                                                   stress_marker=stress, uniphones=uniphones,
                                                   diphones=diphones, triphones=triphones,
                                                   syllables=syllables, boundaries=boundary,
                                                   at=args.precision)
                with open(discriminated_file, 'w') as handle:
                    json.dump(discriminated, handle)
            else:
                with open(discriminated_file, 'r') as handle:
                    discriminated = json.load(handle)

            print()
            print(strftime("%Y-%m-%d %H:%M:%S") +
                  ": The discriminated outcomes have been identified (file: %s)." % discriminated_file)
            print()

            row_variances, matrix, discriminated = usf.get_cue_variances(matrix, discriminated)
            cue_variances = {cue: row_variances[cues2ids[cue]] for cue in cues2ids}

            print(strftime("%Y-%m-%d %H:%M:%S") + ": Started storing cue variances...")
            for cue in cue_variances:
                stats = cue_statistics[cue]
                # NOTE(review): only cues whose frequency record has exactly 10 entries are stored
                # (presumably one entry per longitudinal time point - confirm); the row is written
                # inside this check because the statistics are only defined here
                if len(stats['freq']) == 10:
                    cues_table.loc[ii] = pd.Series({
                        "Corpus": training,
                        "Cues": cue_type,
                        "Outcomes": outcome,
                        "Stress": sm,
                        "Boundaries": bound,
                        "Vowels": vowels,
                        "Time": idx,
                        "Precision": args.precision,
                        "Phonological_cue": cue,
                        "Variance": cue_variances[cue],
                        "Frequency": stats['freq'][idx],
                        "Lexical_diversity": stats['lexdiv'][idx],
                        "Phonological_diversity": stats['phondiv'][idx],
                        # average conditional probability of a cue given the co-occurring cues
                        "Cue|Cues_predictability": stats['p_cue_cues'][idx],
                        # average predictive power of a cue with respect to all the co-occurring cues
                        "Cues|Cue_predictability": stats['p_cues_cue'][idx],
                        # average conditional probability of a cue given the co-occurring tokens
                        "Cue|Tokens_predictability": stats['p_cue_tokens'][idx],
                        # average predictive power of a cue with respect to all the co-occurring tokens
                        "Tokens|Cue_predictability": stats['p_tokens_cue'][idx]
                    })
                    ii += 1
            print(strftime("%Y-%m-%d %H:%M:%S") + ": ...finished storing cue variances.")
            print()

            for how_many_cues in number_of_cues:

                print("Number of cues: ", how_many_cues)
                token_variances = usf.get_token_variances(matrix, discriminated, row_variances,
                                                          how_many_cues=how_many_cues)

                print(strftime("%Y-%m-%d %H:%M:%S") + ": Started storing token variances...")
                for token in token_variances:
                    stats = token_statistics[token]
                    # same completeness check as for the cues above
                    if len(stats['freq']) == 10:
                        tokens_table.loc[jj] = pd.Series({
                            "Corpus": training,
                            "Cues": cue_type,
                            "Outcomes": outcome,
                            "Stress": sm,
                            "Boundaries": bound,
                            "Vowels": vowels,
                            "Time": idx,
                            "numCues": how_many_cues,
                            "Precision": args.precision,
                            "Token": token,
                            "Variance": token_variances[token],
                            "Frequency": stats['freq'][idx],
                            "Lexical_diversity": stats['lexdiv'][idx],
                            "Phonological_diversity": stats['phondiv'][idx],
                            # average conditional probability of a token given the co-occurring tokens
                            "Token|Tokens_predictability": stats['p_token_tokens'][idx],
                            # average predictive power of a token with respect to the co-occurring tokens
                            "Tokens|Token_predictability": stats['p_tokens_token'][idx],
                            # average conditional probability of a token given the co-occurring phonological cues
                            "Token|Cues_predictability": stats['p_token_cues'][idx],
                            # average predictive power of a token with respect to the co-occurring phonological cues
                            "Cues|Token_predictability": stats['p_cues_token'][idx]
                        })
                        jj += 1
                print(strftime("%Y-%m-%d %H:%M:%S") + ": ...finished storing token variances.")

                # visual separator closing one value of how_many_cues
                print()
                print('-' * 100)
                print()

            # visual separator closing one time point
            print()
            print()
            print('=' * 100)
            print('=' * 100)
            print()
            print()

        # visual separator closing one parametrization
        print()
        print()
        print()
        print('#' * 100)
        print('#' * 100)
        print('#' * 100)
        print()
        print()
        print()

    # append to the summary files when they already exist, otherwise create them with a header
    if os.path.exists(cues_variance_file):
        cues_table.to_csv(cues_variance_file, sep='\t', index=False, mode="a", header=False)
    else:
        cues_table.to_csv(cues_variance_file, sep='\t', index=False)

    if os.path.exists(tokens_variance_file):
        tokens_table.to_csv(tokens_variance_file, sep='\t', index=False, mode="a", header=False)
    else:
        tokens_table.to_csv(tokens_variance_file, sep='\t', index=False)
def main():
    """
    Command-line entry point: run the threshold experiment and store a summary table.

    For each parametrization the corpus is encoded with ``corpus_encoder`` and the weight matrices
    returned by ``ndl`` are loaded in order of time index. For every matrix the discriminated
    outcomes are looked up (or computed with ``find_discriminated`` and cached as a .json file next
    to the encoded corpus), then ``threshold_experiment`` is run using the cue and token thresholds
    selected from the hard-coded ``thresholds`` table. One row per matrix is stored in a summary
    table, which is written as a tab-separated file ("LDAt_summary.csv") in the output folder,
    appending (without header) when the file already exists.

    The ``thresholds`` table only has entries for the corpora 'aggregate_utterances' and
    'aggregate_words', precisions 5 and 25, and thresholds 'low' and 'high': any other combination
    raises a KeyError.
    """

    parser = argparse.ArgumentParser(
        description="Assess whether words from the same category cluster together "
                    "first considering their sound patterns and then how they correlate "
                    "to each other based on their contexts of occurrence")
    parser.add_argument("-c", "--corpus", required=True, dest="corpus",
                        help="Specify the path to the training corpus (encoded as .json).")
    parser.add_argument("-C", "--celex_folder", required=True, dest="celex_folder",
                        help="Specify the folder where the Celex data are located.")
    parser.add_argument("-O", "--output_folder", required=True, dest="output_folder",
                        help="Specify the path of the folder where the logfiles will be stored together with "
                             "the summary tables.")
    parser.add_argument("-M", "--pos_mapping", required=True, dest="pos_mapping",
                        help="Specify the path of the file containing the mapping from CHILDES to Celex PoS tags.")
    # type=int makes the default (5) and a user-supplied value share one type: without it the
    # default is an int while a command-line value is a str, and building dictionary keys and
    # file names from the int default raised a TypeError
    parser.add_argument("-p", "--precision", dest="precision", default=5, type=int,
                        help="Specify the number of outcomes to consider when computing discrimination's precision.")
    parser.add_argument("--cue_threshold", dest="cue_threshold", default='high',
                        help="Specify whether to choose a 'high' (i.e. strict) the threshold on relevant cues or "
                             "a low (i.e. lax) one.")
    parser.add_argument("--token_threshold", dest="token_threshold", default='low',
                        help="Specify whether to choose a 'high' (i.e. strict) the threshold on relevant tokens or "
                             "a low (i.e. lax) one.")
    parser.add_argument("-l", "--longitudinal", action="store_true", dest="longitudinal",
                        help="Specify whether to use a longitudinal design (default: False).")

    args = parser.parse_args()
    precision_tag = 'at' + str(args.precision)

    # thresholds have been determined manually according to the following criteria:
    # - high thresholds were set to yield around 100 dimensions at time t100
    # - low thresholds were set to yield around 100 dimensions at time t10
    # Whenever a threshold value didn't yield any dimension because the value was too stringent, the threshold was
    # lowered until at least 1 dimension was available at all time points. First, I adjusted the threshold on
    # phonological cues' variance, then on tokens' variance. Practically, it was always possible to set thresholds to
    # yield around 100 dimensions at the specified time points, except for tokens' variance in the low variance
    # setting, where the threshold yielding around 100 dimensions at t10 quickly left the model without any dimension.
    # Therefore, these models start with considerably high dimensionalities, and finish with almost no dimensions.
    thresholds = {
        'aggregate_utterances_at5_c_low': 0.00000001,     # 99 cues at t10 (799 at t100)
        'aggregate_utterances_at5_c_high': 0.00000075,    # 105 cues at t100 (2 cues at t10)
        'aggregate_utterances_at5_t_low': 0.02,           # 741 tokens at t10 (3 tokens at t100)
        'aggregate_utterances_at5_t_high': 0.04,          # 125 tokens at 100 (843 tokens at t10)

        'aggregate_utterances_at25_c_low': 0.000000005,   # 98 cues at t10 (802 cues at t100)
        'aggregate_utterances_at25_c_high': 0.00000025,   # 156 cues at t100 (2 cues at t10)
        'aggregate_utterances_at25_t_low': 0.015,         # 1632 tokens at t10 (19 tokens at t100)
        'aggregate_utterances_at25_t_high': 0.033,        # 140 tokens at t100 (1731 at t10)

        'aggregate_words_at5_c_low': 0.000025,            # 101 cues at t10 (318 cues at t100)
        'aggregate_words_at5_c_high': 0.00005,            # 131 cues at t100 (40 cues at t10)
        'aggregate_words_at5_t_low': 0.0325,              # 834 tokens at t10 (3 tokens at t100)
        'aggregate_words_at5_t_high': 0.07,               # 95 tokens at t100 (1004 tokens at t10)

        'aggregate_words_at25_c_low': 0.0000075,          # 117 cues at t10 (416 cues at t100)
        'aggregate_words_at25_c_high': 0.00002,           # 124 cues at t100 (33 cues at t10)
        'aggregate_words_at25_t_low': 0.0295,             # 1850 tokens at t10 (7 tokens at t100)
        'aggregate_words_at25_t_high': 0.085              # 113 tokens at t100 (2581)
    }

    cues = ['triphones']
    outcomes = ['tokens']
    stress_marker = [True]
    boundaries = [True]
    reduced_vowels = [False]
    distances = ['correlation']

    if not os.path.exists(args.output_folder):
        os.makedirs(args.output_folder)

    summary_file = os.path.join(args.output_folder, "LDAt_summary.csv")
    # error_file = os.path.join(args.output_folder, "PoStagging_errors.csv")

    parametrizations = it.product(cues, outcomes, stress_marker, boundaries, reduced_vowels, distances)
    time_points = np.linspace(10, 100, 10) if args.longitudinal else [100]

    # one row for every combination of parametrization and time point
    rows = int(np.prod([len(cues), len(outcomes), len(stress_marker), len(reduced_vowels),
                        len(boundaries), len(time_points), len(distances)]))

    # NOTE(review): the rows stored below also carry a "Boundaries" entry, which is not among the
    # columns of the table; pandas presumably drops it when aligning the row to the columns -
    # confirm whether a "Boundaries" column should be added (doing so would change the layout of
    # files that are appended to).
    summary_table = pd.DataFrame(index=np.arange(0, rows), columns=[
        "Corpus", "Cues", "Outcomes", "Stress", "Vowels", "Precision", "Time", "Distance",
        "tCues", "numCues", "tTokens", "numTokens",
        "Phon_acc", "Phon_acc_subset", "Phon_baseline",
        "Distr_acc", "Distr_acc_subset", "Distr_baseline"
    ])

    ii = 0  # next row of summary_table
    for parametrization in parametrizations:

        cue, outcome, stress, boundary, reduced, distance = parametrization
        uniphones = cue == 'uniphones'
        diphones = cue == 'diphones'
        triphones = cue == 'triphones'
        syllables = cue == 'syllables'
        vowels = 'reduced' if reduced else 'full'
        sm = "stress" if stress else 'no-stress'
        bound = 'yes' if boundary else 'no'
        training = os.path.splitext(os.path.basename(args.corpus))[0]

        encoded_corpus = corpus_encoder(args.corpus, args.celex_folder, args.pos_mapping, separator='~',
                                        stress_marker=stress, reduced=reduced, uniphones=uniphones,
                                        diphones=diphones, triphones=triphones, syllables=syllables,
                                        outcomes=outcome, boundaries=boundary)

        corpus_dir = os.path.dirname(encoded_corpus)

        # NOTE(review): the original code carried the remark "precision at 25" at this point; its
        # intent is unclear because the thresholds are looked up for whichever precision was
        # requested on the command line
        a, b = [0.001, 0.001] if training == 'aggregate_utterances' else [0.01, 0.01]
        c = thresholds['_'.join([training, precision_tag, 'c', args.cue_threshold])]
        t = thresholds['_'.join([training, precision_tag, 't', args.token_threshold])]

        file_paths = ndl(encoded_corpus, alpha=a, beta=b, lam=1, longitudinal=args.longitudinal)

        celex_dict = get_celex_dictionary(args.celex_folder, reduced=reduced)

        for idx, file_path in sorted(file_paths.items(), key=operator.itemgetter(0)):

            matrix, cues2ids, outcomes2ids = load(file_path)

            # get the column ids of all perfectly discriminated outcomes at the current time point
            # perfectly discriminated outcomes are considered to be those whose jaccard coefficient
            # between true phonetic cues and most active phonetic cues for the outcome is 1
            discriminated_file = os.path.join(
                corpus_dir, '.'.join(['discriminatedOutcomes', str(int(idx)), precision_tag, 'json']))

            if not os.path.exists(discriminated_file):
                # pass the values of the current parametrization (stress, boundary), not the
                # lists of candidate values they were drawn from, and the precision as an int
                discriminated = find_discriminated(matrix, cues2ids, outcomes2ids, celex_dict,
                                                   stress_marker=stress, uniphones=uniphones,
                                                   diphones=diphones, triphones=triphones,
                                                   syllables=syllables, boundaries=boundary,
                                                   at=args.precision)
                with open(discriminated_file, 'w') as handle:
                    json.dump(discriminated, handle)
            else:
                with open(discriminated_file, 'r') as handle:
                    discriminated = json.load(handle)

            print()
            print("The discriminated outcomes have been identified (file: %s)." % discriminated_file)

            accuracies = threshold_experiment(matrix, discriminated, cues_threshold=c, tokens_threshold=t)

            summary_table.loc[ii] = pd.Series({
                "Corpus": training,
                "Cues": cue,
                "Outcomes": outcome,
                "Stress": sm,
                "Boundaries": bound,
                "Vowels": vowels,
                "Time": int(idx),
                "Distance": distance,
                "Precision": args.precision,
                "tCues": args.cue_threshold,
                "numCues": accuracies[3],
                "tTokens": args.token_threshold,
                "numTokens": accuracies[7],
                "Phon_acc": accuracies[0],
                "Phon_acc_subset": accuracies[1],
                "Distr_acc": accuracies[4],
                "Distr_acc_subset": accuracies[5],
                "Phon_baseline": accuracies[2],
                "Distr_baseline": accuracies[6]
            })
            ii += 1

    # append to the summary file when it already exists, otherwise create it with a header
    if os.path.exists(summary_file):
        summary_table.to_csv(summary_file, sep='\t', index=False, mode="a", header=False)
    else:
        summary_table.to_csv(summary_file, sep='\t', index=False)
def main():
    """
    Command-line entry point: run the subset experiment and store a summary table.

    The parametrizations combine cue type, outcome type, stress, boundaries, vowel reduction and
    distance with the number of cues and the number of tokens to consider. For each of them the
    corpus is encoded with ``corpus_encoder`` and the weight matrices returned by ``ndl`` are
    loaded one time point at a time. For every matrix the discriminated outcomes are looked up (or
    computed with ``find_discriminated`` and cached as a .json file next to the encoded corpus),
    then ``subset_experiment`` is run. One row per matrix is stored in a summary table, which is
    written as a tab-separated file ("LDAk_summary.csv") in the output folder, appending (without
    header) when the file already exists.
    """

    parser = argparse.ArgumentParser(
        description="Assess whether words from the same category cluster together "
                    "first considering their sound patterns and then how they correlate "
                    "to each other based on their contexts of occurrence")
    parser.add_argument("-c", "--corpus", required=True, dest="corpus",
                        help="Specify the path to the training corpus (encoded as .json).")
    parser.add_argument("-C", "--celex_folder", required=True, dest="celex_folder",
                        help="Specify the folder where the Celex data are located.")
    parser.add_argument("-O", "--output_folder", required=True, dest="output_folder",
                        help="Specify the path of the folder where the logfiles will be stored together with "
                             "the summary tables.")
    parser.add_argument("-M", "--pos_mapping", required=True, dest="pos_mapping",
                        help="Specify the path of the file containing the mapping from CHILDES to Celex PoS tags.")
    # type=int makes the default (5) and a user-supplied value share one type: without it the
    # default is an int while a command-line value is a str, and building file names from the
    # int default raised a TypeError
    parser.add_argument("-p", "--precision", dest="precision", default=5, type=int,
                        help="Specify the number of outcomes to consider when computing discrimination's precision.")
    parser.add_argument("-l", "--longitudinal", action="store_true", dest="longitudinal",
                        help="Specify whether to use a longitudinal design (default: False).")

    args = parser.parse_args()
    precision_tag = 'at' + str(args.precision)

    cues = ['triphones']
    outcomes = ['tokens']
    stress_marker = [True]
    boundaries = [True]
    reduced_vowels = [False]
    distances = ['correlation']
    number_of_cues = [100, 500, 1000]
    number_of_tokens = [50, 250, 500]

    if not os.path.exists(args.output_folder):
        os.makedirs(args.output_folder)

    summary_file = os.path.join(args.output_folder, "LDAk_summary.csv")
    # error_file = os.path.join(args.output_folder, "PoStagging_errors.csv")

    parametrizations = it.product(cues, outcomes, stress_marker, boundaries, reduced_vowels,
                                  distances, number_of_cues, number_of_tokens)
    time_points = np.linspace(10, 100, 10) if args.longitudinal else [100]

    # one row for every combination of parametrization and time point
    rows = int(np.prod([len(cues), len(outcomes), len(stress_marker), len(reduced_vowels),
                        len(boundaries), len(time_points), len(distances),
                        len(number_of_cues), len(number_of_tokens)]))

    # NOTE(review): the rows stored below also carry a "Boundaries" entry, which is not among the
    # columns of the table; pandas presumably drops it when aligning the row to the columns -
    # confirm whether a "Boundaries" column should be added (doing so would change the layout of
    # files that are appended to).
    summary_table = pd.DataFrame(index=np.arange(0, rows), columns=[
        "Corpus", "Cues", "Outcomes", "Stress", "Vowels", "Precision", "Time", "Distance",
        "numCues", "numTokens",
        "Phon_acc", "Phon_acc_subset", "Phon_baseline",
        "Distr_acc", "Distr_acc_subset", "Distr_baseline"
    ])

    ii = 0  # next row of summary_table
    for parametrization in parametrizations:

        print(parametrization)

        cue, outcome, stress, boundary, reduced, distance, how_many_cues, how_many_tokens = parametrization
        uniphones = cue == 'uniphones'
        diphones = cue == 'diphones'
        triphones = cue == 'triphones'
        syllables = cue == 'syllables'
        vowels = 'reduced' if reduced else 'full'
        sm = "stress" if stress else 'no-stress'
        bound = 'yes' if boundary else 'no'
        training = os.path.splitext(os.path.basename(args.corpus))[0]

        encoded_corpus = corpus_encoder(args.corpus, args.celex_folder, args.pos_mapping, separator='~',
                                        stress_marker=stress, reduced=reduced, uniphones=uniphones,
                                        diphones=diphones, triphones=triphones, syllables=syllables,
                                        outcomes=outcome, boundaries=boundary)

        corpus_dir = os.path.dirname(encoded_corpus)

        a, b = [0.001, 0.001] if training == 'aggregate_utterances' else [0.01, 0.01]
        file_paths = ndl(encoded_corpus, alpha=a, beta=b, lam=1, longitudinal=args.longitudinal)

        celex_dict = get_celex_dictionary(args.celex_folder, reduced=reduced)

        for idx, file_path in file_paths.items():

            matrix, cues2ids, outcomes2ids = load(file_path)

            # get the column ids of all perfectly discriminated outcomes at the current time point
            # perfectly discriminated outcomes are considered to be those whose jaccard coefficient
            # between true phonetic cues and most active phonetic cues for the outcome is 1
            discriminated_file = os.path.join(
                corpus_dir, '.'.join(['discriminatedOutcomes', str(int(idx)), precision_tag, 'json']))

            if not os.path.exists(discriminated_file):
                # pass the values of the current parametrization (stress, boundary), not the
                # lists of candidate values they were drawn from
                discriminated = find_discriminated(matrix, cues2ids, outcomes2ids, celex_dict,
                                                   stress_marker=stress, uniphones=uniphones,
                                                   diphones=diphones, triphones=triphones,
                                                   syllables=syllables, boundaries=boundary,
                                                   at=args.precision)
                with open(discriminated_file, 'w') as handle:
                    json.dump(discriminated, handle)
            else:
                with open(discriminated_file, 'r') as handle:
                    discriminated = json.load(handle)

            print()
            print("The discriminated outcomes have been identified (file: %s)." % discriminated_file)

            accuracies = subset_experiment(matrix, discriminated, how_many_cues=how_many_cues,
                                           how_many_tokens=how_many_tokens)

            summary_table.loc[ii] = pd.Series({
                "Corpus": training,
                "Cues": cue,
                "Outcomes": outcome,
                "Stress": sm,
                "Boundaries": bound,
                "Vowels": vowels,
                "Time": int(idx),
                "Distance": distance,
                "Precision": args.precision,
                "numCues": how_many_cues,
                "numTokens": how_many_tokens,
                "Phon_acc": accuracies[0],
                "Phon_acc_subset": accuracies[1],
                "Distr_acc": accuracies[3],
                "Distr_acc_subset": accuracies[4],
                "Phon_baseline": accuracies[2],
                "Distr_baseline": accuracies[5]
            })
            ii += 1

    # append to the summary file when it already exists, otherwise create it with a header
    if os.path.exists(summary_file):
        summary_table.to_csv(summary_file, sep='\t', index=False, mode="a", header=False)
    else:
        summary_table.to_csv(summary_file, sep='\t', index=False)
def cluster_words(corpus, output_folder, celex_folder, pos_mapping, distance='cosine', reduced=False,
                  outcomes='tokens', uniphones=False, diphones=False, triphones=True, syllables=False,
                  stress_marker=True, boundaries=True, at=5, nn=25, a=0.01, b=0.01, longitudinal=False):

    """
    Cluster the discriminated outcomes of each weight matrix and evaluate the resulting clusters.

    The corpus is encoded with ``corpus_encoder`` and the weight matrices returned by ``ndl`` are
    processed one time index at a time. When the logfile for a time index already exists, the
    clusters are loaded from it; otherwise the discriminated outcomes are looked up (or computed
    with ``find_discriminated`` and cached as a .json file next to the encoded corpus), pairwise
    similarities are computed and saved as a tab-separated file, and the clusters are computed with
    ``sim.neighborhood`` and saved to the logfile.

    :param corpus:          the corpus to be used for training the model
    :param output_folder:   the folder where the logfile of the clustering experiment will be saved; plots are
                            saved in its 'plots' sub-folder. Both folders are created if they do not exist
    :param celex_folder:    the folder containing the data from the Celex database
    :param pos_mapping:     the path to the file mapping CHILDES pos tags to Celex tags
    :param distance:        a string (either 'correlation' or 'cosine') indicating which distance metric to use;
                            any value other than 'cosine' selects correlation
    :param reduced:         a boolean indicating whether to use reduced or full phonetic transcriptions from Celex
    :param outcomes:        a string (either 'tokens' or 'lemmas') indicating which outcomes to consider for learning
    :param uniphones:       a boolean indicating whether to consider uniphones as cues
    :param diphones:        a boolean indicating whether to consider diphones as cues
    :param triphones:       a boolean indicating whether to consider triphones as cues
    :param syllables:       a boolean indicating whether to consider syllables as cues
    :param stress_marker:   a boolean indicating whether to consider or discard stress information
    :param boundaries:      a boolean indicating whether to consider or discard word boundaries
    :param at:              an integer indicating how many outcomes to consider to compute discrimination's precision
    :param nn:              an integer indicating how many nearest neighbors to consider when evaluating clustering
    :param a:               the alpha parameter from the Rescorla Wagner model
    :param b:               the beta parameter from the Rescorla Wagner model
    :param longitudinal:    a boolean indicating whether to adopt a longitudinal design or not
    :return accuracies:     a dictionary mapping time indices to a dictionary with the keys 'accuracy',
                            'baseline_acc', 'entropy' and 'baseline_entr', holding the values returned by
                            sim.clustering_precision for the clusters obtained at that time point
    """

    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    plot_folder = os.path.join(output_folder, 'plots')
    if not os.path.exists(plot_folder):
        os.makedirs(plot_folder)

    encoded_corpus = corpus_encoder(corpus, celex_folder, pos_mapping, separator='~',
                                    stress_marker=stress_marker, reduced=reduced, uniphones=uniphones,
                                    diphones=diphones, triphones=triphones, syllables=syllables,
                                    outcomes=outcomes, boundaries=boundaries)

    corpus_dir = os.path.dirname(encoded_corpus)

    file_paths = ndl(encoded_corpus, alpha=a, beta=b, lam=1, longitudinal=longitudinal)

    celex_dict = get_celex_dictionary(celex_folder, reduced=reduced)

    accuracies = {}
    for idx, file_path in file_paths.items():

        # the logfile and the plot file are named from the same parametrization, they only differ
        # in destination folder and extension
        naming = dict(dist=distance, nn=nn, at=at, time=idx, outcomes=outcomes, reduced=reduced,
                      stress_marker=stress_marker, boundaries=boundaries, syllables=syllables,
                      uniphones=uniphones, diphones=diphones, triphones=triphones)
        logfile = make_log_file(corpus, output_folder, 'json', **naming)
        plotfile = make_log_file(corpus, plot_folder, 'pdf', **naming)

        if os.path.exists(logfile):
            print()
            print("The file %s already exists, statistics for the corresponding "
                  "parametrization are loaded from it" % logfile)
            with open(logfile, "r") as handle:
                clusters = json.load(handle)

        else:
            print()
            matrix, cues2ids, outcomes2ids = load(file_path)

            # get the column ids of all perfectly discriminated outcomes at the current time point
            # perfectly discriminated outcomes are considered to be those whose jaccard coefficient
            # between true phonetic cues and most active phonetic cues for the outcome is 1
            # NOTE(review): the name of this cache file does not depend on 'at', so results computed
            # with a different value of 'at' would be reused - confirm this is intended
            discriminated_file = os.path.join(
                corpus_dir, '.'.join(['discriminatedOutcomes', str(int(idx)), 'json']))

            if not os.path.exists(discriminated_file):
                discriminated = find_discriminated(matrix, cues2ids, outcomes2ids, celex_dict,
                                                   stress_marker=stress_marker, uniphones=uniphones,
                                                   diphones=diphones, triphones=triphones,
                                                   syllables=syllables, boundaries=boundaries, at=at)
                with open(discriminated_file, 'w') as handle:
                    json.dump(discriminated, handle)
            else:
                with open(discriminated_file, 'r') as handle:
                    discriminated = json.load(handle)

            print()
            print(strftime("%Y-%m-%d %H:%M:%S") +
                  ": Start clustering, using %s as weight matrix..." % (os.path.basename(file_path)))

            if distance == 'cosine':
                similarities, discriminated = sim.pairwise_cos(matrix, discriminated, plot_path=plotfile)
            else:
                similarities, discriminated = sim.pairwise_corr(matrix, discriminated, plot_path=plotfile)

            df = sim.sim2df(similarities, discriminated)
            similarities_file = os.path.join(
                corpus_dir, '.'.join(['similarities', distance, str(int(idx)), 'csv']))
            df.to_csv(similarities_file, sep='\t')

            clusters = sim.neighborhood(discriminated, similarities, nn=nn)
            with open(logfile, 'w') as handle:
                json.dump(clusters, handle)

            print(strftime("%Y-%m-%d %H:%M:%S") + ": ...completed test phase.")

        accuracy, baseline_acc, h, baseline_h = sim.clustering_precision(clusters)
        accuracies[idx] = {
            'accuracy': accuracy,
            'baseline_acc': baseline_acc,
            'entropy': h,
            'baseline_entr': baseline_h
        }

    return accuracies
def tag_words(input_file, test_set, celex_dir, pos_mapping, output_folder,
              method='freq', evaluation='count', k=50, flush=0, threshold=0,
              separator='~', reduced=False, outcomes='tokens', boundaries=True,
              uniphones=True, diphones=False, triphones=False, syllable=False,
              stress_marker=False, alpha=0.01, beta=0.01, lam=1.0,
              longitudinal=False, at=5):
    """
    Train an NDL model on a corpus and use it to assign PoS tags to test items.

    The corpus is encoded, one weight matrix is learned per time index, and
    for each matrix the test items are categorized. The per-item results are
    cached in a .json logfile: if the logfile for a parametrization already
    exists, it is loaded instead of being recomputed.

    :param input_file:      a .json file containing transcripts of child-caregiver interactions extracted from the
                            CHILDES database. The json file consists of two lists of lists, of the same length, both
                            contain utterances but encoded differently. The first encodes each utterance as a list
                            of tokens; the second encodes each utterance as a list of lemmas and Part-of-Speech
                            tags, joined by a vertical bar ('|')
    :param test_set:        a dictionary mapping the file name to:
                            - test_set['filename']: the basename of the file
                            - test_set['items']: the set of phonological forms to be categorized, complete of the
                              target PoS tag (phonological form and PoS tag are separated by a vertical bar ('|')
    :param celex_dir:       a string specifying the path to the Celex directory
    :param pos_mapping:     a .txt file mapping CHILDES PoS tags to CELEX tags
    :param output_folder:   the path to the folder where the logfiles will be saved; it is created if it does not
                            exist
    :param method:          a string indicating the way in which the function looks at top active outcomes; two
                            options are available:
                            - 'freq' makes the function compute the distribution of PoS tags over the k top active
                              nodes (see the explanation of the parameter k) and rank PoS tags according to their
                              frequency among the top active cues
                            - 'sum' makes the function compute the sum of activation from all outcomes belonging to
                              a given PoS tag within the k top active outcomes given the input cues, and rank PoS
                              tags according to their total activation among the top active cues
    :param evaluation:      a string indicating how to compare baseline activations to item-triggered ones; two
                            options are available:
                            - 'count', simply tag the test item with the PoS tag that either was more frequent or
                              had highest summed activation within the top active outcomes; frequency or activation
                              are returned and can be correlated to reaction times
                            - 'distr', compare the frequency counts or summed activations generated by a specific
                              test item to the frequency counts or summed activations at baseline and tag the test
                              item with the PoS tag receiving highest support by the change in the distribution of
                              frequencies or summed activations (a statistic is returned, Chi-squared for frequency
                              distributions and t-test for summed activations, whose value can be correlated to
                              reaction times)
    :param k:               an integer specifying how many elements to consider from the baseline activations and
                            the activations triggered by a specific test item. By default, the top 50 outcomes are
                            considered, and compared according to the chosen combination of method and eval
    :param flush:           specify whether (and how many) top active outcome at baseline to flush away from
                            subsequent computations. It may be the case that whatever the input cues, the same high
                            frequency outcomes come out as being the most active. It may then make sense to not
                            consider them when evaluating the distribution of lexical categories over the most
                            active outcomes given an input item
    :param threshold:       the minimum activation of an outcome to be considered in the list of top activated
                            neighbors, default is 0 and shouldn't be lowered, but can be increased.
    :param separator:       the character that separates the word baseform from its PoS tag in the input corpus
    :param reduced:         a boolean specifying whether reduced phonological forms should be extracted from Celex
                            whenever possible (if set to True) or if standard phonological forms should be
                            preserved (if False)
    :param outcomes:        a string indicating which outcomes to use, whether 'tokens' (default) or 'lemmas'
    :param boundaries:      a boolean specifying whether word boundaries are to be considered when training on full
                            utterances
    :param uniphones:       a boolean indicating whether single phonemes are to be considered while encoding input
                            utterances
    :param diphones:        a boolean indicating whether sequences of two phonemes are to be considered while
                            encoding input utterances
    :param triphones:       a boolean indicating whether sequences of three phonemes are to be considered while
                            encoding input utterances
    :param syllable:        a boolean indicating whether syllables are to be considered while encoding input
                            utterances
    :param stress_marker:   a boolean indicating whether stress markers from the phonological representations of
                            Celex need to be preserved or can be discarded
    :param alpha:           a number indicating cue salience. For simplicity, we assume that every cue has the same
                            salience, so changing the value of this parameter does not affect the relative strength
                            of cue-outcome associations but only their absolute magnitude
    :param beta:            a number indicating the learning rate for positive and negative situations. Again, we
                            make the simplifying assumption that our simulated learners are equally affected by
                            positive and negative feedback. Changing the beta value can have a significant impact
                            on the learning outcome, but 0.1 is a standard choice for this model. If the number of
                            learning trials or the number of different cues in a learning trial are very large,
                            both beta and alpha need to be lowered considerably
    :param lam:             maximum amount of association that an outcome can receive from all the cues. It simply
                            acts as a scaling factor, so changing its value has the same effects of changing alpha
    :param longitudinal:    a boolean specifying whether to work in a longitudinal setting or not
    :param at:              the number of top active outcomes to consider to compute precision
    :return log_dicts:      a dictionary mapping each time index to the log dictionary for that time index, either
                            loaded from an existing logfile or freshly produced by the categorization step
    :return accuracies:     a dictionary mapping the categorization accuracy on the PoS tagging experiment to each
                            time index (1 if the longitudinal parameter is set to False, 10 if it's set to True)
    :return entropies:      a dictionary mapping the normalized entropy of the distribution of the PoS tags
                            assigned by the model to each time index (1 if the longitudinal parameter is set to
                            False, 10 if it's set to True)
    :return most_frequents: a dictionary mapping the PoS tag that was applied the most by the model to each time
                            index (1 if the longitudinal parameter is set to False, 10 if it's set to True)
    :return frequencies:    a dictionary mapping the frequency count of the most frequent PoS tag applied by the
                            model, to each time index (1 if the longitudinal parameter is set to False, 10 if it's
                            set to True)
    """

    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    encoded_corpus = corpus_encoder(input_file, celex_dir, pos_mapping,
                                    separator=separator,
                                    uniphones=uniphones,
                                    diphones=diphones,
                                    triphones=triphones,
                                    syllables=syllable,
                                    stress_marker=stress_marker,
                                    reduced=reduced,
                                    outcomes=outcomes,
                                    boundaries=boundaries)

    file_paths = ndl(encoded_corpus, alpha=alpha, beta=beta, lam=lam,
                     longitudinal=longitudinal)

    # for each test item, compute the items from the matrix of weights that are most activated given the cues in the
    # item, get the PoS tag that is most present among the most active lexical nodes and check whether the predicted
    # PoS tag matches the gold-standard one provided along the test item. Return a global score indicating the
    # accuracy on the test set
    accuracies = {}
    entropies = {}
    most_frequents = {}
    frequencies = {}
    log_dicts = {}

    celex_dict = get_celex_dictionary(celex_dir, reduced=reduced)

    for idx, file_path in file_paths.items():

        logfile = make_log_file(input_file, test_set['filename'], output_folder,
                                method, evaluation, flush, k, at, idx,
                                reduced=reduced,
                                uniphones=uniphones,
                                diphones=diphones,
                                triphones=triphones,
                                syllables=syllable,
                                stress_marker=stress_marker,
                                outcomes=outcomes,
                                boundaries=boundaries)

        if os.path.exists(logfile):
            print()
            print(
                "The file %s already exists, statistics for the corresponding parametrization are loaded from it"
                % logfile)
            # context manager so the handle is closed as soon as the log is parsed
            with open(logfile, "r") as log_handle:
                log_dict = json.load(log_handle)

        else:
            print()
            matrix, cues2ids, outcomes2ids = load(file_path)

            # get the column ids of all perfectly discriminated outcomes at the current time point
            # perfectly discriminated outcomes are considered to be those:
            # - whose jaccard coefficient between true phonetic cues and most active phonetic cues for the outcome
            #   is 1
            # - and that appear in the top active outcomes given the cues they consist of
            corpus_folder = os.path.dirname(encoded_corpus)
            discriminated_file = os.path.join(
                corpus_folder,
                '.'.join(['discriminatedOutcomes', str(int(idx)), 'json']))
            if not os.path.exists(discriminated_file):
                # NOTE(review): the freshly computed outcomes are not written to discriminated_file here, so this
                # function only ever reads that cache - confirm whether that is intended
                discriminated = find_discriminated(matrix, cues2ids, outcomes2ids, celex_dict,
                                                   stress_marker=stress_marker,
                                                   uniphones=uniphones,
                                                   diphones=diphones,
                                                   triphones=triphones,
                                                   syllables=syllable,
                                                   boundaries=boundaries,
                                                   at=at)
            else:
                with open(discriminated_file, 'r') as discriminated_handle:
                    discriminated = json.load(discriminated_handle)

            print()
            print(
                strftime("%Y-%m-%d %H:%M:%S") +
                ": Start test phase, using %s as weight matrix and %s as test set..."
                % (os.path.basename(file_path),
                   os.path.basename(test_set['filename'])))

            log_dict = categorize(test_set['items'], matrix, cues2ids, discriminated,
                                  method=method,
                                  evaluation=evaluation,
                                  flush=flush,
                                  k=k,
                                  threshold=threshold,
                                  stress_marker=stress_marker,
                                  syllables=syllable,
                                  uniphones=uniphones,
                                  diphones=diphones,
                                  triphones=triphones,
                                  boundaries=boundaries)

            # closing the handle explicitly guarantees the log is flushed to disk before moving on
            with open(logfile, 'w') as log_handle:
                json.dump(log_dict, log_handle)

            print(strftime("%Y-%m-%d %H:%M:%S") + ": ...completed test phase.")

        f1, h, pos, freq = compute_summary_statistics(log_dict)

        accuracies[idx] = f1
        entropies[idx] = h
        most_frequents[idx] = pos
        frequencies[idx] = freq
        log_dicts[idx] = log_dict

        print("Accuracy: %0.5f" % f1)
        print()

    return log_dicts, accuracies, entropies, most_frequents, frequencies