def main():
    parser = argparse.ArgumentParser(description='Process arguments to create Celex dictionary.')
    parser.add_argument('-C', '--Celex_dir', required=True, dest='celex_dir',
                        help='Specify the directory containing the CELEX files.')
    parser.add_argument('-r', '--reduced', action='store_true', dest='reduced',
                        help='Specify if the function considers reduced phonological forms.')
    args = parser.parse_args()

    if not os.path.isdir(args.celex_dir):
        raise ValueError("The folder you provided does not exist. Please provide the path to an existing folder.")
    else:
        get_celex_dictionary(args.celex_dir, reduced=args.reduced)
def get_cue_and_outcome_measures(associations, row_ids, col_ids, celex_dir, plot_path,
                                 uniphones, diphones, triphones, syllable, stress_marker, reduced=False):
    """
    :param associations:    the matrix of cue-outcome association weights estimated from the input corpus
    :param row_ids:         a dictionary mapping cues to their row indices in the association matrix
    :param col_ids:         a dictionary mapping outcomes to their column indices in the association matrix
    :param celex_dir:       the path to the Celex directory
    :param plot_path:       the folder where the plots are saved
    :param uniphones:       a boolean indicating whether uni-phones are relevant phonetic cues
    :param diphones:        a boolean indicating whether di-phones are relevant phonetic cues
    :param triphones:       a boolean indicating whether tri-phones are relevant phonetic cues
    :param syllable:        a boolean indicating whether syllables are relevant phonetic cues
    :param stress_marker:   a boolean indicating whether stress markers are preserved in the phonetic transcriptions
    :param reduced:         a boolean specifying whether reduced phonological forms are extracted from Celex
    :return:                the cue-level measures, the outcome-level measures, and the Jaccard values computed
                            from the association matrix
    """

    celex_dict = get_celex_dictionary(celex_dir, reduced)
    jaccard_values = jaccard(associations, row_ids, col_ids, celex_dict, plots_folder=plot_path,
                             stress_marker=stress_marker, uniphone=uniphones, diphone=diphones,
                             triphone=triphones, syllable=syllable)
    outcome_values = outcome_measures(associations, col_ids, plot_path)
    cue_values = cue_measures(associations, row_ids, col_ids, plot_path)

    return cue_values, outcome_values, jaccard_values
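# A minimal usage sketch for the function above; the weight file name and the directory paths are hypothetical
# placeholders, and the matrix and index dictionaries would normally come from load() on a file produced by ndl():
#
#     matrix, cues2ids, outcomes2ids = load('weights.json')
#     cue_vals, outcome_vals, jaccard_vals = get_cue_and_outcome_measures(
#         matrix, cues2ids, outcomes2ids, 'celex/', 'plots/', uniphones=False, diphones=False,
#         triphones=True, syllable=False, stress_marker=True, reduced=False)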
def make_test_set(corpus_file, celex_dir, pos_mapping, output_file,
                  ambiguous=False, new=False, reduced=False, stress_marker=False):
    """
    :param corpus_file:     the path to a .json file consisting of two lists, each of which in turn consists of
                            several lists; each inner list contains strings
    :param celex_dir:       the path to the Celex directory
    :param pos_mapping:     a .txt file mapping CHILDES PoS tags to CELEX tags
    :param output_file:     the file where the test items will be printed
    :param ambiguous:       if True, words are returned that are tagged with multiple PoS tags in CELEX and all
                            have the same pronunciation
    :param new:             if True, words are returned that are listed in the CELEX dictionary but never appear
                            in the input corpus
    :param reduced:         a boolean specifying whether reduced phonological forms should be extracted from Celex
                            whenever possible (if set to True) or if standard phonological forms should be
                            preserved (if False)
    :param stress_marker:   a boolean indicating whether stress information from the test items should be
                            preserved or not. It is assumed that test items are all encoded for stress: setting
                            this argument to False causes the algorithm to discard the stress information.
                            Secondary stress, marked with ("), is always deleted. It is assumed that stress is
                            encoded as (')
    :return test_set:       a set containing the phonological forms of the selected test items; orthographic
                            forms - and corresponding phonological forms - are written to the output file based
                            on the input arguments ambiguous and new
    """

    corpus = json.load(open(corpus_file, 'r+'))
    celex_dict = get_celex_dictionary(celex_dir, reduced=reduced)
    pos_tags = get_pos_mapping(pos_mapping)

    corpus_words = get_words_from_corpus(corpus)
    filtered_words = align_tag_set(corpus_words, pos_tags)
    celex_lemmas = lemmas2phon(celex_dict)
    lemmas_with_pos = {'|'.join([k, p]) for k in celex_lemmas for p in celex_lemmas[k]}

    if new:
        # only look at words that are in CELEX but not in the corpus, without considering PoS tags
        candidates = lemmas_with_pos - (lemmas_with_pos.intersection(filtered_words))
    else:
        # only look at words that are both in CELEX and in the corpus, without considering PoS tags
        candidates = lemmas_with_pos.intersection(filtered_words)

    ambiguous_words = set()
    unambiguous_words = set()
    candidates_copy = {el.split('|')[0] for el in candidates}
    total_words = len(candidates_copy) - 1
    check_points = {int(np.floor(total_words / 100 * n)): n for n in np.linspace(20, 100, 5)}

    for idx, word in enumerate(candidates_copy):
        pos_tags = set(celex_lemmas[word].keys())
        if len(pos_tags) > 1:
            phonetic_forms = defaultdict(set)
            for tag in pos_tags:
                # when the same orthographic form corresponds to two different phonetic forms (that differ
                # depending on the PoS tag), this dictionary has multiple keys, each paired to the PoS tag
                # matching the phonetic form
                phonetic_forms[celex_lemmas[word][tag]].add(tag)

            if len(set(phonetic_forms.keys())) == 1:
                # if the set of phonetic forms has cardinality 1, then the word is pronounced the same regardless
                # of the PoS tag; add the word with all its PoS tags to the set of ambiguous words
                for tag in pos_tags:
                    test_item = derive_orthography_and_phonology(word, tag, celex_lemmas)
                    ambiguous_words.add(test_item)
            else:
                # if the cardinality of the set of phonetic forms is not 1, then there are pronunciation
                # differences for the same orthographic form depending on the PoS tag; if this is the case, find
                # possible pairs that are ambiguous and the forms that are unambiguous, and add each to the test
                # set according to the parameter choice
                if len(pos_tags) == len(set(phonetic_forms.keys())):
                    # if there are as many different phonetic forms as there are PoS tags, it means that there is
                    # no ambiguity once the PoS tag is considered and that the phonetic form of a word maps
                    # uniquely to a given combination of orthographic form and PoS tag; thus, add each word-PoS
                    # combination together with the corresponding phonological form to the set of unambiguous
                    # words
                    for tag in pos_tags:
                        test_item = derive_orthography_and_phonology(word, tag, celex_lemmas)
                        unambiguous_words.add(test_item)
                else:
                    # if there are fewer phonetic forms than PoS tags, then there is some ambiguity, meaning that
                    # two or more combinations of orthographic form and PoS tag are pronounced the same, making
                    # them indistinguishable from a purely phonetic perspective. There might be unambiguous
                    # forms, but it is not guaranteed as there could be pairs of ambiguous orthographic forms:
                    # phonetic forms that pair to more than one PoS tag are ambiguous, those that pair to a
                    # single PoS tag are not.
                    for form, tags in phonetic_forms.items():
                        if len(tags) == 1:
                            test_item = derive_orthography_and_phonology(word, list(tags)[0], celex_lemmas)
                            unambiguous_words.add(test_item)
                        else:
                            for tag in tags:
                                test_item = derive_orthography_and_phonology(word, tag, celex_lemmas)
                                ambiguous_words.add(test_item)
        else:
            test_item = derive_orthography_and_phonology(word, list(pos_tags)[0], celex_lemmas)
            unambiguous_words.add(test_item)

        if idx in check_points:
            print(strftime("%Y-%m-%d %H:%M:%S") +
                  ": %d%% of target words have been evaluated." % check_points[idx])

    test_set = set()
    words_to_evaluate = ambiguous_words if ambiguous else unambiguous_words
    print()
    print(strftime("%Y-%m-%d %H:%M:%S") + ": Started writing to file...")
    with open(output_file, "a+") as o:
        for word in words_to_evaluate:
            if word[0] in candidates:
                phon = word[1].replace("\"", "")
                if not stress_marker:
                    # str.replace returns a new string, so the result has to be assigned back
                    phon = phon.replace("'", "")
                test_set.add(phon)
                o.write("\t".join([word[0], phon]))
                o.write("\n")
    print(strftime("%Y-%m-%d %H:%M:%S") + ": ...finished writing to file.")

    return test_set
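# A minimal usage sketch for make_test_set; all paths are hypothetical placeholders. With ambiguous=True the
# returned items are the phonological forms that CELEX pairs with more than one PoS tag under a single
# pronunciation:
#
#     test_items = make_test_set('corpus.json', 'celex/', 'pos_mapping.txt', 'test_items.txt',
#                                ambiguous=True, new=False, reduced=False, stress_marker=True)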
def corpus_encoder(corpus_name, celex_dir, pos_mapping, separator='~', reduced=True, outcomes='tokens',
                   uniphones=True, diphones=False, triphones=False, syllables=False,
                   stress_marker=False, boundaries=False):
    """
    :param corpus_name:     the path to a .json file containing transcripts of child-caregiver interactions
                            extracted from the CHILDES database. The json file consists of two lists of lists,
                            of the same length, both containing words but encoded differently. The first is a
                            list of tokens, i.e. surface forms as they appear in the transcriptions; the second
                            is a list of lemmas, i.e. words in their dictionary form, without any inflectional
                            morphology, together with their Part-of-Speech tags, joined by a specific character
                            (which can be specified with the parameter 'separator').
    :param celex_dir:       a string indicating the path to the Celex directory containing files with
                            phonological and morphological information for words and lemmas ('epw.cd', 'epl.cd',
                            'emw.cd', 'eml.cd'). The function also checks whether this directory already contains
                            the Celex dictionary: if it is found, the dictionary is loaded and the function
                            proceeds, otherwise the dictionary is created.
    :param pos_mapping:     the path to a .txt file indicating the mapping between CHILDES and Celex PoS tags;
                            it must consist of two space-separated columns.
    :param separator:       the character that separates the word baseform from its PoS tag in the input corpus
    :param reduced:         a boolean specifying whether reduced phonological forms should be extracted from
                            Celex whenever possible (if set to True) or if standard phonological forms should be
                            preserved (if False)
    :param outcomes:        a string indicating which outcomes to use, whether 'tokens' (default) or 'lemmas'
    :param uniphones:       a boolean indicating whether single phonemes are to be considered while encoding
                            input utterances
    :param diphones:        a boolean indicating whether sequences of two phonemes are to be considered while
                            encoding input utterances
    :param triphones:       a boolean indicating whether sequences of three phonemes are to be considered while
                            encoding input utterances
    :param syllables:       a boolean indicating whether syllables are to be considered while encoding input
                            utterances
    :param stress_marker:   a boolean indicating whether stress markers from the phonological representations of
                            Celex need to be preserved or can be discarded
    :param boundaries:      a boolean indicating whether to preserve or discard word boundaries
    :return out_file:       the path to the file where the encoded version of the input file has been printed

    This function runs in linear time on the length of the input (if it takes 1 minute to process 1k utterances,
    it takes 2 minutes to process 2k utterances). It processes ~550k utterances in ~10 seconds on a 2x Intel Xeon
    6-Core E5-2603v3 (2x6 cores) with 2x128 GB of RAM.
    """

    # get the Celex dictionary; create a dictionary where token surface forms are keys and values are sets
    # containing all the token IDs that match a given surface form; then get the mapping from CHILDES to Celex
    # PoS tags
    celex_dict = get_celex_dictionary(celex_dir, reduced=reduced)
    tokens2identifiers = tokens2ids(celex_dict)
    pos_dict = get_pos_mapping(pos_mapping)

    # use the path of the input file to generate the path of the output file, adding encoding information to the
    # input filename; print to standard output a summary of all the encoding parameters
    input_filename, extension = os.path.splitext(corpus_name)
    encoding_string = encoding_features(corpus_name, reduced=reduced, uniphones=uniphones, diphones=diphones,
                                        triphones=triphones, syllables=syllables, stress_marker=stress_marker,
                                        outcomes=outcomes, boundaries=boundaries)
    output_folder = "_".join([input_filename, encoding_string])
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)
    output_file = os.path.join(output_folder, ".".join([output_folder.split('/')[-1], 'json']))

    # check whether the output file corresponding to the desired parameters already exists and stop if it does
    if os.path.isfile(output_file):
        print()
        print("The desired encoded version of the input corpus '%s' already exists at file '%s'." %
              (os.path.basename(corpus_name), os.path.basename(output_file)))
        return output_file
    else:
        print(strftime("%Y-%m-%d %H:%M:%S") +
              ": Started encoding utterances from input corpus '%s'" % corpus_name)

        # get the corpus recoded into phonological cues and lexical outcomes, plus the percentage of utterances
        # that could not be recoded because one or more words do not have a corresponding entry in Celex
        encoded_corpus, missed = encode_corpus(corpus_name, celex_dict, tokens2identifiers, pos_dict,
                                               separator=separator, uniphones=uniphones, diphones=diphones,
                                               triphones=triphones, syllables=syllables,
                                               stress_marker=stress_marker, outcomes=outcomes,
                                               boundaries=boundaries)
        print()
        print(strftime("%Y-%m-%d %H:%M:%S") +
              ": Finished encoding utterances from input corpus '%s'" % corpus_name)
        print()

        perc_covered = 100 - missed
        print()
        if os.path.exists(output_file):
            print("The file %s already exists." % output_file)
        else:
            json.dump(encoded_corpus, open(output_file, 'w'))
            print("The file %s has been created:" % output_file)
            print()
            print("%0.4f%% of the utterances could be entirely encoded." % perc_covered)
            print("The remaining %0.4f%% contain at least one word that could not be retrieved in CELEX and " % missed)
            print("for which no phonological and morphological representation could be obtained.")

        return output_file
def main():
    parser = argparse.ArgumentParser(description="Compute the variance of each phonological cue and token, "
                                                 "as a proxy to identify the amount of information they carry")
    parser.add_argument("-c", "--corpus", required=True, dest="corpus",
                        help="Specify the path to the training corpus (encoded as .json).")
    parser.add_argument("-C", "--celex_folder", required=True, dest="celex_folder",
                        help="Specify the folder where the Celex data are located.")
    parser.add_argument("-O", "--output_folder", required=True, dest="output_folder",
                        help="Specify the path of the folder where the logfiles will be stored together with "
                             "the summary tables.")
    parser.add_argument("-M", "--pos_mapping", required=True, dest="pos_mapping",
                        help="Specify the path of the file containing the mapping from CHILDES to Celex PoS tags.")
    parser.add_argument("-p", "--precision", dest="precision", default=5,
                        help="Specify the number of outcomes to consider when computing discrimination's precision.")
    parser.add_argument("-l", "--longitudinal", action="store_true", dest="longitudinal",
                        help="Specify whether to use a longitudinal design (default: False).")
    args = parser.parse_args()

    cues = ['triphones']
    outcomes = ['tokens']
    stress_marker = [True]
    boundaries = [True]
    reduced_vowels = [False]
    number_of_cues = [100, 1000]

    if not os.path.exists(args.output_folder):
        os.makedirs(args.output_folder)
    cues_variance_file = os.path.join(args.output_folder, "words_cues_variance.csv")
    tokens_variance_file = os.path.join(args.output_folder, "words_tokens_variance.csv")

    parametrizations = it.product(cues, outcomes, stress_marker, boundaries, reduced_vowels)
    time_points = np.linspace(10, 100, 10) if args.longitudinal else [100]

    cues_table = pd.DataFrame(
        index=[], columns=["Corpus", "Cues", "Outcomes", "Stress", "Boundaries", "Vowels", "Precision", "Time",
                           "Phonological_cue", "Variance", "Frequency", "Lexical_diversity",
                           "Phonological_diversity", "Cue|Cues_predictability", "Cue|Tokens_predictability",
                           "Cues|Cue_predictability", "Tokens|Cue_predictability"])
    tokens_table = pd.DataFrame(
        index=[], columns=["Corpus", "Cues", "Outcomes", "Stress", "Boundaries", "Vowels", "Precision", "Time",
                           "numCues", "Token", "Variance", "Frequency", "Lexical_diversity",
                           "Phonological_diversity", "Token|Tokens_predictability",
                           "Token|Cues_predictability", "Tokens|Token_predictability",
                           "Cues|Token_predictability"])

    ii = 0
    jj = 0
    for parametrization in parametrizations:
        print(parametrization)
        cue_type, outcome, stress, boundary, reduced = parametrization
        uniphones = True if cue_type == 'uniphones' else False
        diphones = True if cue_type == 'diphones' else False
        triphones = True if cue_type == 'triphones' else False
        syllables = True if cue_type == 'syllables' else False
        vowels = 'reduced' if reduced else 'full'
        sm = 'stress' if stress else 'no-stress'
        bound = 'yes' if boundary else 'no'
        training = os.path.splitext(os.path.basename(args.corpus))[0]

        encoded_corpus = corpus_encoder(args.corpus, args.celex_folder, args.pos_mapping, separator='~',
                                        stress_marker=stress, reduced=reduced, uniphones=uniphones,
                                        diphones=diphones, triphones=triphones, syllables=syllables,
                                        outcomes=outcome, boundaries=boundary)

        print(strftime("%Y-%m-%d %H:%M:%S") +
              ": Started computing distributional statistics from the corpus...")
        token_statistics, cue_statistics = usf.compute_distributional_predictors(encoded_corpus, time_points)
        print(strftime("%Y-%m-%d %H:%M:%S") +
              ": ...finished computing distributional statistics from the corpus.")
        print()

        corpus_dir = os.path.dirname(encoded_corpus)
        a, b = [0.001, 0.001] if training == 'aggregate_utterances' else [0.01, 0.01]
        file_paths = ndl(encoded_corpus, alpha=a, beta=b, lam=1, longitudinal=args.longitudinal)
        celex_dict = get_celex_dictionary(args.celex_folder, reduced=reduced)

        for idx, file_path in file_paths.items():
            idx = int(idx)
            matrix, cues2ids, outcomes2ids = load(file_path)

            # get the column ids of all perfectly discriminated outcomes at the current time point:
            # perfectly discriminated outcomes are considered to be those whose Jaccard coefficient between
            # true phonetic cues and most active phonetic cues for the outcome is 1
            discriminated_file = os.path.join(
                corpus_dir, '.'.join(['discriminatedOutcomes', str(idx),
                                      ''.join(['at', str(args.precision)]), 'json']))
            if not os.path.exists(discriminated_file):
                discriminated = find_discriminated(matrix, cues2ids, outcomes2ids, celex_dict,
                                                   stress_marker=stress, uniphones=uniphones, diphones=diphones,
                                                   triphones=triphones, syllables=syllables, boundaries=boundary,
                                                   at=int(args.precision))
                json.dump(discriminated, open(discriminated_file, 'w'))
            else:
                discriminated = json.load(open(discriminated_file, 'r'))
            print()
            print(strftime("%Y-%m-%d %H:%M:%S") +
                  ": The discriminated outcomes have been identified (file: %s)." % discriminated_file)
            print()

            row_variances, matrix, discriminated = usf.get_cue_variances(matrix, discriminated)
            cue_variances = {}
            for cue in cues2ids:
                cue_idx = cues2ids[cue]
                cue_variances[cue] = row_variances[cue_idx]

            print(strftime("%Y-%m-%d %H:%M:%S") + ": Started storing cue variances...")
            for cue in cue_variances:
                if len(cue_statistics[cue]['freq']) == 10:
                    frequency = cue_statistics[cue]['freq'][idx]
                    lexical_diversity = cue_statistics[cue]['lexdiv'][idx]
                    phonological_diversity = cue_statistics[cue]['phondiv'][idx]
                    # average conditional probability of a cue given the co-occurring cues
                    cue_cues_predictability = cue_statistics[cue]['p_cue_cues'][idx]
                    # average predictive power of a cue with respect to all the co-occurring cues
                    cues_cue_predictability = cue_statistics[cue]['p_cues_cue'][idx]
                    # average conditional probability of a cue given the co-occurring tokens
                    cue_tokens_predictability = cue_statistics[cue]['p_cue_tokens'][idx]
                    # average predictive power of a cue with respect to all the co-occurring tokens
                    tokens_cue_predictability = cue_statistics[cue]['p_tokens_cue'][idx]
                    cues_table.loc[ii] = pd.Series({
                        "Corpus": training, "Cues": cue_type, "Outcomes": outcome, "Stress": sm,
                        "Boundaries": bound, "Vowels": vowels, "Time": idx, "Precision": int(args.precision),
                        "Phonological_cue": cue, "Variance": cue_variances[cue], "Frequency": frequency,
                        "Lexical_diversity": lexical_diversity,
                        "Phonological_diversity": phonological_diversity,
                        "Cue|Cues_predictability": cue_cues_predictability,
                        "Cues|Cue_predictability": cues_cue_predictability,
                        "Cue|Tokens_predictability": cue_tokens_predictability,
                        "Tokens|Cue_predictability": tokens_cue_predictability})
                    ii += 1
            print(strftime("%Y-%m-%d %H:%M:%S") + ": ...finished storing cue variances.")
            print()

            for how_many_cues in number_of_cues:
                print("Number of cues: ", how_many_cues)
                token_variances = usf.get_token_variances(matrix, discriminated, row_variances,
                                                          how_many_cues=how_many_cues)
                print(strftime("%Y-%m-%d %H:%M:%S") + ": Started storing token variances...")
                for token in token_variances:
                    if len(token_statistics[token]['freq']) == 10:
                        frequency = token_statistics[token]['freq'][idx]
                        lexical_diversity = token_statistics[token]['lexdiv'][idx]
                        phonological_diversity = token_statistics[token]['phondiv'][idx]
                        # average conditional probability of a token given the co-occurring tokens
                        token_tokens_predictability = token_statistics[token]['p_token_tokens'][idx]
                        # average predictive power of a token with respect to the co-occurring tokens
                        tokens_token_predictability = token_statistics[token]['p_tokens_token'][idx]
                        # average conditional probability of a token given the co-occurring phonological cues
                        token_cues_predictability = token_statistics[token]['p_token_cues'][idx]
                        # average predictive power of a token with respect to the co-occurring phonological cues
                        cues_token_predictability = token_statistics[token]['p_cues_token'][idx]
                        tokens_table.loc[jj] = pd.Series({
                            "Corpus": training, "Cues": cue_type, "Outcomes": outcome, "Stress": sm,
                            "Boundaries": bound, "Vowels": vowels, "Time": idx, "numCues": how_many_cues,
                            "Precision": int(args.precision), "Token": token,
                            "Variance": token_variances[token], "Frequency": frequency,
                            "Lexical_diversity": lexical_diversity,
                            "Phonological_diversity": phonological_diversity,
                            "Token|Tokens_predictability": token_tokens_predictability,
                            "Tokens|Token_predictability": tokens_token_predictability,
                            "Token|Cues_predictability": token_cues_predictability,
                            "Cues|Token_predictability": cues_token_predictability})
                        jj += 1
                print(strftime("%Y-%m-%d %H:%M:%S") + ": ...finished storing token variances.")
                print()
            print('-' * 100)
            print()

        print('=' * 100)
        print('=' * 100)
        print()

    print('#' * 100)
    print('#' * 100)
    print('#' * 100)
    print()

    if os.path.exists(cues_variance_file):
        cues_table.to_csv(cues_variance_file, sep='\t', index=False, mode="a", header=False)
    else:
        cues_table.to_csv(cues_variance_file, sep='\t', index=False)

    if os.path.exists(tokens_variance_file):
        tokens_table.to_csv(tokens_variance_file, sep='\t', index=False, mode="a", header=False)
    else:
        tokens_table.to_csv(tokens_variance_file, sep='\t', index=False)
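# For orientation only: the per-cue variance gathered above can be pictured as the row-wise variance of the
# cue-outcome weight matrix. The sketch below is written under that assumption; the project's own
# usf.get_cue_variances also filters the matrix down to the discriminated outcomes before computing it.
def row_variances_sketch(weight_matrix):
    # variance of each cue's association weights across outcomes: rows that vary a lot single out specific
    # outcomes, while flat rows carry little discriminative information
    return np.var(weight_matrix, axis=1)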
def map_phonology(corpus_file, mapping_file, output_file, celex_dir,
                  compounds=True, reduced=False, minimalist=True):
    """
    :param corpus_file:     the path to a .txt file containing one utterance per line, with all words in the
                            utterance separated by a comma and each word being a tuple consisting of four
                            pipe-separated elements, token|lemma|PoS1|PoS2, where PoS1 is the coarse Celex tag
                            and PoS2 is the tag provided by the TreeTagger
    :param mapping_file:    the path to a .txt file where the output of the process will be written to
    :param output_file:     the path to a .txt file where the lines from the input will be rewritten as
                            comma-separated sequences of pipe-separated 5-tuples consisting of token, lemma,
                            pos1, pos2, 3phones
    :param celex_dir:       the directory where the Celex dictionary is to be found
    :param compounds:       a boolean. If True, all entries in Celex are considered; if False, entries which
                            contain spaces are discarded
    :param reduced:         a boolean specifying whether reduced phonological forms should be extracted from
                            Celex whenever possible (if set to True) or if standard phonological forms should
                            be preserved (if False)
    :param minimalist:      a boolean. It specifies whether lemmas in the output file should be differentiated
                            when their phonetic realization changes depending on the part of speech: if
                            minimalist is True, lemmas are not differentiated (default), if it is False, lemmas
                            are differentiated by appending pos1 to the lemma, separated by a colon
    :return mapping:        a dictionary mapping pipe-separated keys consisting of token, lemma, pos1, pos2,
                            derivational code, morphological status, and token:pos1 to the matching triphones
    """

    celex_dict = get_celex_dictionary(celex_dir, reduced=reduced, compounds=compounds)
    tokens2identifiers = tokens2ids(celex_dict)
    mapping = {}
    lemma2phon = defaultdict(dict)
    new_corpus = []

    with open(corpus_file, 'r') as fr:
        for line in fr:
            words = line.strip().split(',')
            new_line = []
            for word in words:
                if word:
                    try:
                        token, lemma, pos1, pos2 = word.split('|')
                    except ValueError:
                        token, lemma, pos1 = word.split('|')
                        pos2 = 'NN' if pos1 == 'N' else pos1
                    new_token, new_lemma = adjust_apostrophes(token, lemma)
                    new_token = new_token.replace('=', '_')
                    new_lemma = new_lemma.replace('=', '_')
                    token_phonological_form = get_phonetic_encoding([(new_token, pos1, new_lemma)],
                                                                    celex_dict, tokens2identifiers)
                    lemma_phonology = get_phonetic_encoding([(new_lemma, pos1, new_lemma)],
                                                            celex_dict, tokens2identifiers)
                    lemma_phonological_form = ''.join(lemma_phonology) if isinstance(lemma_phonology, list) \
                        else ''.join(token_phonological_form)
                    if isinstance(token_phonological_form, list):
                        triphones = encode_item(token_phonological_form[0], triphones=True, stress_marker=True,
                                                uniphones=False, diphones=False, syllables=False)
                        deriv = code_derivational_morphology(pos2)
                        output_token = token.replace('_', '=')
                        output_lemma = lemma.replace('_', '=')
                        morpho = 'COMPOUND' if '=' in output_token else 'MONO'
                        key = '|'.join([output_token, output_lemma, pos1, pos2, deriv, morpho,
                                        ':'.join([output_token, pos1])])
                        output_triphones = ';'.join(triphones)
                        mapping[key] = output_triphones
                        if lemma_phonological_form in lemma2phon[output_lemma]:
                            lemma2phon[output_lemma][lemma_phonological_form].add(pos1)
                        else:
                            lemma2phon[output_lemma][lemma_phonological_form] = {pos1}
                        new_line.append((output_token, ':'.join([output_lemma, pos1]), pos1, pos2,
                                         output_triphones))
            new_corpus.append(new_line)

    write_mapping_file(mapping, mapping_file)
    write_output_corpus(new_corpus, output_file, lemma2phon, minimalist=minimalist)

    return mapping
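# A minimal usage sketch for map_phonology; the paths are hypothetical placeholders. Compounds are kept and
# lemmas are left undifferentiated by PoS (minimalist=True):
#
#     mapping = map_phonology('tagged_corpus.txt', 'phon_mapping.txt', 'tagged_corpus_phon.txt', 'celex/',
#                             compounds=True, reduced=False, minimalist=True)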
def main():
    parser = argparse.ArgumentParser(description="Assess whether words from the same category cluster together, "
                                                 "first considering their sound patterns and then how they "
                                                 "correlate to each other based on their contexts of occurrence")
    parser.add_argument("-c", "--corpus", required=True, dest="corpus",
                        help="Specify the path to the training corpus (encoded as .json).")
    parser.add_argument("-C", "--celex_folder", required=True, dest="celex_folder",
                        help="Specify the folder where the Celex data are located.")
    parser.add_argument("-O", "--output_folder", required=True, dest="output_folder",
                        help="Specify the path of the folder where the logfiles will be stored together with "
                             "the summary tables.")
    parser.add_argument("-M", "--pos_mapping", required=True, dest="pos_mapping",
                        help="Specify the path of the file containing the mapping from CHILDES to Celex PoS tags.")
    parser.add_argument("-p", "--precision", dest="precision", default=5,
                        help="Specify the number of outcomes to consider when computing discrimination's precision.")
    parser.add_argument("--cue_threshold", dest="cue_threshold", default='high',
                        help="Specify whether to choose a 'high' (i.e. strict) threshold on relevant cues or "
                             "a 'low' (i.e. lax) one.")
    parser.add_argument("--token_threshold", dest="token_threshold", default='low',
                        help="Specify whether to choose a 'high' (i.e. strict) threshold on relevant tokens or "
                             "a 'low' (i.e. lax) one.")
    parser.add_argument("-l", "--longitudinal", action="store_true", dest="longitudinal",
                        help="Specify whether to use a longitudinal design (default: False).")
    args = parser.parse_args()

    # thresholds have been determined manually according to the following criteria:
    # - high thresholds were set to yield around 100 dimensions at time t100
    # - low thresholds were set to yield around 100 dimensions at time t10
    # Whenever a threshold value didn't yield any dimension because the value was too stringent, the threshold
    # was lowered until at least 1 dimension was available at all time points. First, I adjusted the threshold
    # on phonological cues' variance, then on tokens' variance. Practically, it was always possible to set
    # thresholds to yield around 100 dimensions at the specified time points, except for tokens' variance in the
    # low variance setting, where the threshold yielding around 100 dimensions at t10 quickly left the model
    # without any dimension. Therefore, these models start with considerably high dimensionalities and finish
    # with almost no dimensions.
    thresholds = {
        'aggregate_utterances_at5_c_low': 0.00000001,    # 99 cues at t10 (799 at t100)
        'aggregate_utterances_at5_c_high': 0.00000075,   # 105 cues at t100 (2 cues at t10)
        'aggregate_utterances_at5_t_low': 0.02,          # 741 tokens at t10 (3 tokens at t100)
        'aggregate_utterances_at5_t_high': 0.04,         # 125 tokens at t100 (843 tokens at t10)
        'aggregate_utterances_at25_c_low': 0.000000005,  # 98 cues at t10 (802 cues at t100)
        'aggregate_utterances_at25_c_high': 0.00000025,  # 156 cues at t100 (2 cues at t10)
        'aggregate_utterances_at25_t_low': 0.015,        # 1632 tokens at t10 (19 tokens at t100)
        'aggregate_utterances_at25_t_high': 0.033,       # 140 tokens at t100 (1731 at t10)
        'aggregate_words_at5_c_low': 0.000025,           # 101 cues at t10 (318 cues at t100)
        'aggregate_words_at5_c_high': 0.00005,           # 131 cues at t100 (40 cues at t10)
        'aggregate_words_at5_t_low': 0.0325,             # 834 tokens at t10 (3 tokens at t100)
        'aggregate_words_at5_t_high': 0.07,              # 95 tokens at t100 (1004 tokens at t10)
        'aggregate_words_at25_c_low': 0.0000075,         # 117 cues at t10 (416 cues at t100)
        'aggregate_words_at25_c_high': 0.00002,          # 124 cues at t100 (33 cues at t10)
        'aggregate_words_at25_t_low': 0.0295,            # 1850 tokens at t10 (7 tokens at t100)
        'aggregate_words_at25_t_high': 0.085             # 113 tokens at t100 (2581 at t10)
    }

    cues = ['triphones']
    outcomes = ['tokens']
    stress_marker = [True]
    boundaries = [True]
    reduced_vowels = [False]
    distances = ['correlation']

    if not os.path.exists(args.output_folder):
        os.makedirs(args.output_folder)
    summary_file = os.path.join(args.output_folder, "LDAt_summary.csv")
    # error_file = os.path.join(args.output_folder, "PoStagging_errors.csv")

    parametrizations = it.product(cues, outcomes, stress_marker, boundaries, reduced_vowels, distances)
    time_points = np.linspace(10, 100, 10) if args.longitudinal else [100]
    rows = int(np.prod([len(cues), len(outcomes), len(stress_marker), len(reduced_vowels), len(boundaries),
                        len(time_points), len(distances)]))
    summary_table = pd.DataFrame(index=np.arange(0, rows),
                                 columns=["Corpus", "Cues", "Outcomes", "Stress", "Boundaries", "Vowels",
                                          "Precision", "Time", "Distance", "tCues", "numCues", "tTokens",
                                          "numTokens", "Phon_acc", "Phon_acc_subset", "Phon_baseline",
                                          "Distr_acc", "Distr_acc_subset", "Distr_baseline"])

    ii = 0
    for parametrization in parametrizations:
        cue, outcome, stress, boundary, reduced, distance = parametrization
        uniphones = True if cue == 'uniphones' else False
        diphones = True if cue == 'diphones' else False
        triphones = True if cue == 'triphones' else False
        syllables = True if cue == 'syllables' else False
        vowels = 'reduced' if reduced else 'full'
        sm = 'stress' if stress else 'no-stress'
        bound = 'yes' if boundary else 'no'
        training = os.path.splitext(os.path.basename(args.corpus))[0]

        encoded_corpus = corpus_encoder(args.corpus, args.celex_folder, args.pos_mapping, separator='~',
                                        stress_marker=stress, reduced=reduced, uniphones=uniphones,
                                        diphones=diphones, triphones=triphones, syllables=syllables,
                                        outcomes=outcome, boundaries=boundary)
        corpus_dir = os.path.dirname(encoded_corpus)

        a, b = [0.001, 0.001] if training == 'aggregate_utterances' else [0.01, 0.01]
        c = thresholds['_'.join([training, ''.join(['at', str(args.precision)]), 'c', args.cue_threshold])]
        t = thresholds['_'.join([training, ''.join(['at', str(args.precision)]), 't', args.token_threshold])]
        file_paths = ndl(encoded_corpus, alpha=a, beta=b, lam=1, longitudinal=args.longitudinal)
        celex_dict = get_celex_dictionary(args.celex_folder, reduced=reduced)

        for idx, file_path in sorted(file_paths.items(), key=operator.itemgetter(0)):
            matrix, cues2ids, outcomes2ids = load(file_path)

            # get the column ids of all perfectly discriminated outcomes at the current time point:
            # perfectly discriminated outcomes are considered to be those whose Jaccard coefficient between
            # true phonetic cues and most active phonetic cues for the outcome is 1
            discriminated_file = os.path.join(
                corpus_dir, '.'.join(['discriminatedOutcomes', str(int(idx)),
                                      ''.join(['at', str(args.precision)]), 'json']))
            if not os.path.exists(discriminated_file):
                discriminated = find_discriminated(matrix, cues2ids, outcomes2ids, celex_dict,
                                                   stress_marker=stress, uniphones=uniphones, diphones=diphones,
                                                   triphones=triphones, syllables=syllables, boundaries=boundary,
                                                   at=int(args.precision))
                json.dump(discriminated, open(discriminated_file, 'w'))
            else:
                discriminated = json.load(open(discriminated_file, 'r'))
            print()
            print("The discriminated outcomes have been identified (file: %s)." % discriminated_file)

            accuracies = threshold_experiment(matrix, discriminated, cues_threshold=c, tokens_threshold=t)

            summary_table.loc[ii] = pd.Series({
                "Corpus": training, "Cues": cue, "Outcomes": outcome, "Stress": sm, "Boundaries": bound,
                "Vowels": vowels, "Time": int(idx), "Distance": distance, "Precision": args.precision,
                "tCues": args.cue_threshold, "numCues": accuracies[3],
                "tTokens": args.token_threshold, "numTokens": accuracies[7],
                "Phon_acc": accuracies[0], "Phon_acc_subset": accuracies[1], "Phon_baseline": accuracies[2],
                "Distr_acc": accuracies[4], "Distr_acc_subset": accuracies[5],
                "Distr_baseline": accuracies[6]})
            ii += 1

    if os.path.exists(summary_file):
        summary_table.to_csv(summary_file, sep='\t', index=False, mode="a", header=False)
    else:
        summary_table.to_csv(summary_file, sep='\t', index=False)
def cluster_words(corpus, output_folder, celex_folder, pos_mapping, distance='cosine', reduced=False,
                  outcomes='tokens', uniphones=False, diphones=False, triphones=True, syllables=False,
                  stress_marker=True, boundaries=True, at=5, nn=25, a=0.01, b=0.01, longitudinal=False):
    """
    :param corpus:          the corpus to be used for training the model
    :param output_folder:   the folder where the logfile of the clustering experiment will be saved
    :param celex_folder:    the folder containing the data from the Celex database
    :param pos_mapping:     the path to the file mapping CHILDES pos tags to Celex tags
    :param distance:        a string (either 'correlation' or 'cosine') indicating which distance metric to use
    :param reduced:         a boolean indicating whether to use reduced or full phonetic transcriptions from
                            Celex
    :param outcomes:        a string (either 'tokens' or 'lemmas') indicating which outcomes to consider for
                            learning
    :param uniphones:       a boolean indicating whether to consider uniphones as cues
    :param diphones:        a boolean indicating whether to consider diphones as cues
    :param triphones:       a boolean indicating whether to consider triphones as cues
    :param syllables:       a boolean indicating whether to consider syllables as cues
    :param stress_marker:   a boolean indicating whether to consider or discard stress information
    :param boundaries:      a boolean indicating whether to consider or discard word boundaries
    :param at:              an integer indicating how many outcomes to consider when computing discrimination's
                            precision
    :param nn:              an integer indicating how many nearest neighbors to consider when evaluating
                            clustering
    :param a:               the alpha parameter from the Rescorla-Wagner model
    :param b:               the beta parameter from the Rescorla-Wagner model
    :param longitudinal:    a boolean indicating whether to adopt a longitudinal design or not
    :return accuracies:     a dictionary mapping time indices to the clustering accuracy obtained at that time
                            point
    """

    if not os.path.exists(output_folder):
        os.makedirs(output_folder)
    plot_folder = os.path.join(output_folder, 'plots')
    if not os.path.exists(plot_folder):
        os.makedirs(plot_folder)

    encoded_corpus = corpus_encoder(corpus, celex_folder, pos_mapping, separator='~',
                                    stress_marker=stress_marker, reduced=reduced, uniphones=uniphones,
                                    diphones=diphones, triphones=triphones, syllables=syllables,
                                    outcomes=outcomes, boundaries=boundaries)
    corpus_dir = os.path.dirname(encoded_corpus)
    file_paths = ndl(encoded_corpus, alpha=a, beta=b, lam=1, longitudinal=longitudinal)
    celex_dict = get_celex_dictionary(celex_folder, reduced=reduced)

    accuracies = {}
    for idx, file_path in file_paths.items():
        logfile = make_log_file(corpus, output_folder, 'json', dist=distance, nn=nn, at=at, time=idx,
                                outcomes=outcomes, reduced=reduced, stress_marker=stress_marker,
                                boundaries=boundaries, syllables=syllables, uniphones=uniphones,
                                diphones=diphones, triphones=triphones)
        plotfile = make_log_file(corpus, plot_folder, 'pdf', dist=distance, nn=nn, at=at, time=idx,
                                 outcomes=outcomes, reduced=reduced, stress_marker=stress_marker,
                                 boundaries=boundaries, syllables=syllables, uniphones=uniphones,
                                 diphones=diphones, triphones=triphones)

        if os.path.exists(logfile):
            print()
            print("The file %s already exists, statistics for the corresponding "
                  "parametrization are loaded from it" % logfile)
            clusters = json.load(open(logfile, "r"))
        else:
            print()
            matrix, cues2ids, outcomes2ids = load(file_path)

            # get the column ids of all perfectly discriminated outcomes at the current time point:
            # perfectly discriminated outcomes are considered to be those whose Jaccard coefficient between
            # true phonetic cues and most active phonetic cues for the outcome is 1
            discriminated_file = os.path.join(corpus_dir,
                                              '.'.join(['discriminatedOutcomes', str(int(idx)), 'json']))
            if not os.path.exists(discriminated_file):
                discriminated = find_discriminated(matrix, cues2ids, outcomes2ids, celex_dict,
                                                   stress_marker=stress_marker, uniphones=uniphones,
                                                   diphones=diphones, triphones=triphones, syllables=syllables,
                                                   boundaries=boundaries, at=at)
                json.dump(discriminated, open(discriminated_file, 'w'))
            else:
                discriminated = json.load(open(discriminated_file, 'r'))

            print()
            print(strftime("%Y-%m-%d %H:%M:%S") +
                  ": Start clustering, using %s as weight matrix..." % (os.path.basename(file_path)))

            if distance == 'cosine':
                similarities, discriminated = sim.pairwise_cos(matrix, discriminated, plot_path=plotfile)
            else:
                similarities, discriminated = sim.pairwise_corr(matrix, discriminated, plot_path=plotfile)

            df = sim.sim2df(similarities, discriminated)
            similarities_file = os.path.join(corpus_dir,
                                             '.'.join(['similarities', distance, str(int(idx)), 'csv']))
            df.to_csv(similarities_file, sep='\t')

            clusters = sim.neighborhood(discriminated, similarities, nn=nn)
            json.dump(clusters, open(logfile, 'w'))
            print(strftime("%Y-%m-%d %H:%M:%S") + ": ...completed test phase.")

        accuracy, baseline_acc, h, baseline_h = sim.clustering_precision(clusters)
        accuracies[idx] = {'accuracy': accuracy, 'baseline_acc': baseline_acc,
                           'entropy': h, 'baseline_entr': baseline_h}

    return accuracies
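# A minimal usage sketch for cluster_words; the paths are hypothetical placeholders. With longitudinal=True the
# returned dictionary maps each of the 10 training points to its clustering accuracy, baseline, and entropy:
#
#     accuracies = cluster_words('corpus.json', 'results/', 'celex/', 'pos_mapping.txt', distance='cosine',
#                                triphones=True, stress_marker=True, boundaries=True, at=5, nn=25,
#                                a=0.01, b=0.01, longitudinal=True)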
def main():
    parser = argparse.ArgumentParser(description="Assess whether words from the same category cluster together, "
                                                 "first considering their sound patterns and then how they "
                                                 "correlate to each other based on their contexts of occurrence")
    parser.add_argument("-c", "--corpus", required=True, dest="corpus",
                        help="Specify the path to the training corpus (encoded as .json).")
    parser.add_argument("-C", "--celex_folder", required=True, dest="celex_folder",
                        help="Specify the folder where the Celex data are located.")
    parser.add_argument("-O", "--output_folder", required=True, dest="output_folder",
                        help="Specify the path of the folder where the logfiles will be stored together with "
                             "the summary tables.")
    parser.add_argument("-M", "--pos_mapping", required=True, dest="pos_mapping",
                        help="Specify the path of the file containing the mapping from CHILDES to Celex PoS tags.")
    parser.add_argument("-p", "--precision", dest="precision", default=5,
                        help="Specify the number of outcomes to consider when computing discrimination's precision.")
    parser.add_argument("-l", "--longitudinal", action="store_true", dest="longitudinal",
                        help="Specify whether to use a longitudinal design (default: False).")
    args = parser.parse_args()

    cues = ['triphones']
    outcomes = ['tokens']
    stress_marker = [True]
    boundaries = [True]
    reduced_vowels = [False]
    distances = ['correlation']
    number_of_cues = [100, 500, 1000]
    number_of_tokens = [50, 250, 500]

    if not os.path.exists(args.output_folder):
        os.makedirs(args.output_folder)
    summary_file = os.path.join(args.output_folder, "LDAk_summary.csv")
    # error_file = os.path.join(args.output_folder, "PoStagging_errors.csv")

    parametrizations = it.product(cues, outcomes, stress_marker, boundaries, reduced_vowels, distances,
                                  number_of_cues, number_of_tokens)
    time_points = np.linspace(10, 100, 10) if args.longitudinal else [100]
    rows = int(np.prod([len(cues), len(outcomes), len(stress_marker), len(reduced_vowels), len(boundaries),
                        len(time_points), len(distances), len(number_of_cues), len(number_of_tokens)]))
    summary_table = pd.DataFrame(index=np.arange(0, rows),
                                 columns=["Corpus", "Cues", "Outcomes", "Stress", "Boundaries", "Vowels",
                                          "Precision", "Time", "Distance", "numCues", "numTokens", "Phon_acc",
                                          "Phon_acc_subset", "Phon_baseline", "Distr_acc", "Distr_acc_subset",
                                          "Distr_baseline"])

    ii = 0
    for parametrization in parametrizations:
        print(parametrization)
        cue, outcome, stress, boundary, reduced, distance, how_many_cues, how_many_tokens = parametrization
        uniphones = True if cue == 'uniphones' else False
        diphones = True if cue == 'diphones' else False
        triphones = True if cue == 'triphones' else False
        syllables = True if cue == 'syllables' else False
        vowels = 'reduced' if reduced else 'full'
        sm = 'stress' if stress else 'no-stress'
        bound = 'yes' if boundary else 'no'
        training = os.path.splitext(os.path.basename(args.corpus))[0]

        encoded_corpus = corpus_encoder(args.corpus, args.celex_folder, args.pos_mapping, separator='~',
                                        stress_marker=stress, reduced=reduced, uniphones=uniphones,
                                        diphones=diphones, triphones=triphones, syllables=syllables,
                                        outcomes=outcome, boundaries=boundary)
        corpus_dir = os.path.dirname(encoded_corpus)

        a, b = [0.001, 0.001] if training == 'aggregate_utterances' else [0.01, 0.01]
        file_paths = ndl(encoded_corpus, alpha=a, beta=b, lam=1, longitudinal=args.longitudinal)
        celex_dict = get_celex_dictionary(args.celex_folder, reduced=reduced)

        for idx, file_path in file_paths.items():
            matrix, cues2ids, outcomes2ids = load(file_path)

            # get the column ids of all perfectly discriminated outcomes at the current time point:
            # perfectly discriminated outcomes are considered to be those whose Jaccard coefficient between
            # true phonetic cues and most active phonetic cues for the outcome is 1
            discriminated_file = os.path.join(
                corpus_dir, '.'.join(['discriminatedOutcomes', str(int(idx)),
                                      ''.join(['at', str(args.precision)]), 'json']))
            if not os.path.exists(discriminated_file):
                discriminated = find_discriminated(matrix, cues2ids, outcomes2ids, celex_dict,
                                                   stress_marker=stress, uniphones=uniphones, diphones=diphones,
                                                   triphones=triphones, syllables=syllables, boundaries=boundary,
                                                   at=int(args.precision))
                json.dump(discriminated, open(discriminated_file, 'w'))
            else:
                discriminated = json.load(open(discriminated_file, 'r'))
            print()
            print("The discriminated outcomes have been identified (file: %s)." % discriminated_file)

            accuracies = subset_experiment(matrix, discriminated, how_many_cues=how_many_cues,
                                           how_many_tokens=how_many_tokens)
            summary_table.loc[ii] = pd.Series({
                "Corpus": training, "Cues": cue, "Outcomes": outcome, "Stress": sm, "Boundaries": bound,
                "Vowels": vowels, "Time": int(idx), "Distance": distance, "Precision": args.precision,
                "numCues": how_many_cues, "numTokens": how_many_tokens,
                "Phon_acc": accuracies[0], "Phon_acc_subset": accuracies[1], "Phon_baseline": accuracies[2],
                "Distr_acc": accuracies[3], "Distr_acc_subset": accuracies[4],
                "Distr_baseline": accuracies[5]})
            ii += 1

    if os.path.exists(summary_file):
        summary_table.to_csv(summary_file, sep='\t', index=False, mode="a", header=False)
    else:
        summary_table.to_csv(summary_file, sep='\t', index=False)
def main():
    parser = argparse.ArgumentParser(description="Assess whether words from the same category cluster together "
                                                 "on the basis of the sound sequences they consist of.")
    parser.add_argument("-c", "--corpus", required=True, dest="corpus",
                        help="Specify the path to the training corpus (encoded as .json).")
    parser.add_argument("-C", "--Celex_folder", required=True, dest="celex_folder",
                        help="Specify the folder where the Celex data are located.")
    parser.add_argument("-O", "--output_folder", required=True, dest="output_folder",
                        help="Specify the path of the folder where the logfiles will be stored together with "
                             "the summary tables.")
    parser.add_argument("-M", "--pos_mapping", required=True, dest="pos_mapping",
                        help="Specify the path of the file containing the mapping from CHILDES to Celex PoS tags.")
    parser.add_argument("-p", "--precision", dest="precision", default=5,
                        help="Specify the number of outcomes to consider when computing discrimination's precision.")
    parser.add_argument("-l", "--longitudinal", action="store_true", dest="longitudinal",
                        help="Specify whether to use a longitudinal design (default: False).")
    args = parser.parse_args()

    at = int(args.precision)

    if not os.path.exists(args.output_folder):
        os.makedirs(args.output_folder)
    summary_file = os.path.join(args.output_folder, "lexicalDevelopment_summary.csv")

    a, b = [0.01, 0.01]
    reduced_vowels = [False]
    boundaries = [True]
    outcomes = ['tokens']
    cues = ['triphones', 'syllables']
    stress_marker = [True]
    time_points = np.linspace(10, 100, 10) if args.longitudinal else [100]
    rows = int(np.prod([len(cues), len(outcomes), len(stress_marker), len(reduced_vowels), len(boundaries),
                        len(time_points)]))
    summary_table = pd.DataFrame(index=np.arange(0, rows),
                                 columns=["Corpus", "Boundaries", "Cues", "Outcomes", "Stress", "Vowels",
                                          "Time", "At", "Discriminated", "@".join(["Precision", str(at)]),
                                          "Jaccard@1", "Total"])

    row_id = 0
    parametrizations = it.product(reduced_vowels, boundaries, outcomes, cues, stress_marker)
    for parametrization in parametrizations:
        r, boundary, outcome, cue, marker = parametrization
        uniphones = True if cue == 'uniphones' else False
        diphones = True if cue == 'diphones' else False
        triphones = True if cue == 'triphones' else False
        syllables = True if cue == 'syllables' else False
        vowels = 'reduced' if r else 'full'
        sm = 'stress' if marker else 'no-stress'
        bound = 'yes' if boundary else 'no'
        training = os.path.splitext(os.path.basename(args.corpus))[0]

        celex_dict = get_celex_dictionary(args.celex_folder, reduced=r)
        encoded_corpus = corpus_encoder(args.corpus, args.celex_folder, args.pos_mapping, separator='~',
                                        stress_marker=marker, reduced=r, outcomes=outcome, boundaries=boundary,
                                        uniphones=uniphones, diphones=diphones, triphones=triphones,
                                        syllables=syllables)
        cumulative_vocabulary = get_cumulative_vocabulary(encoded_corpus, time_points)
        print()
        print("The cumulative vocabulary for the file %s has been estimated" % encoded_corpus)
        print()

        file_paths = ndl(encoded_corpus, alpha=a, beta=b, lam=1, longitudinal=args.longitudinal)
        for idx, file_path in file_paths.items():
            matrix, cues2ids, outcomes2ids = load(file_path)

            # get the Jaccard coefficient for each outcome and select those with a coefficient of 1, meaning
            # that the model would choose all and only the correct cues when expressing the outcome; get the
            # number of such outcomes
            print()
            jaccard_coefficients = jaccard(matrix, cues2ids, outcomes2ids, celex_dict, stress_marker=marker,
                                           uniphone=uniphones, diphone=diphones, triphone=triphones,
                                           syllable=syllables, boundaries=boundary)
            jaccard_one = {}
            for token in outcomes2ids:
                if jaccard_coefficients[token] == 1:
                    jaccard_one[token] = outcomes2ids[token]
            n_jaccard = len(jaccard_one)
            print()

            # get the outcomes that are correctly discriminated given the cues they consist of: in detail, take
            # an outcome, encode it in its phonetic cues, check which outcomes are most active given such cues,
            # and check whether the correct one is among the top ones (how many is indicated by the parameter
            # 'at'); store all outcomes where the correct one is among the top active ones given the cues in it
            print()
            precise = precision_at(matrix, outcomes2ids, cues2ids, celex_dict, stress_marker=marker,
                                   uniphone=uniphones, diphone=diphones, triphone=triphones,
                                   syllable=syllables, boundaries=boundary, at=at)
            n_precise = len(precise)
            print()

            # repeat, but only for the outcomes with a Jaccard coefficient of 1, to quantify two-way
            # discrimination
            print()
            discriminated = precision_at(matrix, jaccard_one, cues2ids, celex_dict, stress_marker=marker,
                                         uniphone=uniphones, diphone=diphones, triphone=triphones,
                                         syllable=syllables, boundaries=boundary, at=at)
            n_discriminated = len(discriminated)
            print()

            vocabulary_estimate = cumulative_vocabulary[int(idx)]
            summary_table.loc[row_id] = pd.Series({"Corpus": training, "Cues": cue, "Outcomes": outcome,
                                                   "Stress": sm, "Boundaries": bound, "Vowels": vowels,
                                                   "Time": int(idx), "At": at,
                                                   "Discriminated": n_discriminated,
                                                   "Total": vocabulary_estimate,
                                                   "@".join(["Precision", str(at)]): n_precise,
                                                   "Jaccard@1": n_jaccard})
            row_id += 1

    if os.path.exists(summary_file):
        summary_table.to_csv(summary_file, sep='\t', index=False, mode="a", header=False)
    else:
        summary_table.to_csv(summary_file, sep='\t', index=False)
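# The Jaccard coefficient used above is the standard set-overlap measure between the cues an outcome truly
# consists of and its most active cues in the weight matrix; the real jaccard() helper derives both sets from
# the matrix and the Celex transcriptions, but the coefficient itself reduces to this:
def jaccard_coefficient(true_cues, most_active_cues):
    # |intersection| / |union|: equals 1 only when the two sets coincide exactly
    true_cues, most_active_cues = set(true_cues), set(most_active_cues)
    return len(true_cues & most_active_cues) / len(true_cues | most_active_cues)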
def tag_words(input_file, test_set, celex_dir, pos_mapping, output_folder, method='freq', evaluation='count',
              k=50, flush=0, threshold=0, separator='~', reduced=False, outcomes='tokens', boundaries=True,
              uniphones=True, diphones=False, triphones=False, syllable=False, stress_marker=False,
              alpha=0.01, beta=0.01, lam=1.0, longitudinal=False, at=5):
    """
    :param input_file:      a .json file containing transcripts of child-caregiver interactions extracted from
                            the CHILDES database. The json file consists of two lists of lists, of the same
                            length, both containing utterances but encoded differently. The first encodes each
                            utterance as a list of tokens; the second encodes each utterance as a list of lemmas
                            and Part-of-Speech tags, joined by a vertical bar ('|')
    :param test_set:        a dictionary mapping the file name to:
                            - test_set['filename']: the basename of the file
                            - test_set['items']: the set of phonological forms to be categorized, complete with
                              the target PoS tag (phonological form and PoS tag are separated by a vertical
                              bar ('|'))
    :param celex_dir:       a string specifying the path to the Celex directory
    :param pos_mapping:     a .txt file mapping CHILDES PoS tags to CELEX tags
    :param output_folder:   the path to the folder where the logfiles will be saved
    :param method:          a string indicating the way in which the function looks at top active outcomes; two
                            options are available:
                            - 'freq' makes the function compute the distribution of PoS tags over the k top
                              active nodes (see the explanation of the parameter k) and rank PoS tags according
                              to their frequency among the top active cues
                            - 'sum' makes the function compute the sum of activation from all outcomes belonging
                              to a given PoS tag within the k top active outcomes given the input cues, and rank
                              PoS tags according to their total activation among the top active cues
    :param evaluation:      a string indicating how to compare baseline activations to item-triggered ones; two
                            options are available:
                            - 'count', simply tag the test item with the PoS tag that either was more frequent
                              or had highest summed activation within the top active outcomes; frequency or
                              activation are returned and can be correlated to reaction times
                            - 'distr', compare the frequency counts or summed activations generated by a
                              specific test item to the frequency counts or summed activations at baseline and
                              tag the test item with the PoS tag receiving highest support by the change in the
                              distribution of frequencies or summed activations (a statistic is returned,
                              Chi-squared for frequency distributions and t-test for summed activations, whose
                              value can be correlated to reaction times)
    :param k:               an integer specifying how many elements to consider from the baseline activations
                            and the activations triggered by a specific test item. By default, the top 50
                            outcomes are considered, and compared according to the chosen combination of method
                            and evaluation
    :param flush:           specify whether (and how many) top active outcomes at baseline to flush away from
                            subsequent computations. It may be the case that whatever the input cues, the same
                            high frequency outcomes come out as being the most active. It may then make sense to
                            not consider them when evaluating the distribution of lexical categories over the
                            most active outcomes given an input item
    :param threshold:       the minimum activation of an outcome to be considered in the list of top activated
                            neighbors; the default is 0 and shouldn't be lowered, but can be increased
    :param separator:       the character that separates the word baseform from its PoS tag in the input corpus
    :param reduced:         a boolean specifying whether reduced phonological forms should be extracted from
                            Celex whenever possible (if set to True) or if standard phonological forms should be
                            preserved (if False)
    :param outcomes:        a string indicating which outcomes to use, whether 'tokens' (default) or 'lemmas'
    :param boundaries:      a boolean specifying whether word boundaries are to be considered when training on
                            full utterances
    :param uniphones:       a boolean indicating whether single phonemes are to be considered while encoding
                            input utterances
    :param diphones:        a boolean indicating whether sequences of two phonemes are to be considered while
                            encoding input utterances
    :param triphones:       a boolean indicating whether sequences of three phonemes are to be considered while
                            encoding input utterances
    :param syllable:        a boolean indicating whether syllables are to be considered while encoding input
                            utterances
    :param stress_marker:   a boolean indicating whether stress markers from the phonological representations of
                            Celex need to be preserved or can be discarded
    :param alpha:           a number indicating cue salience. For simplicity, we assume that every cue has the
                            same salience, so changing the value of this parameter does not affect the relative
                            strength of cue-outcome associations but only their absolute magnitude
    :param beta:            a number indicating the learning rate for positive and negative situations. Again,
                            we make the simplifying assumption that our simulated learners are equally affected
                            by positive and negative feedback. Changing the beta value can have a significant
                            impact on the learning outcome, but 0.1 is a standard choice for this model. If the
                            number of learning trials or the number of different cues in a learning trial are
                            very large, both beta and alpha need to be lowered considerably
    :param lam:             the maximum amount of association that an outcome can receive from all the cues. It
                            simply acts as a scaling factor, so changing its value has the same effect as
                            changing alpha
    :param longitudinal:    a boolean specifying whether to work in a longitudinal setting or not
    :param at:              the number of top active outcomes to consider to compute precision
    :return log_dicts:      a dictionary mapping each time index to the detailed categorization log produced at
                            that time point
    :return accuracies:     a dictionary mapping the categorization accuracy on the PoS tagging experiment to
                            each time index (1 if the longitudinal parameter is set to False, 10 if it's set to
                            True)
    :return entropies:      a dictionary mapping the normalized entropy of the distribution of the PoS tags
                            assigned by the model to each time index (1 if the longitudinal parameter is set to
                            False, 10 if it's set to True)
    :return most_frequents: a dictionary mapping the PoS tag that was applied the most by the model to each time
                            index (1 if the longitudinal parameter is set to False, 10 if it's set to True)
    :return frequencies:    a dictionary mapping the frequency count of the most frequent PoS tag applied by the
                            model, to each time index (1 if the longitudinal parameter is set to False, 10 if
                            it's set to True)
    """

    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    encoded_corpus = corpus_encoder(input_file, celex_dir, pos_mapping, separator=separator,
                                    uniphones=uniphones, diphones=diphones, triphones=triphones,
                                    syllables=syllable, stress_marker=stress_marker, reduced=reduced,
                                    outcomes=outcomes, boundaries=boundaries)
    file_paths = ndl(encoded_corpus, alpha=alpha, beta=beta, lam=lam, longitudinal=longitudinal)

    # for each test item, compute the items from the matrix of weights that are most activated given the cues in
    # the item, get the PoS tag that is most present among the most active lexical nodes and check whether the
    # predicted PoS tag matches the gold-standard one provided along the test item. Return a global score
    # indicating the accuracy on the test set
    accuracies = {}
    entropies = {}
    most_frequents = {}
    frequencies = {}
    log_dicts = {}
    celex_dict = get_celex_dictionary(celex_dir, reduced=reduced)
    for idx, file_path in file_paths.items():
        logfile = make_log_file(input_file, test_set['filename'], output_folder, method, evaluation, flush, k,
                                at, idx, reduced=reduced, uniphones=uniphones, diphones=diphones,
                                triphones=triphones, syllables=syllable, stress_marker=stress_marker,
                                outcomes=outcomes, boundaries=boundaries)
        if os.path.exists(logfile):
            print()
            print("The file %s already exists, statistics for the corresponding parametrization "
                  "are loaded from it" % logfile)
            log_dict = json.load(open(logfile, "r"))
        else:
            print()
            matrix, cues2ids, outcomes2ids = load(file_path)

            # get the column ids of all perfectly discriminated outcomes at the current time point:
            # perfectly discriminated outcomes are considered to be those
            # - whose Jaccard coefficient between true phonetic cues and most active phonetic cues for the
            #   outcome is 1
            # - and that appear in the top active outcomes given the cues they consist of
            corpus_folder = os.path.dirname(encoded_corpus)
            discriminated_file = os.path.join(corpus_folder,
                                              '.'.join(['discriminatedOutcomes', str(int(idx)), 'json']))
            if not os.path.exists(discriminated_file):
                discriminated = find_discriminated(matrix, cues2ids, outcomes2ids, celex_dict,
                                                   stress_marker=stress_marker, uniphones=uniphones,
                                                   diphones=diphones, triphones=triphones, syllables=syllable,
                                                   boundaries=boundaries, at=at)
                # cache the discriminated outcomes for reuse, as the other experiment scripts do
                json.dump(discriminated, open(discriminated_file, 'w'))
            else:
                discriminated = json.load(open(discriminated_file, 'r'))

            print()
            print(strftime("%Y-%m-%d %H:%M:%S") +
                  ": Start test phase, using %s as weight matrix and %s as test set..."
                  % (os.path.basename(file_path), os.path.basename(test_set['filename'])))
            log_dict = categorize(test_set['items'], matrix, cues2ids, discriminated, method=method,
                                  evaluation=evaluation, flush=flush, k=k, threshold=threshold,
                                  stress_marker=stress_marker, syllables=syllable, uniphones=uniphones,
                                  diphones=diphones, triphones=triphones, boundaries=boundaries)
            json.dump(log_dict, open(logfile, 'w'))
            print(strftime("%Y-%m-%d %H:%M:%S") + ": ...completed test phase.")

        f1, h, pos, freq = compute_summary_statistics(log_dict)
        accuracies[idx] = f1
        entropies[idx] = h
        most_frequents[idx] = pos
        frequencies[idx] = freq
        log_dicts[idx] = log_dict
        print("Accuracy: %0.5f" % f1)
        print()

    return log_dicts, accuracies, entropies, most_frequents, frequencies
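# The alpha, beta, and lam parameters documented above belong to the Rescorla-Wagner learning rule that ndl()
# applies to each learning event. The sketch below is illustrative only, written under the simplifying
# assumptions stated in the docstring (equal salience for every cue, one rate for positive and negative
# evidence); the project's ndl() implementation may differ in detail.
import numpy as np  # assumed available in this module


def rescorla_wagner_update(weights, cue_idxs, outcome_idxs, alpha=0.01, beta=0.01, lam=1.0):
    # activation of every outcome given the cues present in this learning event
    total_activation = weights[cue_idxs, :].sum(axis=0)
    # outcomes that occur move toward lam, absent outcomes toward 0...
    targets = np.zeros(weights.shape[1])
    targets[outcome_idxs] = lam
    # ...and every present cue receives the same update: alpha * beta * (target - activation)
    weights[cue_idxs, :] += alpha * beta * (targets - total_activation)
    return weights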
def write_learning_events(corpus_file, output_folder, celex_dir, pos_dict, separator='~', uni_phones=False,
                          di_phones=False, tri_phones=True, syllable=False, stress_marker=True,
                          boundaries=False):
    """
    :param corpus_file:     a path pointing to the .json object to be used as input corpus, consisting of two
                            aligned lists of lists, meaning that a second-order list in each first-order list
                            refers to the same utterance; the first list contains utterances encoded as lists of
                            tokens, the second list contains utterances encoded as lists of lemmas and PoS tags
    :param output_folder:   the path to a folder where the output files for cues and outcomes will be written to
    :param celex_dir:       the path to the directory where the Celex dictionary is to be found (if no
                            dictionary is found at the given location, one is built on the fly)
    :param pos_dict:        the path to the file containing the mapping from CHILDES PoS tags to corresponding
                            Celex PoS tags
    :param separator:       a string indicating the character separating lemmas from PoS tags in the input
                            corpus
    :param uni_phones:      a boolean indicating whether uni-phones are relevant phonetic cues
    :param di_phones:       a boolean indicating whether di-phones are relevant phonetic cues
    :param tri_phones:      a boolean indicating whether tri-phones are relevant phonetic cues
    :param syllable:        a boolean indicating whether syllables are relevant phonetic cues
    :param stress_marker:   a boolean indicating whether stress markers from the Celex phonetic transcriptions
                            need to be preserved or can be discarded
    :param boundaries:      a boolean indicating whether to preserve or discard word boundaries
    """

    celex_dict = get_celex_dictionary(celex_dir, reduced=False)
    tokens2identifiers = tokens2ids(celex_dict)
    pos_dict = get_pos_mapping(pos_dict)
    corpus = json.load(open(corpus_file, 'r+'))

    # use the path of the input file to generate the path of the output file, adding encoding information to the
    # input filename; print to standard output a summary of all the encoding parameters
    input_filename, extension = os.path.splitext(corpus_file)
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)
    cue_file = os.path.join(output_folder, "cues.txt")
    outcome_file = os.path.join(output_folder, "outcomes.txt")

    # check whether the output files corresponding to the desired parameters already exist and stop if they do
    if os.path.isfile(cue_file) and os.path.isfile(outcome_file):
        print()
        print("The desired encoded version of the input corpus '%s' already exists at files '%s' and '%s'." %
              (os.path.basename(corpus_file), os.path.basename(cue_file), os.path.basename(outcome_file)))
        return cue_file, outcome_file
    else:
        print(strftime("%Y-%m-%d %H:%M:%S") +
              ": Started encoding utterances from input corpus '%s'" % corpus_file)

        # get the corpus recoded into phonological cues and lexical outcomes
        cues, outcomes = encode_corpus(corpus, celex_dict, tokens2identifiers, pos_dict, separator=separator,
                                       uni_phones=uni_phones, di_phones=di_phones, tri_phones=tri_phones,
                                       syllable=syllable, stress_marker=stress_marker, boundaries=boundaries)
        print()
        print(strftime("%Y-%m-%d %H:%M:%S") +
              ": Finished encoding utterances from input corpus '%s'" % corpus_file)
        print()

        corpus2txt(cues, cue_file)
        print()
        print(strftime("%Y-%m-%d %H:%M:%S") + ": Written encoded cues to '%s'" % cue_file)
        print()

        corpus2txt(outcomes, outcome_file)
        print()
        print(strftime("%Y-%m-%d %H:%M:%S") + ": Written encoded outcomes to '%s'" % outcome_file)
        print()

        # return the output paths in this branch too, for consistency with the early exit above
        return cue_file, outcome_file
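# A minimal usage sketch for write_learning_events; the paths are hypothetical placeholders. The call writes
# cues.txt and outcomes.txt to the output folder and returns their paths:
#
#     cue_file, outcome_file = write_learning_events('corpus.json', 'learning_events/', 'celex/',
#                                                    'pos_mapping.txt', separator='~', tri_phones=True,
#                                                    stress_marker=True, boundaries=False)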