def get_words_and_contexts(tokens, filtered_corpus, min_length, size, boundaries=True,
                           lemmas=None, pos_dict=None, bigrams=True, trigrams=True):
    """
    :param tokens:          a list of strings
    :param filtered_corpus: a list of lists, containing strings; all utterances containing legitimate words are added
                            to this list, which will contain only the utterances from the corpus that meet the input
                            criteria
    :param min_length:      the minimum number of strings in a clean utterance for it to be considered legitimate
    :param size:            the size of the window around each target word, in which contexts are collected
    :param boundaries:      a boolean indicating whether to consider utterance boundaries as legitimate contexts or not
    :param lemmas:          a list of strings of the same length as tokens; if one is passed, tokens are assumed to be
                            simple strings, not carrying any PoS information, which is taken from the lemmas, which in
                            turn are supposed to consist of a word and a PoS tag, joined by a tilde ('~'); if no lemmas
                            list is passed, tokens are taken to carry PoS tag information, also in the form word~PoS
    :param pos_dict:        a dictionary mapping CHILDES tags to custom ones; the default is None, meaning that
                            everything is left unchanged; if a dictionary is passed, all words tagged with tags not in
                            the dictionary are discarded from further processing, and the original tags are replaced
                            with the custom ones
    :param bigrams:         a boolean specifying whether bigrams, i.e. contexts consisting of a lexical item and an
                            empty slot, such as the_X or X_of, are to be collected
    :param trigrams:        a boolean specifying whether trigrams, i.e. contexts consisting of two lexical items and an
                            empty slot, such as in_the_X, X_of_you, or the_X_of, are to be collected
    :return words:          a set with the words from the current utterance
    :return contexts:       a set with the contexts from the current utterance
    """

    utterance = clean_utterance(tokens, lemmas=lemmas, pos_dict=pos_dict, boundaries=boundaries)

    words = set()
    contexts = set()
    idx = 1 if boundaries else 0
    last_idx = len(utterance) - 1 if boundaries else len(utterance)

    # if at least one valid word was present in the utterance and survived the filtering stage, collect all possible
    # contexts from the utterance, as specified by the input granularities
    if len(utterance) > min_length:
        filtered_corpus.append(utterance)
        while idx < last_idx:
            # using every word as pivot, collect all contexts around the pivot word and store both contexts and words
            context_window = construct_window(utterance, idx, size)
            current_contexts = get_ngrams(context_window, bigrams=bigrams, trigrams=trigrams)
            words.add(utterance[idx])
            for context in current_contexts:
                contexts.add(context)
            idx += 1

    # return the set of unique words and unique contexts derived from the utterance provided as input
    return words, contexts
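
# Illustrative sketch only (not part of the pipeline): construct_window and get_ngrams are defined elsewhere in
# the project; this minimal, self-contained approximation only shows the kind of contexts the docstring above
# describes (an empty slot marked X, items joined by underscores, e.g. the_X, X_of, in_the_X). The helper name,
# the padding symbol, and the exact context encoding are assumptions, not the project's actual implementation.
def _sketch_contexts_around_pivot(utt, idx, size=2, bigrams=True, trigrams=True):
    # pad the utterance so that windows near its edges still have the requested size
    padded = ['#pad'] * size + list(utt) + ['#pad'] * size
    i = idx + size
    left, right = padded[i - size:i], padded[i + 1:i + 1 + size]
    contexts = set()
    if bigrams:
        contexts.add('_'.join([left[-1], 'X']))             # e.g. the_X
        contexts.add('_'.join(['X', right[0]]))             # e.g. X_of
    if trigrams:
        contexts.add('_'.join(left[-2:] + ['X']))           # e.g. in_the_X
        contexts.add('_'.join(['X'] + right[:2]))           # e.g. X_of_you
        contexts.add('_'.join([left[-1], 'X', right[0]]))   # e.g. the_X_of
    return contexts

# example: contexts around the pivot 'dog' in a toy utterance with boundary markers
# _sketch_contexts_around_pivot(['#start', 'the', 'dog', 'of', 'you', '#end'], 2)
# -> {'the_X', 'X_of', '#start_the_X', 'X_of_you', 'the_X_of'}
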
def get_useful_contexts(input_corpus, pos_dict=None, k=1, boundaries=True, bigrams=True, trigrams=True,
                        pred=True, div=True, freq=True, averages=True):
    """
    :param input_corpus: the path to a .txt file containing CHILDES transcripts, with one utterance per line and
                         words separated by white spaces. The first element of each utterance is the capitalized
                         label of the speaker, as found in CHILDES. The second element is a dummy word marking the
                         beginning of the utterance, #start; the last element is a dummy word marking the end of the
                         utterance, #end. Each word is paired with its Part-of-Speech tag, the two separated by a
                         tilde, word~PoS.
    :param pos_dict:     a dictionary mapping CHILDES PoS tags to custom tags; any CHILDES tag that doesn't appear as
                         a key in the dictionary is assumed to be irrelevant, and words labeled with those tags are
                         discarded; the default None means that every input word is considered
    :param k:            the threshold determining which contexts are salient: every context whose score is higher
                         than k is considered salient. The default is 1.
    :param boundaries:   a boolean indicating whether utterance boundaries are to be considered as contexts or not
    :param bigrams:      a boolean indicating whether bigrams are to be collected
    :param trigrams:     a boolean indicating whether trigrams are to be collected
    :param pred:         a boolean indicating whether the average predictability of a context given the words it
                         occurs with is a relevant piece of information in deciding how important a context is
    :param div:          a boolean indicating whether the lexical diversity of a context, i.e. the number of
                         different words that occur in it, is a relevant piece of information in deciding how
                         important a context is
    :param freq:         a boolean indicating whether a context's frequency count is a relevant piece of information
                         in deciding how important a context is
    :param averages:     a boolean specifying whether frequency, diversity, and predictability values for each
                         context are to be compared to running averages
    :return contexts:    a dictionary mapping contexts to their relevance score
    """

    # set the size of the window around the target word where contexts are collected
    size = 2 if trigrams else 1

    # set the minimum length of a legitimate utterance: 0 if utterance boundaries are not considered and 2 if they
    # are, since there will always be at least the two boundary markers
    min_length = 2 if boundaries else 0

    # read in the corpus and initialize a list where to store utterances that survived the cleaning step (i.e.
    # utterances that contained at least one legitimate word that is not a boundary marker, if boundaries are
    # considered)
    corpus = read_txt_corpus_file(input_corpus)
    filtered_corpus = []
    words = set()
    contexts = set()

    # collect all words and contexts from the corpus, getting rid of PoS tags so that homographs are not
    # disambiguated even if they are tagged differently
    for line in corpus:
        # get rid of all utterances uttered by the child and clean child-directed utterances
        if line[0] != 'CHI':
            del line[0]
            w, c = get_words_and_contexts(line, filtered_corpus, min_length, size, boundaries=boundaries,
                                          pos_dict=pos_dict, bigrams=bigrams, trigrams=trigrams)
            for el in w:
                words.add(strip_pos(el, i=1))
            for el in c:
                contexts.add(strip_pos(el, i=1, context=True))

    # map words and contexts to numerical indices
    words2ids = sort_items(words)
    contexts2ids = sort_items(contexts)
    print(strftime("%Y-%m-%d %H:%M:%S") + ": I collected all words and contexts in the input corpus.")
    print()

    total_utterances = len(filtered_corpus)
    check_points = {np.floor(total_utterances / float(100) * n): n for n in np.linspace(5, 100, 20)}

    # initialize an empty matrix with as many rows as there are words and as many columns as there are contexts in
    # the input corpus, making sure cells store floats and not integers
    co_occurrences = np.zeros([len(words2ids), len(contexts2ids)]).astype(float)
    word_frequencies = np.zeros([len(words2ids)])

    line_idx = 0
    for utterance in filtered_corpus:
        # set the first and last index of the utterance, depending on whether utterance boundaries are considered
        idx = 1 if boundaries else 0
        last_idx = len(utterance) - 1 if boundaries else len(utterance)
        while idx < last_idx:
            current_word = utterance[idx]
            # collect all contexts for the current pivot word
            context_window = construct_window(utterance, idx, size, splitter='~')
            current_contexts = get_ngrams(context_window, bigrams=bigrams, trigrams=trigrams)
            row_id = words2ids[current_word.split('~')[1]]
            word_frequencies[row_id] += 1
            for context in current_contexts:
                # store the co-occurrence count between the word and context being considered and update
                # their salience score
                col_id = contexts2ids[context]
                co_occurrences[row_id, col_id] += 1
            idx += 1

        # every 5% of the input corpus, print progress and store summary statistics: nothing is done with them here,
        # but a plot can be made, or the values returned
        line_idx += 1
        if line_idx in check_points:
            print('Line ' + str(line_idx) + ' has been processed at ' + str(datetime.now()) + '.')

    if averages:
        avg_freq, avg_div, avg_pred = get_averages(co_occurrences, word_frequencies)
    else:
        avg_freq, avg_div, avg_pred = [None, None, None]

    contexts_scores = compute_context_score(co_occurrences, contexts2ids, word_frequencies, pred=pred, div=div,
                                            freq=freq, avg_pred=avg_pred, avg_freq=avg_freq, avg_lex_div=avg_div)

    # only return contexts whose salience score is higher than the threshold k
    return dict((key, value) for key, value in contexts_scores.items() if value > k)
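
# Illustrative sketch only: the actual scoring in compute_context_score is defined elsewhere in the project. This
# minimal, self-contained example (hypothetical helper _sketch_context_scores, placeholder score combination) only
# shows the kind of quantities the pred, div, and freq flags refer to, computed from a toy co-occurrence matrix,
# and how the final thresholding on k works, mirroring the last line of get_useful_contexts.
import numpy as np

def _sketch_context_scores(co_occurrences, word_frequencies, context_ids, k=1):
    scores = {}
    for context, col in context_ids.items():
        column = co_occurrences[:, col]
        freq = column.sum()                    # how often the context occurred
        div = np.count_nonzero(column)         # with how many different words it occurred
        with np.errstate(divide='ignore', invalid='ignore'):
            # average proportion of a word's occurrences spent in this context, as a stand-in for predictability
            pred = np.nanmean(np.where(word_frequencies > 0, column / word_frequencies, np.nan))
        scores[context] = freq + div + pred    # placeholder combination, not the project's real formula
    # keep only contexts whose score exceeds the threshold k
    return {c: s for c, s in scores.items() if s > k}

# example: two words (rows) and three contexts (columns)
# _sketch_context_scores(np.array([[2., 0., 1.], [1., 1., 0.]]), np.array([3., 2.]),
#                        {'the_X': 0, 'X_of': 1, 'in_the_X': 2}, k=2)
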
def create_vector_space(input_corpus, salient_contexts, pos_dict=None, targets='', to_ignore='',
                        boundaries=True, bigrams=True, trigrams=True):
    """
    :param input_corpus:      the path to a .txt file containing CHILDES transcripts, with one utterance per line and
                              words separated by white spaces. The first element of each utterance is the capitalized
                              label of the speaker, as found in CHILDES. The second element is a dummy word marking
                              the beginning of the utterance, #start; the last element is a dummy word marking the end
                              of the utterance, #end. Each word is paired with its Part-of-Speech tag, the two
                              separated by a tilde, word~PoS.
    :param salient_contexts:  a set containing the contexts determined to be salient
    :param pos_dict:          a dictionary mapping CHILDES PoS tags to custom tags; any CHILDES tag that doesn't
                              appear as a key in the dictionary is assumed to be irrelevant, and words labeled with
                              those tags are discarded; the default None means that every word is considered
    :param targets:           the path to a .txt file containing the target words, i.e. the words for which
                              co-occurrence counts will be collected. The file contains one word per line, with words
                              joined to the corresponding PoS tag by a tilde. By default, no file is passed and the
                              function considers all words as targets.
    :param to_ignore:         the path to a .txt file containing the words to be ignored. The file contains one word
                              per line, with words joined to the corresponding PoS tag by a tilde. By default, no file
                              is passed and the function considers all words, without ignoring any.
    :param boundaries:        a boolean indicating whether utterance boundaries are to be considered as contexts or not
    :param bigrams:           a boolean indicating whether bigrams are to be collected
    :param trigrams:          a boolean indicating whether trigrams are to be collected
    :return co_occurrences:   a 2d NumPy array where rows correspond to target words and columns to salient contexts;
                              each cell stores the word-context co-occurrence count in the input corpus
    :return useless_contexts: a set containing the contexts from the input set that never occurred in the corpus or
                              that only occurred with one word (either because they only occur once or because they
                              occur multiple times but always with the same word type)
    :return unused_words:     a set containing the words from the input corpus that never occur in any of the contexts
                              provided in the input set salient_contexts
    :return clean_targets2ids: a dictionary mapping the target words that occurred in at least one salient context to
                               the index of the corresponding row in the returned co-occurrence matrix
    :return contexts2ids:     a dictionary mapping the salient contexts to the index of the corresponding column in
                              the co-occurrence matrix
    """

    # read in the sets of words to be used as targets and/or those to be avoided, if provided
    target_words = read_targets(targets, pos_dict=pos_dict) if targets else set()
    skip_words = read_targets(to_ignore, pos_dict=pos_dict) if to_ignore else set()

    # set the size of the window around the target word where contexts are collected
    size = 2 if trigrams else 1

    # set the minimum length of a legitimate utterance: 0 if utterance boundaries are not considered and 2 if they
    # are, since there will always be at least the two boundary markers
    min_length = 2 if boundaries else 0

    # read in the corpus and initialize a list where to store utterances that survived the cleaning step (i.e.
    # utterances that contained at least one legitimate word that is not a boundary marker, if boundaries are
    # considered)
    corpus = read_txt_corpus_file(input_corpus)
    filtered_corpus = []
    words = set()

    # collect all words from the corpus, preserving the PoS tags, since we want to be able to tell whether each
    # homograph is categorized correctly
    for line in corpus:
        # get rid of all utterances uttered by the child and clean child-directed utterances
        if line[0] != 'CHI':
            del line[0]
            w, c = get_words_and_contexts(line, filtered_corpus, min_length, size,
                                          pos_dict=pos_dict, bigrams=bigrams, trigrams=trigrams)
            words = words.union(w)

    # get the target words for which co-occurrence counts need to be collected, depending on the set of target words
    # or words to be ignored passed to the function
    targets = set()
    for w in words:
        if target_words:
            if w in target_words and w not in skip_words:
                targets.add(w)
        else:
            if w not in skip_words:
                targets.add(w)

    # map words and contexts (provided in the input) to numerical indices
    targets2ids = sort_items(targets)
    contexts2ids = sort_items(salient_contexts)
    print(strftime("%Y-%m-%d %H:%M:%S") + ": I collected all words in the corpus.")
    print()

    """
    At this point we have two dictionaries:
    - one contains all the words collected in the corpus passed as input to this function, filtered according to the
      set of target words and the set of words to be ignored; surviving words are sorted first by PoS tag and then by
      word form, stored as keys in the form PoStag~word, and mapped to numerical indices
    - one contains the contexts passed as input to the function (the contexts that were deemed salient by the function
      learning_contexts), from which all information about PoS tags has been stripped away; these contexts are sorted
      and mapped to numerical indices
    The numerical indices point to rows (words) and columns (contexts) of a 2d NumPy array that stores word-context
    co-occurrence counts. Contexts are sorted because columns have to be aligned across training and test spaces.
    Words are sorted so that words with the same PoS tag end up in neighbouring rows, which makes the visualization of
    further steps easier to grasp.
    """

    total_utterances = len(filtered_corpus)
    check_points = {np.floor(total_utterances / float(100) * n): n for n in np.linspace(5, 100, 20)}

    co_occurrences = np.zeros([len(targets2ids), len(salient_contexts)]).astype(float)
    line_idx = 0

    for utterance in filtered_corpus:
        idx = 1 if boundaries else 0
        last_idx = len(utterance) - 1 if boundaries else len(utterance)
        while idx < last_idx:
            current_word = utterance[idx]
            # only process the word if it is among the targets (i.e. either it occurs in the set of target words or
            # it doesn't occur in the set of words to be ignored, as determined previously)
            if current_word in targets2ids:
                w = construct_window(utterance, idx, size, splitter='~')
                curr_contexts = get_ngrams(w, bigrams=bigrams, trigrams=trigrams)
                row_id = targets2ids[current_word]
                for context in curr_contexts:
                    # only keep track of co-occurrences between target words and salient contexts
                    if context in salient_contexts:
                        col_id = contexts2ids[context]
                        co_occurrences[row_id, col_id] += 1
            # move on through the utterance being processed
            idx += 1

        line_idx += 1
        if line_idx in check_points:
            print('Line ' + str(line_idx) + ' has been processed at ' + str(datetime.now()) + '.')

    # get the contexts with lexical diversity lower than 2 (i.e. salient contexts that either never occurred in the
    # input corpus or only occurred with one word, and are thus useless for any categorization task);
    # the 2 in the function call is the minimum lexical diversity a context needs to be considered useful;
    # the rows=False argument indicates that the function has to work over columns;
    # it returns a set of strings containing the contexts that don't meet the criterion of minimum lexical diversity
    useless_contexts = diversity_cutoff(co_occurrences, 2, contexts2ids, rows=False)

    # create a vector of booleans, with as many values as there are rows in the co-occurrence matrix: this vector is
    # True at indices corresponding to rows with at least one non-zero cell and False everywhere else. This vector is
    # used to get rid of 'empty' rows from the co-occurrence matrix, to identify the indices corresponding to words
    # that never occurred with any of the salient contexts, and then the words corresponding to those indices.
    # Finally, re-align the word-index mapping to the new matrix, taking advantage of the alphabetical order of the
    # indices.
    mask = (co_occurrences > 0).sum(1) > 0
    unused_indices = np.where(mask == False)[0]
    co_occurrences = co_occurrences[mask, :]

    clean_targets2ids = {}
    unused_words = set()

    # loop through the word-index pairs from the smallest index to the largest; if the index is among the unused
    # ones, add the corresponding word to the set of unused words; otherwise assign it a new progressive index that
    # matches the row of the new co-occurrence matrix (this works because the order of the retained rows is
    # preserved, so rows at the top of the original matrix are also at the top of the cleaned matrix, and so on)
    new_idx = 0
    for w, i in sorted(targets2ids.items(), key=operator.itemgetter(1)):
        if i in unused_indices:
            unused_words.add(w)
        else:
            clean_targets2ids[w] = new_idx
            new_idx += 1

    return co_occurrences, useless_contexts, unused_words, clean_targets2ids, contexts2ids
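
# Illustrative sketch only: a compact, self-contained rendering of the row clean-up performed at the end of
# create_vector_space (dropping all-zero rows and re-aligning the word-to-row mapping). The helper name and the
# toy data are hypothetical; the logic mirrors the mask / unused_indices / clean_targets2ids block above under
# the same assumption that surviving rows keep their relative order.
import numpy as np

def _sketch_drop_empty_rows(matrix, word2row):
    mask = (matrix > 0).sum(axis=1) > 0          # True for rows with at least one non-zero cell
    kept = matrix[mask, :]
    unused = {w for w, r in word2row.items() if not mask[r]}
    # rows keep their relative order, so re-numbering the surviving words consecutively re-aligns the mapping
    new_ids = {}
    for w, r in sorted(word2row.items(), key=lambda item: item[1]):
        if mask[r]:
            new_ids[w] = len(new_ids)
    return kept, unused, new_ids

# example: the second word never co-occurred with any salient context, so its row is dropped
# _sketch_drop_empty_rows(np.array([[1., 0.], [0., 0.], [2., 3.]]),
#                         {'dog~noun': 0, 'xylophone~noun': 1, 'run~verb': 2})
# -> rows for dog~noun and run~verb, unused = {'xylophone~noun'}, new_ids = {'dog~noun': 0, 'run~verb': 1}
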
def collect_contexts(input_corpus, training_perc, pos_dict, bigrams=True, trigrams=True):
    """
    :param input_corpus:      a .json file containing transcripts of child-caregiver interactions extracted from the
                              CHILDES database. The json file consists of two lists of lists of the same length, both
                              containing utterances but encoded differently. The first encodes each utterance as a
                              list of tokens; the second encodes each utterance as a list of lemmas and Part-of-Speech
                              tags, joined by a tilde ('~').
    :param training_perc:     a number indicating the percentage of the input corpus to be used for training
    :param pos_dict:          a dictionary mapping CHILDES Parts-of-Speech tags to custom tags
    :param bigrams:           a boolean indicating whether bigrams are to be collected
    :param trigrams:          a boolean indicating whether trigrams are to be collected
    :return co_occurrences:   a NumPy 2d array, where each row is a word and each column is a context. Each cell
                              contains an integer specifying how many times a word and a context co-occurred in the
                              input corpus
    :return contexts2ids:     a dictionary mapping strings denoting contexts to their column index in the
                              co-occurrence matrix
    :return words2ids:        a dictionary mapping strings denoting words to their row index in the co-occurrence
                              matrix
    :return word_frequencies: a 1d NumPy array storing the frequency count of each word in the training portion of
                              the corpus, indexed by the row indices in words2ids

    This function collects all distributional contexts satisfying the input criteria by scanning the input corpus.
    """

    words = set()
    contexts = set()

    # get the cut-off point where to stop training
    corpus = json.load(open(input_corpus, 'r+'))
    total_utterances = len(corpus[0])
    cutoff = np.floor(total_utterances / float(100) * training_perc)
    print("Considering utterances for training set until utterance number %d" % cutoff)

    # set the size of the window in which contexts are collected
    size = 2 if trigrams else 1

    # initialize a list where cleaned utterances will be stored for re-use
    filtered_corpus = []

    # initialize a counter to keep track of how many lines have been processed
    n_line = 0

    # since utterance boundaries are always added to utterances, a non-empty utterance has more than 2 elements
    min_length = 2

    # process lines until the cut-off is reached
    while n_line < cutoff:
        # filter the current utterance removing all words labeled with PoS tags that need to be discarded;
        # the clean_utterance function automatically adds beginning- and end-of-utterance markers to the utterance,
        # which is returned as a list of strings
        tokens = corpus[0][n_line]
        lemmas = corpus[1][n_line]
        w, c = get_words_and_contexts(tokens, filtered_corpus, min_length, size,
                                      lemmas=lemmas, pos_dict=pos_dict, bigrams=bigrams, trigrams=trigrams)
        words = words.union(w)
        contexts = contexts.union(c)
        n_line += 1

    words2ids = sort_items(words)
    contexts2ids = sort_items(contexts)
    print(strftime("%Y-%m-%d %H:%M:%S") + ": I collected all words and contexts in the training corpus.")
    print()

    co_occurrences = np.zeros([len(words2ids), len(contexts2ids)])
    total_utterances = len(filtered_corpus)
    check_points = {np.floor(total_utterances / float(100) * n): n for n in np.linspace(5, 100, 20)}
    word_frequencies = np.zeros(len(words2ids))

    n_line = 0
    for u in filtered_corpus:
        idx = 1
        last_idx = len(u) - 1
        while idx < last_idx:
            # using all valid words as pivots, collect all possible contexts and keep track of word-context
            # co-occurrences, updating the counts
            current_word = u[idx]
            context_window = utterance.construct_window(u, idx, size)
            current_contexts = utterance.get_ngrams(context_window, bigrams=bigrams, trigrams=trigrams)
            row_idx = words2ids[current_word]
            word_frequencies[row_idx] += 1
            for context in current_contexts:
                col_idx = contexts2ids[context]
                co_occurrences[row_idx, col_idx] += 1
            idx += 1

        n_line += 1
        # print progress
        if n_line in check_points:
            print(strftime("%Y-%m-%d %H:%M:%S") +
                  ": %d%% of the utterances allocated as training corpus has been processed." % check_points[n_line])

    return co_occurrences, contexts2ids, words2ids, word_frequencies
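
# Illustrative sketch only: the check_points dictionary used above (and in the other functions) maps the line
# indices at which 5%, 10%, ..., 100% of the corpus has been processed to the corresponding percentage, so that
# progress can be reported with a simple membership test. A self-contained toy example, under the same assumptions
# (hypothetical helper name):
import numpy as np

def _sketch_check_points(total_utterances):
    return {np.floor(total_utterances / float(100) * n): n for n in np.linspace(5, 100, 20)}

# with 200 utterances, progress would be printed at lines 10, 20, ..., 200
# _sketch_check_points(200)  ->  {10.0: 5.0, 20.0: 10.0, ..., 200.0: 100.0}
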
def create_test_space(input_corpus, test_perc, contexts, pos_dict, bigrams=True, trigrams=True):
    """
    :param input_corpus:    the same corpus used for training, in the same .json format (see the documentation of the
                            function collect_contexts for further details)
    :param test_perc:       a number indicating the percentage of the input corpus to be used as test set - ideally
                            this would be 100 minus the training percentage, but different values can be chosen;
                            however, we stress that it is preferable to avoid any overlap between training and test
                            material
    :param contexts:        a dictionary containing all the contexts collected during training, mapped to the column
                            index each context has in the training space
    :param pos_dict:        a dictionary mapping CHILDES Parts-of-Speech tags to custom tags (the same that was used
                            as input to the function collect_contexts)
    :param bigrams:         a boolean indicating whether bigrams are to be collected
    :param trigrams:        a boolean indicating whether trigrams are to be collected
    :return co_occurrences: a NumPy 2d array, where rows are words, columns are distributional contexts, and cells
                            contain integers indicating how many times a word co-occurred with a context in the input
                            corpus
    :return word_ids:       a dictionary mapping words to numerical indices, indicating the corresponding row in the
                            co-occurrence matrix
    :return word_freqs:     a dictionary mapping words to their frequency count as computed from the test set
    """

    # initialize the co-occurrence matrix (starting with zero rows and one column per training context), a dictionary
    # mapping words to row indices, and a counter keeping track of word frequencies in the test set
    co_occurrences = np.zeros([0, len(contexts)])
    word_ids = {}
    word_freqs = Counter()
    last_word = 0

    # get the cut-off point where to start considering utterances for the test set
    corpus = json.load(open(input_corpus, 'r+'))
    total_utterances = len(corpus[0])
    cutoff = total_utterances - np.floor(total_utterances / 100 * test_perc)

    # get the indices of utterances marking the 5, 10, 15, ... per cent of the input in order to show progress
    check_points = {np.floor((total_utterances - cutoff) / 100 * n) + cutoff: n for n in np.linspace(5, 100, 20)}

    # set the size of the window in which contexts are collected
    size = 2 if trigrams else 1

    # start considering utterances from the cut-off point computed using the percentage provided as input
    nline = int(cutoff)
    print("Considering utterances for test set from utterance number %d" % cutoff)

    while nline < total_utterances:
        # filter the current utterance removing all words labeled with PoS tags that need to be discarded
        tokens = corpus[0][nline]
        lemmas = corpus[1][nline]
        words = utterance.clean_utterance(tokens, lemmas=lemmas, pos_dict=pos_dict)

        # if at least one valid word was present in the utterance and survived the filtering step, collect all
        # possible contexts from the utterance, as specified by the input granularities
        if len(words) > 1:
            words.append('#end~bound')
            last_idx = len(words) - 1
            idx = 1
            # first and last element are dummy words marking utterance boundaries
            while idx < last_idx:
                # using all words as pivots, collect all possible contexts, check which ones were also collected
                # during training, and keep track of word-context co-occurrences involving only this subset of
                # contexts, updating the counts
                context_window = utterance.construct_window(words, idx, size)
                current_contexts = utterance.get_ngrams(context_window, bigrams=bigrams, trigrams=trigrams)
                target_word = words[idx]
                word_freqs[target_word] += 1
                # the frequency count is incremented once; however, the word is counted as occurring with all the
                # contexts of the window, so it may be the case that a word has a higher diversity count than
                # frequency count. This is not an error, but the result of harvesting more than one context for every
                # occurrence of a word.
                if target_word not in word_ids:
                    word_ids[target_word] = last_word
                    last_word += 1
                    new_row = np.zeros([1, co_occurrences.shape[1]])
                    co_occurrences = np.vstack([co_occurrences, new_row])

                for context in current_contexts:
                    if context in contexts:
                        row_idx = word_ids[target_word]
                        col_idx = contexts[context]
                        co_occurrences[row_idx, col_idx] += 1
                idx += 1

        # print progress
        if nline in check_points:
            print(strftime("%Y-%m-%d %H:%M:%S") +
                  ": %d%% of the utterances allocated as test set has been processed." % check_points[nline])
        nline += 1

    return co_occurrences, word_ids, word_freqs
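
# Illustrative sketch only: create_test_space grows its co-occurrence matrix one row at a time, adding an all-zero
# row the first time a word is seen and only counting contexts that already exist in the training space. This
# minimal, self-contained example (hypothetical names, toy data) shows that allocation pattern. Note that appending
# rows with np.vstack is simple but quadratic in the number of distinct words; for large vocabularies, collecting
# rows in a list and stacking once at the end may be preferable.
import numpy as np
from collections import Counter

def _sketch_incremental_rows(observations, training_contexts):
    co_occurrences = np.zeros([0, len(training_contexts)])
    word_ids = {}
    word_freqs = Counter()
    for word, window_contexts in observations:
        word_freqs[word] += 1                    # one increment per occurrence of the pivot word
        if word not in word_ids:
            # first time this word is seen: reserve a new row for it
            word_ids[word] = len(word_ids)
            co_occurrences = np.vstack([co_occurrences, np.zeros([1, len(training_contexts)])])
        for context in window_contexts:
            if context in training_contexts:    # contexts unseen in training are silently ignored
                co_occurrences[word_ids[word], training_contexts[context]] += 1
    return co_occurrences, word_ids, word_freqs

# example: 'X_runs' was never seen in training, so it contributes no count
# _sketch_incremental_rows([('dog~noun', ['the_X', 'X_runs']), ('run~verb', ['X_of'])],
#                          {'the_X': 0, 'X_of': 1})
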