def get_words_and_contexts(tokens,
                           filtered_corpus,
                           min_length,
                           size,
                           boundaries=True,
                           lemmas=None,
                           pos_dict=None,
                           bigrams=True,
                           trigrams=True):
    """
    :param tokens:          a list of strings
    :param filtered_corpus: a list of lists, containing strings; all utterances containing legitimate words are added to
                            this list, which will contain only the utterances from the corpus that meet the input
                            criteria
    :param min_length:      the minimum number of strings in a clean utterance for it to be considered legitimate
    :param size:            the size of the window around each target word, in which contexts are collected
    :param boundaries:      a boolean indicating whether to consider utterance boundaries as legitimate contexts or not
    :param lemmas:          a list of strings of the same length as tokens; if one is passed, tokens are assumed to be
                            plain strings carrying no PoS information, which is instead taken from the lemmas, each of
                            which consists of a word and a PoS tag joined by a tilde ('~'); if no lemmas list is
                            passed, tokens are taken to carry PoS tag information themselves, also in the form word~PoS
    :param pos_dict:        a dictionary mapping CHILDES tags to custom ones; default is None, meaning that everything
                            is left unchanged; if a dictionary is passed, all words tagged with tags not in the
                            dictionary are discarded from further processing, and the original tags are replaced with
                            the custom ones
    :param bigrams:         a boolean specifying whether bigrams, i.e. contexts consisting of a lexical item and an
                            empty slot, such as the_X, or X_of, are to be collected
    :param trigrams:        a boolean specifying whether trigrams, i.e. contexts consisting of two lexical items and an
                            empty slot, such as in_the_X, or X_of_you, or the_X_of, are to be collected
    :return words:          a set with the words from the current utterance
    :return contexts:       a set with the contexts from the current utterance
    """

    utterance = clean_utterance(tokens,
                                lemmas=lemmas,
                                pos_dict=pos_dict,
                                boundaries=boundaries)

    words = set()
    contexts = set()
    idx = 1 if boundaries else 0
    last_idx = len(utterance) - 1 if boundaries else len(utterance)

    # if at least one valid word was present in the utterance and survived the filtering stage, collect all possible
    # contexts from the utterance, as specified by the input granularities
    if len(utterance) > min_length:
        filtered_corpus.append(utterance)
        while idx < last_idx:
            # using every word as pivot, collect all contexts around a pivot word and store both contexts and words
            context_window = construct_window(utterance, idx, size)
            current_contexts = get_ngrams(context_window,
                                          bigrams=bigrams,
                                          trigrams=trigrams)
            words.add(utterance[idx])
            for context in current_contexts:
                contexts.add(context)
            idx += 1

    # return the set of unique words and unique contexts derived from the utterance provided as input
    return words, contexts
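
# A minimal, self-contained sketch (not the construct_window/get_ngrams helpers used above, which are defined
# elsewhere) of the kind of contexts get_words_and_contexts is assumed to collect around a pivot word: bigrams pair
# one neighbouring word with an empty slot (the_X, X_of), trigrams pair two (in_the_X, the_X_of).
def sketch_ngrams(utt, idx, size=2, bigrams=True, trigrams=True):
    left = utt[max(0, idx - size):idx]
    right = utt[idx + 1:idx + 1 + size]
    out = set()
    if bigrams:
        if left:
            out.add(left[-1] + '_X')
        if right:
            out.add('X_' + right[0])
    if trigrams:
        if len(left) > 1:
            out.add('_'.join(left[-2:]) + '_X')
        if len(right) > 1:
            out.add('X_' + '_'.join(right[:2]))
        if left and right:
            out.add(left[-1] + '_X_' + right[0])
    return out

# sketch_ngrams(['#start', 'in', 'the', 'garden', '#end'], 3) returns
# {'the_X', 'X_#end', 'in_the_X', 'the_X_#end'}
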
def get_useful_contexts(input_corpus, pos_dict=None, k=1, boundaries=True, bigrams=True, trigrams=True, pred=True,
                        div=True, freq=True, averages=True):

    """
    :param input_corpus:    the path to a .txt file containing CHILDES transcripts, with one utterance per line and
                            words divided by white spaces. The first element of each utterance is the capitalized label
                            of the speaker, as found in CHILDES. The second element is a dummy word marking the
                            beginning of the utterance, #start; the last element is a dummy word marking the end of the
                            utterance, #end. Each word is paired to its Part-of-Speech tag, the two separated by a
                            tilde, word~PoS.
    :param pos_dict:        a dictionary mapping CHILDES PoS tags to custom tags; any CHILDES tag that doesn't appear as
                            key in the dictionary is assumed to be irrelevant and words labeled with those tags are
                            discarded; the default of None means that every input word is considered
    :param k:               the threshold to determine which contexts are salient: every context whose score is higher
                            than k is considered to be salient. The default is 1.
    :param boundaries:      a boolean indicating whether utterance boundaries are to be considered or not as context
    :param bigrams:         a boolean indicating whether bigrams are to be collected
    :param trigrams:        a boolean indicating whether trigrams are to be collected
    :param pred:            a boolean indicating whether the average predictability of a context given the words it
                            occurs with is a relevant piece of information in deciding how important the context is
    :param div:             a boolean indicating whether the lexical diversity of a context, i.e. the number of
                            different words that occur in it, is a relevant piece of information in deciding how
                            important the context is
    :param freq:            a boolean indicating whether a context's frequency count is a relevant piece of information
                            in deciding how important the context is
    :param averages:        a boolean specifying whether frequency, diversity, and predictability values for each
                            context have to be compared to running averages
    :return contexts:       a dictionary mapping contexts to their relevance score
    """

    # set the size of the window around the target word where contexts are collected
    size = 2 if trigrams else 1

    # set the minimum length of a legitimate utterance: 0 if utterance boundaries are not considered and 2 if they are,
    # since there will always be at least the two boundary markers
    min_length = 2 if boundaries else 0

    # read in the corpus and initialize a list where to store utterances that survived the cleaning step
    # (i.e. utterances that contained at least one legitimate word that is not a boundary marker, if boundaries are
    # considered)
    corpus = read_txt_corpus_file(input_corpus)
    filtered_corpus = []
    words = set()
    contexts = set()

    # collect all words and contexts from the corpus, getting rid of PoS tags so that homographs are not disambiguated
    # if they are tagged differently
    for line in corpus:
        # get rid of all utterances uttered by the child and clean child-directed utterances
        if line[0] != 'CHI':
            del line[0]
            w, c = get_words_and_contexts(line, filtered_corpus, min_length, size, boundaries=boundaries,
                                          pos_dict=pos_dict, bigrams=bigrams, trigrams=trigrams)
            for el in w:
                words.add(strip_pos(el, i=1))
            for el in c:
                contexts.add(strip_pos(el, i=1, context=True))

    # map words and contexts to numerical indices
    words2ids = sort_items(words)
    contexts2ids = sort_items(contexts)
    print(strftime("%Y-%m-%d %H:%M:%S") + ": I collected all words and contexts in the input corpus.")
    print()

    total_utterances = len(filtered_corpus)
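    # identify the indices of the utterances marking every 5% of the filtered corpus, to print progress while counting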
    check_points = {np.floor(total_utterances / float(100) * n): n for n in np.linspace(5, 100, 20)}

    # initialize an empty matrix with as many rows as there are words and as many columns as there are contexts in the
    # input corpus, making sure cells store float and not integers
    co_occurrences = np.zeros([len(words2ids), len(contexts2ids)]).astype(float)
    word_frequencies = np.zeros([len(words2ids)])

    line_idx = 0
    for utterance in filtered_corpus:
        # set the first and last index of the utterance, depending on whether utterance boundaries are to be considered
        idx = 1 if boundaries else 0
        last_idx = len(utterance) - 1 if boundaries else len(utterance)
        while idx < last_idx:
            current_word = utterance[idx]
            # collect all contexts for the current pivot word
            context_window = construct_window(utterance, idx, size, splitter='~')
            current_contexts = get_ngrams(context_window, bigrams=bigrams, trigrams=trigrams)
            row_id = words2ids[current_word.split('~')[1]]
            word_frequencies[row_id] += 1
            for context in current_contexts:
                # store the co-occurrence count between the word and context being considered and update
                # their salience score
                col_id = contexts2ids[context]
                co_occurrences[row_id, col_id] += 1
            idx += 1

        # at every 5% of the input corpus, print progress
        line_idx += 1
        if line_idx in check_points:
            print('Line ' + str(line_idx) + ' has been processed at ' + str(datetime.now()) + '.')

    if averages:
        avg_freq, avg_div, avg_pred = get_averages(co_occurrences, word_frequencies)
    else:
        avg_freq, avg_div, avg_pred = [None, None, None]

    contexts_scores = compute_context_score(co_occurrences, contexts2ids, word_frequencies,
                                            pred=pred, div=div, freq=freq,
                                            avg_pred=avg_pred, avg_freq=avg_freq, avg_lex_div=avg_div)

    # only return contexts whose salience score is higher than the threshold k
    return dict((key, value) for key, value in contexts_scores.items() if value > k)
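
# A toy illustration of the thresholding performed in the return statement of get_useful_contexts: only contexts whose
# salience score exceeds k survive. The contexts and scores below are made up for the example.
toy_scores = {'the_X': 2.4, 'X_of': 0.7, 'in_the_X': 1.3}
k = 1
salient = {context: score for context, score in toy_scores.items() if score > k}
# salient == {'the_X': 2.4, 'in_the_X': 1.3}
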
def create_vector_space(input_corpus,
                        salient_contexts,
                        pos_dict=None,
                        targets='',
                        to_ignore='',
                        boundaries=True,
                        bigrams=True,
                        trigrams=True):
    """
    :param input_corpus:        the path to a .txt file containing CHILDES transcripts, with one utterance per line and
                                words divided by white spaces. The first element of each utterance is the capitalized
                                label of the speaker, as found in CHILDES. The second element is a dummy word marking
                                the beginning of the utterance, #start; the last element is a dummy word marking the end
                                of the utterance, #end. Each word is paired to its Part-of-Speech tag, the two separated
                                by a tilde, word~PoS.
    :param salient_contexts:    a set containing the contexts determined to be salient
    :param pos_dict:            a dictionary mapping CHILDES PoS tags to custom tags; any CHILDES tag that doesn't
                                appear as key in the dictionary is assumed to be irrelevant and words labeled with those
                                tags are discarded; the default of None means that every word is considered
    :param targets:             the path to a .txt file containing the target words, i.e. the words for which
                                co-occurrence counts will be collected. The file contains one word per line, with words
                                joined to the corresponding PoS tag by a tilde. By default, no file is passed and the
                                function considers all words as targets.
    :param to_ignore:           the path to a .txt file containing the words to be ignored. The file contains one word
                                per line, with words joined to the corresponding PoS tag by a tilde. By default, no file
                                is passed and the function considers all words, without ignoring any.
    :param boundaries:          a boolean indicating whether utterance boundaries are to be considered or not as context
    :param bigrams:             a boolean indicating whether bigrams are to be collected
    :param trigrams:            a boolean indicating whether trigrams are to be collected
    :return co_occurrences:     a NumPy 2d array, where rows correspond to target words and columns to salient
                                contexts, storing word-context co-occurrence counts from the input corpus
    :return useless_contexts:   a set containing the contexts from the input set that never occurred in the corpus or
                                that only occurred with one word (either because they only occur once or because they
                                occur multiple times but always with the same word type)
    :return unused_words:       a set containing the words in the input corpus that never occur in any of the contexts
                                provided in the input set salient_contexts
    :return clean_targets2ids:  a dictionary mapping the retained target words to their row index in the returned
                                co-occurrence matrix
    :return contexts2ids:       a dictionary mapping the salient contexts to their column index in the returned
                                co-occurrence matrix
    """

    # read in the sets of words to be used as targets and/or those to be avoided, if provided
    target_words = read_targets(targets,
                                pos_dict=pos_dict) if targets else set()
    skip_words = read_targets(to_ignore,
                              pos_dict=pos_dict) if to_ignore else set()

    # set the size of the window around the target word where contexts are collected
    size = 2 if trigrams else 1

    # set the minimum length of a legitimate utterance: 0 if utterance boundaries are not considered and 2 if they are,
    # since there will always be at least the two boundary markers
    min_length = 2 if boundaries else 0

    # read in the corpus and initialize a list where to store utterances that survived the cleaning step
    # (i.e. utterances that contained at least one legitimate word that is not a boundary marker, if boundaries are
    # considered)
    corpus = read_txt_corpus_file(input_corpus)
    filtered_corpus = []
    words = set()

    # collect all words from the corpus, preserving the PoS tags, since we want to be able to tell whether we categorize
    # each homograph correctly
    for line in corpus:
        # get rid of all utterances uttered by the child and clean child-directed utterances
        if line[0] != 'CHI':
            del line[0]
            w, c = get_words_and_contexts(line,
                                          filtered_corpus,
                                          min_length,
                                          size,
                                          pos_dict=pos_dict,
                                          bigrams=bigrams,
                                          trigrams=trigrams)
            words = words.union(w)

    # get the target words for which co-occurrence counts need to be collected, depending on the set of target words or
    # words to be ignored passed to the function
    targets = set()
    for w in words:
        if target_words:
            if w in target_words and w not in skip_words:
                targets.add(w)
        else:
            if w not in skip_words:
                targets.add(w)

    # map words and contexts (provided in the input) to numerical indices
    targets2ids = sort_items(targets)
    contexts2ids = sort_items(salient_contexts)
    print(
        strftime("%Y-%m-%d %H:%M:%S") +
        ": I collected all words in the corpus.")
    print()
    """
    At this point we have two dictionaries:
     - one contains all the words collected in the corpus passed as input to this function and filtered according to the 
       words in the set of target words and in the set of words to be ignored; surviving words are sorted according to
       the PoS tag and then according to the word form, stored as keys in the form PoStag~word and mapped to numerical 
       indices.
     - one contains the contexts passed as input to the function (the contexts that were deemed salient by the
       function get_useful_contexts), from which all information about PoS tags has been stripped away; these contexts
       are sorted and mapped to numerical indices
    The numerical indices will point to rows (words) and columns (contexts) of a 2d NumPy array that will store 
    word-context co-occurrence counts. Contexts are sorted because columns have to be aligned across training and test
    spaces. Words are sorted so that words from the same PoS tags are in neighbouring rows and make the visualization
    of further steps easier to grasp
    """

    total_utterances = len(filtered_corpus)
    check_points = {
        np.floor(total_utterances / float(100) * n): n
        for n in np.linspace(5, 100, 20)
    }
    co_occurrences = np.zeros([len(targets2ids),
                               len(salient_contexts)]).astype(float)
    line_idx = 0

    for utterance in filtered_corpus:
        idx = 1 if boundaries else 0
        last_idx = len(utterance) - 1 if boundaries else len(utterance)
        while idx < last_idx:
            current_word = utterance[idx]
            if current_word in targets2ids:
                # only process the word if it is among the targets (i.e. either it occurs in the set of target words or
                # it doesn't occur in the set of words to be ignored, as determined previously)
                w = construct_window(utterance, idx, size, splitter='~')
                curr_contexts = get_ngrams(w,
                                           bigrams=bigrams,
                                           trigrams=trigrams)
                row_id = targets2ids[current_word]
                for context in curr_contexts:
                    if context in salient_contexts:
                        # only keep track of co-occurrences between target words and salient contexts
                        col_id = contexts2ids[context]
                        co_occurrences[row_id, col_id] += 1
            # move on through the sentence being processed
            idx += 1

        line_idx += 1
        if line_idx in check_points:
            print('Line ' + str(line_idx) + ' has been processed at ' +
                  str(datetime.now()) + '.')

    # get the contexts with lexical diversity lower than 2 (thus salient contexts that never occurred in the input
    # corpus or contexts that only occurred with one word, being useless to any categorization task)
    # the 2 in the function call is the minimum lexical diversity of a context to be considered useful
    # the rows=False argument indicates that the function has to work over columns
    # it returns a set of strings containing the contexts that don't meet the criterion of minimum lexical diversity
    useless_contexts = diversity_cutoff(co_occurrences,
                                        2,
                                        contexts2ids,
                                        rows=False)

    # create a vector of booleans, with as many values as there are rows in the co-occurrence matrix: this vector is
    # True on indices corresponding to rows in the co-occurrence matrix with more than 1 non-zero cell and False
    # everywhere else. This vector is used to get rid of 'empty' lines from the co-occurrence matrix and identify
    # indices corresponding to words that never occurred with any of the salient contexts, and then the words
    # corresponding to these indices. Re-align the word-index mapping to the new matrix, taking advantage of the
    # alphabetical order of the indices.
    mask = (co_occurrences > 0).sum(1) > 0
    unused_indices = np.where(~mask)[0]
    co_occurrences = co_occurrences[mask, :]
    clean_targets2ids = {}
    unused_words = set()

    # loop through the word-index pairs from the smallest index to the largest; if the index is among the unused ones,
    # add the corresponding word to the set of unused words; otherwise assign it a new progressive index that will match
    # the row of the new co-occurrence matrix (this works because the order of the retained rows in the matrix is
    # preserved, so lines at the top of the original matrix will also be at the top of the cleaned matrix and so on)
    new_idx = 0
    for w, i in sorted(targets2ids.items(), key=operator.itemgetter(1)):
        if i in unused_indices:
            unused_words.add(w)
        else:
            clean_targets2ids[w] = new_idx
            new_idx += 1

    return co_occurrences, useless_contexts, unused_words, clean_targets2ids, contexts2ids
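
# A self-contained sketch of the row-pruning step at the end of create_vector_space: rows of the co-occurrence matrix
# with no non-zero cell are dropped, and the surviving words are re-indexed in the same sorted order. The words and
# counts below are made up for illustration.
import numpy as np

toy_counts = np.array([[0., 0., 0.],
                       [2., 0., 1.],
                       [0., 3., 0.]])
toy_ids = {'n~ball': 0, 'n~dog': 1, 'v~run': 2}
mask = (toy_counts > 0).sum(1) > 0          # array([False, True, True])
pruned = toy_counts[mask, :]                # keeps the rows for 'n~dog' and 'v~run' only
new_ids, unused, new_idx = {}, set(), 0
for w, i in sorted(toy_ids.items(), key=lambda pair: pair[1]):
    if mask[i]:
        new_ids[w] = new_idx
        new_idx += 1
    else:
        unused.add(w)
# pruned.shape == (2, 3); unused == {'n~ball'}; new_ids == {'n~dog': 0, 'v~run': 1}
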
def collect_contexts(input_corpus,
                     training_perc,
                     pos_dict,
                     bigrams=True,
                     trigrams=True):
    """
    :param input_corpus:    a .json file containing transcripts of child-caregiver interactions extracted from the
                            CHILDES database. The .json file consists of two lists of lists of the same length, both
                            containing the same utterances but encoded differently. The first encodes each utterance
                            as a list of tokens; the second encodes each utterance as a list of lemmas and
                            Part-of-Speech tags, joined by a tilde ('~').
    :param training_perc:   a number indicating the percentage of the input corpus to be used for training
    :param pos_dict:        a dictionary mapping CHILDES Parts-of-Speech tags to custom tags
    :param bigrams:         a boolean indicating whether bigrams are to be collected
    :param trigrams:        a boolean indicating whether trigrams are to be collected
    :return co_occurrences: a NumPy 2d array, where each row is a word and each column is a context. Each cell contains
                            an integer specifying how many times a word and a context co-occurred in the input corpus
    :return context_ids:    a dictionary mapping strings denoting contexts to their column index in the co-occurrence
                            matrix
    :return word_ids:       a dictionary mapping strings denoting words to their row index in the co-occurrence matrix
    :return word_frequencies: a NumPy 1d array storing the frequency count of each word in the training portion of the
                            corpus, indexed as in word_ids

    This function collects all distributional contexts satisfying the input criteria by scanning the input corpus.
    """

    words = set()
    contexts = set()

    # get the cut-off point where to stop training
    with open(input_corpus, 'r') as corpus_file:
        corpus = json.load(corpus_file)
    total_utterances = len(corpus[0])
    cutoff = np.floor(total_utterances / float(100) * training_perc)
    print("Considering utterances for training set until utterance number %d" %
          cutoff)

    # set the size of the window in which contexts are collected
    size = 2 if trigrams else 1
    # initialize a list where cleaned utterances will be stored for re-use
    filtered_corpus = []
    # initialize a counter to keep track of how many lines have been processed
    n_line = 0
    # since utterance boundaries are always added to utterances, a non-empty utterance has more than 2 elements
    min_length = 2

    # process lines until the cut-off is reached
    while n_line < cutoff:

        # filter the current utterance removing all words labeled with PoS tags that need to be discarded
        # the clean_utterance function automatically adds beginning and end of utterance markers to the utterance,
        # which is returned as a list of strings
        tokens = corpus[0][n_line]
        lemmas = corpus[1][n_line]
        w, c = get_words_and_contexts(tokens,
                                      filtered_corpus,
                                      min_length,
                                      size,
                                      lemmas=lemmas,
                                      pos_dict=pos_dict,
                                      bigrams=bigrams,
                                      trigrams=trigrams)
        words = words.union(w)
        contexts = contexts.union(c)
        n_line += 1

    words2ids = sort_items(words)
    contexts2ids = sort_items(contexts)

    print(
        strftime("%Y-%m-%d %H:%M:%S") +
        ": I collected all words and contexts in the training corpus.")
    print()

    co_occurrences = np.zeros([len(words2ids), len(contexts2ids)])

    total_utterances = len(filtered_corpus)
    check_points = {
        np.floor(total_utterances / float(100) * n): n
        for n in np.linspace(5, 100, 20)
    }

    word_frequencies = np.zeros(len(words2ids))

    n_line = 0
    for u in filtered_corpus:
        idx = 1
        last_idx = len(u) - 1
        while idx < last_idx:
            # using all valid words as pivots, collect all possible contexts and keep track of word-context
            # co-occurrences, updating the counts
            current_word = u[idx]
            context_window = utterance.construct_window(u, idx, size)
            current_contexts = utterance.get_ngrams(context_window,
                                                    bigrams=bigrams,
                                                    trigrams=trigrams)
            row_idx = words2ids[current_word]
            word_frequencies[row_idx] += 1
            for context in current_contexts:
                col_idx = contexts2ids[context]
                co_occurrences[row_idx, col_idx] += 1
            idx += 1
        n_line += 1

        # print progress
        if n_line in check_points:
            print(
                strftime("%Y-%m-%d %H:%M:%S") +
                ": %d%% of the utterances allocated to the training corpus have been processed."
                % check_points[n_line])

    return co_occurrences, contexts2ids, words2ids, word_frequencies
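
# A small sketch of the check_points dictionary used in the functions above: np.linspace(5, 100, 20) yields
# 5, 10, ..., 100, and each percentage is keyed by the (floored) index of the corresponding utterance, so that
# progress can be printed every 5% of the corpus. The corpus size below is made up for illustration.
import numpy as np

n_utterances = 400
check_points = {np.floor(n_utterances / float(100) * n): n
                for n in np.linspace(5, 100, 20)}
# check_points[20.0] == 5.0, check_points[40.0] == 10.0, ..., check_points[400.0] == 100.0
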
def create_test_space(input_corpus,
                      test_perc,
                      contexts,
                      pos_dict,
                      bigrams=True,
                      trigrams=True):
    """
    :param input_corpus:            the same corpus used for training, in the same .json format (see the documentation
                                    of the function collect_contexts for further details)
    :param test_perc:               a number indicating the percentage of the input corpus to be used as test set -
                                    ideally this would be 100 minus the training percentage, but different values can be
                                    chosen. However, we stress that it is preferable to avoid any overlap between
                                    training and test material
    :param contexts:                a dictionary containing all the contexts collected during training, mapped to the
                                    column index each context has in the training space
    :param pos_dict:                a dictionary mapping CHILDES Parts-of-Speech tags to custom tags (the same that was
                                    used as input to the function collect_contexts)
    :param bigrams:                 a boolean indicating whether bigrams are to be collected
    :param trigrams:                a boolean indicating whether trigrams are to be collected
    :return co_occurrences:         a NumPy 2d array, where rows are words, columns are distributional contexts, and
                                    cells contain integers indicating how many times a word co-occurred with a context
                                    in the input corpus
    :return word_ids:               a dictionary mapping words to numerical indices, indicating the corresponding row
                                    in the co-occurrence matrix
    :return word_freqs:             a dictionary mapping words to their frequency count as computed from the test set
    """

    # initialize the structures that keep track of word frequency counts, word indices, and word-context co-occurrences
    # (the co-occurrence matrix starts with no rows and one column per training context)
    co_occurrences = np.zeros([0, len(contexts)])
    word_ids = {}
    word_freqs = Counter()
    last_word = 0

    # get the cut-off point where to start considering utterances for the test set
    with open(input_corpus, 'r') as corpus_file:
        corpus = json.load(corpus_file)
    total_utterances = len(corpus[0])
    cutoff = total_utterances - np.floor(total_utterances / 100 * test_perc)

    # get the indices of utterances marking the 5, 10, 15, ... per cent of the input in order to show progress
    check_points = {
        np.floor((total_utterances - cutoff) / 100 * n) + cutoff: n
        for n in np.linspace(5, 100, 20)
    }

    # set the size of the window in which contexts are collected
    size = 2 if trigrams else 1

    # start considering utterances from the cut-off point computed using the percentage provided as input
    nline = int(cutoff)
    print("Considering utterances for test set from utterance number %d" %
          cutoff)

    while nline < total_utterances:

        # filter the current utterance removing all words labeled with PoS tags that need to be discarded
        tokens = corpus[0][nline]
        lemmas = corpus[1][nline]
        words = utterance.clean_utterance(tokens,
                                          lemmas=lemmas,
                                          pos_dict=pos_dict)

        # if at least one valid word was present in the utterance and survived the filtering step, collect all possible
        # contexts from the utterance, as specified by the input granularities
        if len(words) > 1:
            words.append('#end~bound')
            last_idx = len(words) - 1
            idx = 1
            # first and last element are dummy words for sentence boundaries
            while idx < last_idx:
                # using all words as pivots, collect all possible contexts, check which ones were also collected
                # during training and keep track of word-context co-occurrences involving only this subset of contexts,
                # updating the counts
                context_window = utterance.construct_window(words, idx, size)
                current_contexts = utterance.get_ngrams(context_window,
                                                        bigrams=bigrams,
                                                        trigrams=trigrams)

                target_word = words[idx]
                word_freqs[target_word] += 1
                # the frequency count is incremented once. However, the word is counted as occurring with all the
                # contexts of the window, so it may be the case that a word has a higher diversity count than frequency
                # count. This is not an error, but the result of harvesting more than one context for every occurrence
                # of a word.

                if target_word not in word_ids:
                    word_ids[target_word] = last_word
                    last_word += 1
                    new_row = np.zeros([1, co_occurrences.shape[1]])
                    co_occurrences = np.vstack([co_occurrences, new_row])

                for context in current_contexts:
                    if context in contexts:
                        row_idx = word_ids[target_word]
                        col_idx = contexts[context]
                        co_occurrences[row_idx, col_idx] += 1
                idx += 1

        # print progress
        if nline in check_points:
            print(
                strftime("%Y-%m-%d %H:%M:%S") +
                ": %d%% of the utterances allocated to the test set have been processed."
                % check_points[nline])

        nline += 1

    return co_occurrences, word_ids, word_freqs
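
# A minimal sketch of the row-growing pattern used in create_test_space: the test-space matrix starts with zero rows
# and one column per training context, and a new all-zero row is stacked on whenever a word is seen for the first
# time. The words, contexts and co-occurrence pairs below are made up for illustration.
import numpy as np

train_contexts = {'the_X': 0, 'X_of': 1}
co_occurrences = np.zeros([0, len(train_contexts)])
word_ids = {}

for word, context in [('dog~n', 'the_X'), ('dog~n', 'X_of'), ('run~v', 'the_X')]:
    if word not in word_ids:
        # first time this word is seen: give it the next row index and append an empty row
        word_ids[word] = co_occurrences.shape[0]
        co_occurrences = np.vstack([co_occurrences, np.zeros([1, len(train_contexts)])])
    co_occurrences[word_ids[word], train_contexts[context]] += 1

# co_occurrences == [[1., 1.], [1., 0.]]; word_ids == {'dog~n': 0, 'run~v': 1}
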