def test_convert_text_to_tokens():
    """convert_text_to_tokens should map text to token ids, send out-of-vocabulary
    tokens to <unk> (id 0), and let entries in a custom vocab override the main vocab."""
    train = ["Here are some strings", 
             "Some not so interesting strings!", 
             "Let's make ThEm MORe Intersting?",
             "Yes you can :)",
             "Coolio <::>"]
    
    embedding_name = "glove.6B.50d"

    vocab = func.build_vocab(train, embedding_name, save=False)

    custom_vocab = {
        "however" : 50,
        "going" : 51
    }

    sample_data = ["Let's make ThEm MORe Intersting?",
                   "Some not so interesting strings!", 
                   "However, this one is going to have lots of <unk>s"]
        
    tokenized_data = [[17, 6, 18, 22, 19, 16, 10], 
                      [3, 20, 21, 15, 4, 5], 
                      [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 0, 9, 0]]

    assert func.convert_text_to_tokens(sample_data, vocab, func.tokenize) == tokenized_data

    tokenized_data = [[17, 6, 18, 22, 19, 16, 10], 
                      [3, 20, 21, 15, 4, 5], 
                      [50, 0, 0, 0, 0, 51, 0, 0, 0, 0, 8, 0, 9, 0]]
    
    assert func.convert_text_to_tokens(sample_data, vocab, func.tokenize, custom_vocab) == tokenized_data
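The implementation of convert_text_to_tokens is not shown on this page. A minimal sketch that is consistent with the expected values in this test (unknown tokens map to id 0 for <unk>, and entries in custom_vocab take precedence over the main vocab) might look like the following; the name and the dict-style vocab lookup are assumptions:

# Sketch only: assumes `vocab` supports `in` and `[]` lookup (a plain dict or a
# torchtext vocab both do) and that out-of-vocabulary tokens map to id 0 (<unk>).
def convert_text_to_tokens_sketch(data, vocab, tokenize_fn, custom_vocab=None):
    custom_vocab = custom_vocab or {}
    token_seqs = []
    for text in data:
        token_seqs.append([
            custom_vocab[tok] if tok in custom_vocab
            else (vocab[tok] if tok in vocab else 0)
            for tok in tokenize_fn(text)
        ])
    return token_seqs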
Example #2
def build_labeled_dataset(sentences, labels, vocab, save_string, split, label_map, custom_vocab={}):
    """
        Builds and saves a TrainingDataset that is used for strict-match training and evaluation

        Arguments:
            sentences          (arr) : array of sentences
            labels             (arr) : array of labels (strings)
            vocab (torchtext.vocab.Vocab) : Vocab object
            save_string        (str) : string to indicate some of the hyper-params used to create the vocab
            split              (str) : what to name this dataset
            label_map         (dict) : key - label name, value - label_id
            custom_vocab      (dict) : key - string, value - token_id
    """
    pad_idx = vocab["<pad>"]
    seq_tokens = convert_text_to_tokens(sentences, vocab, tokenize, custom_vocab)
    
    label_ids = _prepare_labels(labels, label_map)

    dataset = TrainingDataset(seq_tokens, label_ids, pad_idx)

    print("Finished building {} dataset of size: {}".format(split, str(len(seq_tokens))))

    file_name = "../data/training_data/{}_data_{}.p".format(split, save_string)

    with open(file_name, "wb") as f:
        pickle.dump(dataset, f)
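A short usage sketch for build_labeled_dataset; the sentences, labels, label_map, and save_string below are placeholders, and build_vocab is called as in the test above:

# Illustration only: toy inputs; the resulting pickle is written to
# ../data/training_data/train_data_<save_string>.p as shown above.
# Assumes build_vocab adds the <pad>/<unk> specials relied on by the function.
sentences = ["the movie was great", "the plot made no sense"]
labels = ["positive", "negative"]
label_map = {"positive": 0, "negative": 1}
vocab = build_vocab(sentences, "glove.6B.50d", save=False)
build_labeled_dataset(sentences, labels, vocab, save_string="example_run",
                      split="train", label_map=label_map)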
Example #3
def build_real_pretraining_triples(sentences,
                                   queries,
                                   vocab,
                                   tokenize_fn,
                                   custom_vocab={}):
    """
        To evaluate a model against real explanations

        Arguments:
            sentences          (arr) : original sentences that an explanation is about
            queries            (arr) : queries from explanations
            vocab  (torchtext.vocab) : vocabulary object
            tokenize_fn   (function) : function used to break text into tokens
            custom_vocab      (dict) : key - string, value - token_id

        Returns:
            tokenized seqs, queries, labels : triplet where each element is a list of equal length
                                              containing similar information to `build_synthetic_pretraining_triples`
    """
    tokenized_sentences = convert_text_to_tokens(sentences, vocab, tokenize_fn,
                                                 custom_vocab)
    tokenized_queries = convert_text_to_tokens(queries, vocab, tokenize_fn)
    labels = []
    indices_to_delete = []
    for i, tokenized_sentence in enumerate(tokenized_sentences):
        tokenized_query = tokenized_queries[i]
        sent_labels = [0] * len(tokenized_sentence)
        # Locate the query inside the sentence and mark its span with 1s;
        # sentences where the query is not found (start_position <= 0) are
        # queued for deletion so the three output lists stay aligned.
        start_position = find_array_start_position(tokenized_sentence,
                                                   tokenized_query)
        if start_position > 0:
            sent_labels[start_position:start_position +
                        len(tokenized_query)] = [1] * len(tokenized_query)
            labels.append(sent_labels)
        else:
            indices_to_delete.append(i)

    indices_to_delete.reverse()

    for i in indices_to_delete:
        del tokenized_sentences[i]
        del tokenized_queries[i]

    return tokenized_sentences, tokenized_queries, labels
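find_array_start_position is referenced but not defined on this page. A plausible sketch, assuming it returns the start index of the first occurrence of query inside sequence and a negative value when there is no match (note that the caller above also treats a match at position 0 as "not found"):

# Sketch only: naive contiguous sub-list search.
def find_array_start_position_sketch(sequence, query):
    if not query or len(query) > len(sequence):
        return -1
    for start in range(len(sequence) - len(query) + 1):
        if sequence[start:start + len(query)] == query:
            return start
    return -1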
Example #4
def build_synthetic_pretraining_triples(data,
                                        vocab,
                                        tokenize_fn,
                                        custom_vocab={}):
    """
        As per the NExT paper, we build a pre-training dataset from a dataset of unlabeled text.
        The process is as follows per sequence of text (Seq):
            1. Tokenize the text
            2. Convert tokens into token_ids
            3. Select a random number (N) between 1 and 5 for the number of tokens that make up a query (Q)
            4. Select a starting position in the sequence (S)
            5. Extract the tokens [S:S+N], this is our query sequence Q
            6. Label each token in Seq with a 1 or 0, indicating whether the token is in Q or not

        As a result of this process we build the triple (Seq, Q, labels) that will be used in pre-training

        Arguments:
            data              (arr) : sequences of text
            vocab (torchtext.vocab) : vocabulary object
            tokenize_fn  (function) : function used to break text into tokens
            custom_vocab     (dict) : key - string, value - token_id

        Returns:
            tokenized seqs, queries, labels : triplet where each element is a list of equal length
                                              containing the information described above
    """
    token_seqs = convert_text_to_tokens(data, vocab, tokenize_fn, custom_vocab)
    token_seqs = [token_seq for token_seq in token_seqs if len(token_seq) > 3]
    queries = []
    labels = []
    for token_seq in token_seqs:
        # Pick a random query length (1-5 tokens) and a random start, then
        # label every position inside the [start, end) span with 1.0.
        num_tokens = random.randint(1, min(len(token_seq), 5))
        starting_position = random.randint(0, len(token_seq) - num_tokens)
        end_position = starting_position + num_tokens
        queries.append(token_seq[starting_position:end_position])
        labels.append([
            1.0 if starting_position <= j < end_position else 0.0
            for j in range(len(token_seq))
        ])

    return token_seqs, queries, labels
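A small usage sketch; the toy corpus, dict vocab, and whitespace tokenizer below are placeholders, and it is assumed that convert_text_to_tokens only needs dict-style lookup from the vocab object:

# Illustration only: build pre-training triples from a toy corpus.
corpus = ["the cat sat on the mat", "dogs chase cats around the yard"]
toy_vocab = {tok: i + 1 for i, tok in
             enumerate(sorted({w for s in corpus for w in s.split()}))}
seqs, queries, labels = build_synthetic_pretraining_triples(
    corpus, toy_vocab, lambda text: text.split())
# seqs[0] is the token-id sequence for the first sentence, queries[0] is a
# random 1-5 token slice of it, and labels[0] marks that slice with 1.0s.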
Example #5
def tokenize_explanation_queries(explanation_data, vocab, label_filter,
                                 save_string):
    """
        Given a list of explanations for labeling decisions, we find the explanations that include phrases
        which must exist in a text sequence for a label to be applied to that sequence.

            Ex: The text contains the phrase "xyz"

        We then tokenize and convert the phrases within quotes (queries) into sequences of token_ids that will
        be used at training time to push embeddings of queries associated with the same label closer
        together.

        Arguments:
            explanation_data  (arr) : array of natural language explanations for labeling decisions
            vocab (torchtext.vocab) : vocab object used for conversion between text token and token_id
            label_filter      (arr) : labels to consider when extracting queries from explanations
                                      (allows user to ignore explanations associated with certain labels)
            save_string       (str) : string to indicate some of the hyper-params used to create the vocab
    """
    queries = []
    labels = []
    for entry in explanation_data:
        explanation = entry["explanation"]
        label = entry["label"]
        if label_filter is None or label in label_filter:
            possible_queries = extract_queries_from_explanations(explanation)
            for query in possible_queries:
                queries.append(query)
                labels.append(label)

    tokenized_queries = convert_text_to_tokens(queries, vocab, tokenize)

    print("Finished tokenizing actual queries, count: {}".format(
        str(len(tokenized_queries))))

    file_name = "../data/pre_train_data/sim_data_{}.p".format(save_string)

    with open(file_name, "wb") as f:
        pickle.dump({"queries": tokenized_queries, "labels": labels}, f)
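extract_queries_from_explanations is not shown either. Given the docstring's example ('The text contains the phrase "xyz"'), a minimal sketch that pulls out double-quoted phrases could be:

import re

# Sketch only: returns every double-quoted phrase in an explanation string;
# the real implementation may handle additional quoting styles.
def extract_queries_from_explanations_sketch(explanation):
    return re.findall(r'"([^"]+)"', explanation)

# extract_queries_from_explanations_sketch('The text contains the phrase "xyz"')
# -> ['xyz']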
Example #6
def build_word_to_idx(raw_explanations, vocab, save_string):
    """
        This data structure is needed to map rows of the Find Module's output back to explanations. If explanation_i
        contains phrase_j, we can look up phrase_j in the quoted_words_to_index dictionary to get the index of the
        tensor holding phrase_j's comparison scores against all unlabeled instances. Used when creating soft_matching labels.

        Main data structure being created: quoted_words_to_index

        Arguments:
            raw_explanations      (dict) : key - semantic_rep of explanation, value - raw explanation text
            vocab (torchtext.vocab.Vocab) : Vocab object
            save_string            (str) : string to indicate some of the hyper-params used to create the vocab
    """
    # Collect the unique, whitespace-normalized quoted phrases across all explanations
    # (a dict is used to de-duplicate while preserving insertion order).
    quoted_words = {}
    for explanation in raw_explanations.values():
        queries = extract_queries_from_explanations(explanation)
        for query in queries:
            query = " ".join(tokenize(query)).strip()
            quoted_words[query] = 1

    quoted_words = list(quoted_words.keys())

    tokenized_queries = convert_text_to_tokens(quoted_words, vocab, lambda x: x.split())

    print("Finished tokenizing actual queries, count: {}".format(str(len(tokenized_queries))))

    file_name = "../data/training_data/query_tokens_{}.p".format(save_string)
    with open(file_name, "wb") as f:
        pickle.dump(tokenized_queries, f)

    quoted_words_to_index = {
        quoted_word: i for i, quoted_word in enumerate(quoted_words)
    }
    
    file_name = "../data/training_data/word2idx_{}.p".format(save_string)
    with open(file_name, "wb") as f:
        pickle.dump(quoted_words_to_index, f)
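
A short sketch of how the saved mapping would be read back and used, per the docstring; the save_string value and the phrase below are placeholders:

import pickle

# Illustration only: load the phrase-to-row-index mapping saved above and look up
# which row of the Find Module's score tensor corresponds to a quoted phrase.
save_string = "example_run"  # placeholder for the real hyper-param string
with open("../data/training_data/word2idx_{}.p".format(save_string), "rb") as f:
    quoted_words_to_index = pickle.load(f)

phrase = "xyz"  # must be normalized the same way as in build_word_to_idx
row_idx = quoted_words_to_index.get(phrase)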