Example #1
import numpy as np
# sk mirrors the import used in Example #2; get_first_and_last_sentence is a
# helper defined elsewhere in the same featurizer module
from src.featurizers.skipthoughts import skipthoughts as sk


def skipthoughts_vectors(doc, sentences, encoder_decoder):
    '''
    Creates skipthoughts vectors for the document and the corpus using the given encoder_decoder

    The encode function produces an array of skipthought vectors with as many rows as there were sentences and 4800 dimensions.
    See the combine-skip section of the skipthoughts paper for a detailed explanation of the array.

    Args:
        doc (str): the text of the document (before any normalization)
        sentences (list): the first and last sentences of each document in the corpus
        encoder_decoder (???): the skipthoughts encoder/decoder

    Returns:
        list: the document skipthoughts vector (the average of the first and last sentences' skipthoughts vectors) followed by the corpus skipthoughts vector (the average of each individual skipthoughts vector)
    '''
    corpus_vectors = sk.encode(encoder_decoder, sentences)
    corpus_vector = np.mean(corpus_vectors, axis=0)
    doc_vector = np.mean(sk.encode(encoder_decoder, get_first_and_last_sentence(doc)), axis=0)
    skipthoughts = [doc_vector, corpus_vector]
    return skipthoughts
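
A minimal usage sketch follows. It assumes the vendored skipthoughts module exposes the upstream load_model() helper and that get_first_and_last_sentence is the helper referenced by the function above; neither appears in this snippet, so treat both as assumptions.

# Minimal usage sketch. Assumptions: sk.load_model() follows the upstream
# skip-thoughts API, and get_first_and_last_sentence is the module helper
# used by skipthoughts_vectors.
encoder_decoder = sk.load_model()  # pretrained combine-skip model

# first and last sentences of each corpus document
corpus_sentences = [
    "First sentence of document one.", "Last sentence of document one.",
    "First sentence of document two.", "Last sentence of document two.",
]
doc = "A new document. It has a few sentences. This is the last one."

doc_vector, corpus_vector = skipthoughts_vectors(doc, corpus_sentences,
                                                 encoder_decoder)
print(doc_vector.shape, corpus_vector.shape)  # each is a 4800-dim combine-skip vector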
Example #2
def gen_mem_net_observations(raw_doc, raw_corpus, sentences_full,
                             mem_net_params, vocab, full_vocab, w2v_model,
                             encoder_decoder):
    '''
    Generates observations to be fed into the mem_net code

    Args:
        raw_doc (str): the raw document text
        raw_corpus (str): the raw corpus text
        sentences_full (list): list of all sentences in the corpus
        mem_net_params (dict): the specified features to be calculated for mem_net
        vocab (dict): the vocabulary of the data set
        full_vocab (dict): the full vocabulary of the data set, including punctuation (needed for the one-hot sentence masks)
        w2v_model: the word2vec model of the data set
        encoder_decoder (???): the encoder/decoder for skipthoughts vectors

    Returns:
        doc_input (array): the corpus data, known in mem_nets as the input
        doc_questions: the document data, known in mem_nets as the question
        doc_masks: the mask for the input data - tells mem_net where each input ends;
            depending on the mask mode, the boundaries are per sentence or per word
    '''

    # Use the specified mask and embedding modes where available
    mask_mode = mem_net_params.get('mask_mode', 'sentence')
    embed_mode = mem_net_params.get('embed_mode', 'word2vec')

    if embed_mode == 'skip_thought':
        from src.featurizers.skipthoughts import skipthoughts as sk
        doc_sentences = tokenize.punkt_sentences(raw_doc)

        # Ensure the corpus and the document each contain at least two
        # sentences; a lone sentence is simply duplicated
        if len(sentences_full) == 1:
            sentences_full.extend(sentences_full)
        if len(doc_sentences) == 1:
            doc_sentences.extend(doc_sentences)
        corpus_vectors = sk.encode(encoder_decoder, sentences_full)
        doc_vectors = sk.encode(encoder_decoder, doc_sentences)

        # Each entry is a sentence vector, so its index serves as the mask;
        # a per-word mask is not possible with this embedding
        doc_masks = list(range(len(corpus_vectors)))
        doc_questions = doc_vectors
        doc_input = corpus_vectors

    elif embed_mode == 'onehot':
        min_length = mem_net_params.get('onehot_min_len')
        max_length = mem_net_params.get('onehot_max_len')
        onehot_vocab = full_vocab

        # Preprocess and tokenize bkgd documents
        corpus_tokens = tokenize.word_punct_tokens(
            normalize.xml_normalize(raw_corpus))
        corpus_tokens = strip_to_vocab(corpus_tokens, onehot_vocab)
        corpus_indices = encode_doc(corpus_tokens, onehot_vocab)
        # Get sentence mask indices
        assert {'.', ',', '!', '?'} <= onehot_vocab.keys()  # the vocabulary must include punctuation
        sentence_mask = get_mask(corpus_indices,
                                 onehot_vocab,
                                 max_length=max_length)
        # One-hot encode documents w/ masks, and query document
        corpus_encoded = onehot_encode(corpus_indices, len(onehot_vocab))
        corpus_vectors = run_onehot(corpus_encoded,
                                    onehot_vocab,
                                    min_length,
                                    max_length,
                                    already_encoded=True)
        # Tokenize and one-hot encode the query document
        doc_vectors = run_onehot(
            tokenize.word_punct_tokens(normalize.xml_normalize(raw_doc)),
            onehot_vocab, min_length, max_length)

        doc_questions = doc_vectors.T
        doc_input = corpus_vectors.T

        if mask_mode == 'sentence':
            doc_masks = sentence_mask
        else:
            doc_masks = list(range(len(doc_input)))

    elif embed_mode == 'word2vec':
        corpus_vectors, doc_masks = run_w2v_matrix(w2v_model, raw_corpus,
                                                   mem_net_params, mask_mode)
        doc_vectors, _ = run_w2v_matrix(w2v_model, raw_doc, mem_net_params,
                                        mask_mode)

        if len(corpus_vectors) > 0 and len(doc_vectors) > 0:
            doc_questions = doc_vectors
            doc_input = corpus_vectors
        else:
            # without this guard, doc_input/doc_questions would be unbound at the return below
            raise ValueError("word2vec encoding produced no vectors for the "
                             "document or the corpus")

    else:
        raise ValueError("unsupported embed_mode: {}".format(embed_mode))

    return doc_input, doc_questions, doc_masks
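
A hedged call sketch for the word2vec path follows. That branch only consumes w2v_model, the raw texts, and the two mem_net_params keys read above, so the remaining resources are stubbed with None here; the gensim Word2Vec model is an assumption about what run_w2v_matrix expects, and run_w2v_matrix, tokenize, and normalize are helpers from the surrounding featurizer module that may require additional mem_net_params keys.

# Hedged usage sketch for the 'word2vec' branch. The gensim model and the
# None placeholders are illustrative assumptions, not part of the original module.
from gensim.models import Word2Vec

mem_net_params = {
    'embed_mode': 'word2vec',   # or 'skip_thought' / 'onehot'
    'mask_mode': 'sentence',
    # 'onehot_min_len': 50,     # read only by the 'onehot' branch
    # 'onehot_max_len': 1000,
}

sentences_full = ["Background document one.", "Background document two."]
raw_corpus = " ".join(sentences_full)
raw_doc = "The query document to compare against the corpus."

w2v_model = Word2Vec([s.lower().split() for s in sentences_full],
                     vector_size=100, min_count=1)

doc_input, doc_questions, doc_masks = gen_mem_net_observations(
    raw_doc, raw_corpus, sentences_full, mem_net_params,
    vocab=None, full_vocab=None, w2v_model=w2v_model, encoder_decoder=None)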