import numpy as np


def skipthoughts_vectors(doc, sentences, encoder_decoder):
    '''Creates skipthoughts vectors for a document and its corpus using the
    given encoder/decoder.

    The encode function produces an array of skipthought vectors with as many
    rows as there were sentences and 4800 dimensions. See the combine-skip
    section of the skipthoughts paper for a detailed explanation of the array.

    Args:
        doc (str): the text of the document (before any normalization)
        sentences (list): the first and last sentences of each document in the corpus
        encoder_decoder (???): the skipthoughts encoder/decoder

    Returns:
        list: the document skipthoughts vector (the average of the first and
            last sentences' skipthoughts vectors) followed by the corpus
            skipthoughts vector (the average of each individual sentence's
            skipthoughts vector)
    '''
    from src.featurizers.skipthoughts import skipthoughts as sk
    corpus_vectors = sk.encode(encoder_decoder, sentences)
    corpus_vector = np.mean(corpus_vectors, axis=0)
    doc_vector = np.mean(sk.encode(encoder_decoder, get_first_and_last_sentence(doc)),
                         axis=0)
    skipthoughts = [doc_vector, corpus_vector]
    return skipthoughts
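# A minimal usage sketch for skipthoughts_vectors. It assumes a pre-trained
# skip-thoughts model can be loaded via load_model(), as in the original
# skip-thoughts release; the document and sentence list below are hypothetical
# placeholders.
#
#     from src.featurizers.skipthoughts import skipthoughts as sk
#     encoder_decoder = sk.load_model()
#     doc = "First sentence of the document. Last sentence of the document."
#     sentences = ["First sentence of doc A.", "Last sentence of doc A.",
#                  "First sentence of doc B.", "Last sentence of doc B."]
#     doc_vector, corpus_vector = skipthoughts_vectors(doc, sentences, encoder_decoder)
#     # Each returned vector has 4800 dimensions (the combine-skip representation).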
def gen_mem_net_observations(raw_doc, raw_corpus, sentences_full, mem_net_params,
                             vocab, full_vocab, w2v_model, encoder_decoder):
    '''Generates observations to be fed into the mem_net code.

    Args:
        raw_doc (str): the raw document text
        raw_corpus (str): the raw corpus text
        sentences_full (list): list of all sentences in the corpus
        mem_net_params (dict): the specified features to be calculated for mem_net
        vocab (dict): the vocabulary of the data set
        full_vocab (dict): the full vocabulary of the data set, including
            punctuation tokens (required for the one-hot path)
        w2v_model: the word2vec model of the data set
        encoder_decoder (???): the encoder/decoder for skipthoughts vectors

    Returns:
        doc_input (array): the corpus data, known in mem_nets as the input
        doc_questions: the document data, known in mem_nets as the question
        doc_masks: the mask for the input data; tells mem_net where the end of
            each input is, which can be per word or per sentence
    '''
    # Use the specified mask and embedding modes where available
    if mem_net_params.get('mask_mode', False):
        mask_mode = mem_net_params["mask_mode"]
    else:
        mask_mode = 'sentence'

    if mem_net_params.get('embed_mode', False):
        embed_mode = mem_net_params['embed_mode']
    else:
        embed_mode = 'word2vec'

    if embed_mode == 'skip_thought':
        from src.featurizers.skipthoughts import skipthoughts as sk
        doc_sentences = tokenize.punkt_sentences(raw_doc)

        # Ensure the document and corpus each have more than one sentence;
        # if not, duplicate the single sentence
        if len(sentences_full) == 1:
            sentences_full.extend(sentences_full)
        if len(doc_sentences) == 1:
            doc_sentences.extend(doc_sentences)

        corpus_vectors = sk.encode(encoder_decoder, sentences_full)
        doc_vectors = sk.encode(encoder_decoder, doc_sentences)

        # Since each entry is a sentence, we use the index of each entry for the mask
        # We cannot use a word mode in this embedding
        doc_masks = [index for index, w in enumerate(corpus_vectors)]
        doc_questions = doc_vectors
        doc_input = corpus_vectors

    elif embed_mode == 'onehot':
        min_length = None
        max_length = None
        if mem_net_params.get('onehot_min_len', False):
            min_length = mem_net_params['onehot_min_len']
        if mem_net_params.get('onehot_max_len', False):
            max_length = mem_net_params['onehot_max_len']
        onehot_vocab = full_vocab

        # Preprocess and tokenize background documents
        corpus_tokens = tokenize.word_punct_tokens(normalize.xml_normalize(raw_corpus))
        corpus_tokens = strip_to_vocab(corpus_tokens, onehot_vocab)
        corpus_indices = encode_doc(corpus_tokens, onehot_vocab)

        # Get sentence mask indices; ensure the vocabulary includes punctuation
        assert {'.', ',', '!', '?'} <= onehot_vocab.keys()
        sentence_mask = get_mask(corpus_indices, onehot_vocab, max_length=max_length)

        # One-hot encode background documents with masks
        corpus_encoded = onehot_encode(corpus_indices, len(onehot_vocab))
        corpus_vectors = run_onehot(corpus_encoded, onehot_vocab, min_length,
                                    max_length, already_encoded=True)

        # Tokenize and one-hot encode the query document
        doc_vectors = run_onehot(tokenize.word_punct_tokens(normalize.xml_normalize(raw_doc)),
                                 onehot_vocab, min_length, max_length)

        doc_questions = doc_vectors.T
        doc_input = corpus_vectors.T

        if mask_mode == 'sentence':
            doc_masks = sentence_mask
        else:
            doc_masks = [index for index, w in enumerate(doc_input)]

    elif embed_mode == 'word2vec':
        corpus_vectors, doc_masks = run_w2v_matrix(w2v_model, raw_corpus,
                                                   mem_net_params, mask_mode)
        doc_vectors, _ = run_w2v_matrix(w2v_model, raw_doc,
                                        mem_net_params, mask_mode)
        # Note: doc_input and doc_questions are only set when both embeddings
        # are non-empty
        if len(corpus_vectors) > 0 and len(doc_vectors) > 0:
            doc_questions = doc_vectors
            doc_input = corpus_vectors

    return doc_input, doc_questions, doc_masks
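# A minimal sketch of driving gen_mem_net_observations in its default
# word2vec mode. The mem_net_params values are illustrative assumptions;
# raw_doc, raw_corpus, sentences_full, the vocabularies, and w2v_model come
# from earlier stages of the pipeline. encoder_decoder is unused in this mode,
# so None is passed.
#
#     mem_net_params = {'embed_mode': 'word2vec', 'mask_mode': 'sentence'}
#     doc_input, doc_questions, doc_masks = gen_mem_net_observations(
#         raw_doc, raw_corpus, sentences_full, mem_net_params,
#         vocab, full_vocab, w2v_model, encoder_decoder=None)
#     # doc_input: corpus embeddings (the mem_net "input")
#     # doc_questions: document embeddings (the mem_net "question")
#     # doc_masks: indices marking where each input segment ends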