def encode_sentences(desc, sentence_list, model, imdb_key=None, is_qa=False):
    """Encode a list of sentences given the model."""

    if desc == 'skipthought':
        # encode the sentence list directly
        features = skipthoughts.encode(model, sentence_list, verbose=False)

    elif desc == 'vis-text-embed':
        # normalize the sentence list
        norm_sentence_list = [utils.normalize_alphanumeric(sentence.lower())
                              for sentence in sentence_list]
        # the embedding model encodes a sentence list directly
        features = model.encode(norm_sentence_list)

    elif desc.startswith('tfidf'):
        desc_dim = len(model.vocab)
        midx = model.doc_names.index(imdb_key)
        # use a scipy sparse matrix when encoding stories, otherwise too huge!
        if is_qa:
            features = np.zeros((len(sentence_list), desc_dim), dtype='float32')
        else:
            features = sps.dok_matrix((len(sentence_list), desc_dim), dtype='float32')
        for s, sentence in enumerate(sentence_list):
            # NOTE: use both alphanumeric and stemming normalization
            sentence = utils.normalize_stemming(
                utils.normalize_alphanumeric(sentence.lower())).split(' ')
            # look up the TF-IDF weight of each word in the normalized sentence
            for word in sentence:
                if word not in model.vocab:
                    continue
                widx = model.vocab.index(word)
                features[s, widx] = model.tfidf[widx][midx]
            # L2-normalize each sentence vector
            if is_qa:
                # dense rows: use numpy.linalg.norm
                features[s] /= (np.linalg.norm(features[s]) + 1e-6)
            else:
                # sparse rows: use scipy.sparse.linalg.norm
                features[s] /= (sps.linalg.norm(features[s]) + 1e-6)

    elif desc == 'word2vec':
        desc_dim = model.get_vector(model.vocab[-1]).shape[0]
        features = np.zeros((len(sentence_list), desc_dim), dtype='float32')
        for s, sentence in enumerate(sentence_list):
            # NOTE: use only alphanumeric normalization, no stemming
            sentence = utils.normalize_alphanumeric(sentence.lower()).split(' ')
            # sum the word vectors of the normalized sentence
            for word in sentence:
                if word not in model.vocab:
                    continue
                features[s] += model.get_vector(word)
            # L2-normalize the summed vector
            features[s] /= (np.linalg.norm(features[s]) + 1e-6)

    return features
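# A minimal usage sketch for encode_sentences, not part of the original code: it
# assumes a TF-IDF model as returned by encode_tfidf_model (shown below), that
# 'plot' is a valid document type, and that the imdb_key is present in the
# model's doc_names; the key and question text are illustrative placeholders.
tfidf_model = encode_tfidf_model('plot', word_thresh=2)
qa_feats = encode_sentences('tfidf', ['Why does Forrest start running?'],
                            tfidf_model, imdb_key='tt0109830', is_qa=True)
# with is_qa=True the result is a dense (num_sentences x vocab_size) float32
# array with L2-normalized rows; with is_qa=False a sparse dok_matrix is used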
def create_vocabulary(self, QAs, stories, v2i, w2v_vocab=None, word_thresh=1):
    """Create the vocabulary from all words in the stories, questions, and
    answers taken together. Optionally keep only words that also appear in the
    word2vec model vocabulary (if one is provided).
    """

    print "Creating vocabulary.",
    if w2v_vocab is not None:
        print "Adding words based on word2vec"
    else:
        print "Adding all words"

    # Get all story words
    all_words = [word for story in stories for sent in story for word in sent]

    # Parse QAs to get actual words
    QA_words = []
    for QA in QAs:
        QA_words.append({})
        QA_words[-1]['q_w'] = utils.normalize_alphanumeric(QA.question.lower()).split(' ')
        QA_words[-1]['a_w'] = [utils.normalize_alphanumeric(answer.lower()).split(' ')
                               for answer in QA.answers]

    # Append question and answer words to all_words
    for QAw in QA_words:
        all_words.extend(QAw['q_w'])
        for answer in QAw['a_w']:
            all_words.extend(answer)

    # threshold vocabulary: keep words with at least word_thresh occurrences
    vocab = Counter(all_words)
    vocab = [k for k in vocab.keys() if vocab[k] >= word_thresh]

    # create vocabulary index
    for w in vocab:
        if w not in v2i:
            if w2v_vocab is None:
                # if word2vec is not provided, just add the word to the vocab
                v2i[w] = len(v2i)
            elif w in w2v_vocab:
                # keep the word only if it is in the word2vec vocabulary
                v2i[w] = len(v2i)

    # report the fraction of unique words dropped by thresholding/filtering
    print "Created a vocabulary of %d words. Threshold removed %.2f %% words" \
        % (len(v2i), 100. * (len(set(all_words)) - len(v2i)) / len(set(all_words)))

    return QA_words, v2i
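# A tiny standalone illustration (not part of the original code) of the
# Counter-based threshold used above: only words seen at least word_thresh
# times survive into the candidate vocabulary.
from collections import Counter
toy_words = ['movie', 'movie', 'the', 'the', 'the', 'rare']
toy_vocab = Counter(toy_words)
toy_vocab = [k for k in toy_vocab.keys() if toy_vocab[k] >= 2]
# toy_vocab now contains only 'movie' and 'the'; 'rare' falls below the threshold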
def encode_tfidf_model(document_type, word_thresh=1):
    """Load the TF-IDF model for this document type, computing and caching it
    on first use."""

    tfidf_fname = utils.TFIDF_TEMPLATE % (document_type, word_thresh)
    check_save_directory(filename=tfidf_fname)

    if os.path.exists(tfidf_fname):
        # load the cached model
        with open(tfidf_fname, 'rb') as fid:
            TFIDF = pickle.load(fid)
    else:
        # read the story and gather words per movie
        story, _ = mqa.get_story_qa_data('full', document_type)
        sorted_movies = sorted(story.keys())
        all_words_use = []
        for imdb_key in sorted_movies:
            all_words_use.append([])
            for sentence in story[imdb_key]:
                norm_sentence = utils.normalize_stemming(
                    utils.normalize_alphanumeric(sentence.lower()))
                all_words_use[-1].extend(norm_sentence.split(' '))

        # compute TF-IDF
        TFIDF = tfidfcalc.TFIDF(sorted_movies)
        TFIDF.get_filtered_vocabulary(all_words_use, word_thresh=word_thresh)
        TFIDF.compute_tfidf(all_words_use)

        # dump to a pickle file for future use
        with open(tfidf_fname, 'wb') as fid:
            pickle.dump(TFIDF, fid)

    return TFIDF
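# Usage sketch (added for illustration): build, or load from cache, the TF-IDF
# model over plot synopses. 'plot' as the document_type is an assumption, and
# the printed attributes (doc_names, vocab) follow their use in
# encode_sentences above rather than a documented tfidfcalc.TFIDF interface.
TFIDF = encode_tfidf_model('plot', word_thresh=2)
print 'TF-IDF over %d documents, vocabulary of %d words' \
    % (len(TFIDF.doc_names), len(TFIDF.vocab))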
def encode_w2v_gensim(sentence):
    """Encode a sentence as the L2-normalized sum of its word2vec vectors."""
    embedding = np.zeros(300)
    sentence = utils.normalize_alphanumeric(sentence.lower())
    word_list = sentence.split()
    for word in word_list:
        if word in ignore_word_list:
            continue
        try:
            embedding = embedding + gensim_model[word]
            if nan_check(embedding):
                print 'nan word >> ', word
                embed()
        except KeyError:
            # word is not in the gensim word2vec vocabulary
            pass
    # L2-normalize the summed word vectors
    embedding = embedding / (np.linalg.norm(embedding) + 1e-6)
    assert embedding.shape == (300,)
    return embedding
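# Quick sketch (illustrative, not from the original file): encode one question
# with the module-level gensim model assumed above; the sentence is made up and
# the 300-d size matches the np.zeros(300) accumulator.
q_vec = encode_w2v_gensim('why does forrest start running')
print q_vec.shape            # (300,)
print np.linalg.norm(q_vec)  # close to 1, assuming at least one word is in the model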
def create_vocabulary(QAs, stories, v2i, w2v_vocab=None, word_thresh=2):
    """Create the vocabulary from all words in the stories, questions, and
    answers taken together. Optionally keep only words that also appear in the
    word2vec model vocabulary (if one is provided).
    """

    print "Creating vocabulary.",
    if w2v_vocab is not None:
        print "Adding words based on word2vec"
    else:
        print "Adding all words"

    # Get all story words
    all_words = [word for story in stories for sent in story for word in sent]

    # Parse QAs to get actual words
    QA_words = []
    for QA in QAs:
        QA_words.append({})
        QA_words[-1]['q_w'] = utils.normalize_alphanumeric(QA.question.lower()).split(' ')
        QA_words[-1]['a_w'] = [utils.normalize_alphanumeric(answer.lower()).split(' ')
                               for answer in QA.answers]

    # Append question and answer words to all_words
    for QAw in QA_words:
        all_words.extend(QAw['q_w'])
        for answer in QAw['a_w']:
            all_words.extend(answer)

    # threshold vocabulary: keep words with at least word_thresh occurrences
    vocab = Counter(all_words)
    vocab = [k for k in vocab.keys() if vocab[k] >= word_thresh]

    # create vocabulary index
    for w in vocab:
        if w not in v2i:
            if w2v_vocab is None:
                # if word2vec is not provided, just add the word to the vocab
                v2i[w] = len(v2i)
            elif w in w2v_vocab:
                # keep the word only if it is in the word2vec vocabulary
                v2i[w] = len(v2i)

    # report the fraction of unique words dropped by thresholding/filtering
    print "Created a vocabulary of %d words. Threshold removed %.2f %% words" \
        % (len(v2i), 100. * (len(set(all_words)) - len(v2i)) / len(set(all_words)))

    return QA_words, v2i
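# A minimal end-to-end sketch of calling the module-level create_vocabulary.
# QAInfo is a hypothetical stand-in namedtuple for the project's QA objects
# (which expose .question and .answers); the toy story and QA text are made up,
# and the surrounding module's utils import is assumed to be available.
from collections import namedtuple
QAInfo = namedtuple('QAInfo', ['qid', 'question', 'answers'])
toy_qas = [QAInfo('test:q1', 'Who made the movie?',
                  ['the director', 'the writer', 'the actor', 'nobody', 'the crew'])]
toy_stories = [[['the', 'director', 'made', 'the', 'movie']]]
QA_words, v2i = create_vocabulary(toy_qas, toy_stories, v2i={}, word_thresh=1)
# v2i maps every surviving word to an integer index; passing w2v_vocab would
# additionally drop words missing from the word2vec vocabulary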
def normalize_documents(stories, normalize_for=('lower', 'alphanumeric'), max_words=40):
    """Normalize all stories in the dictionary, get list of words per sentence."""
    for movie in stories.keys():
        for s, sentence in enumerate(stories[movie]):
            sentence = sentence.lower()
            if 'alphanumeric' in normalize_for:
                sentence = utils.normalize_alphanumeric(sentence)
            # keep at most max_words words per sentence
            sentence = sentence.split(' ')[:max_words]
            stories[movie][s] = sentence
    return stories
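# Toy example (illustrative): the story key and sentences are made up, and this
# assumes utils.normalize_alphanumeric strips punctuation as its name suggests.
toy_story_dict = {'tt0109830': ['Forrest runs across the country!',
                                'He meets Jenny again.']}
toy_story_dict = normalize_documents(toy_story_dict, max_words=5)
# each sentence is now a lowercased list of at most 5 words, e.g.
# ['forrest', 'runs', 'across', 'the', 'country']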
def answer_length(QA):
    """Hasty student answering questions based on the length of the answers."""
    shortest, longest, different = {}, {}, {}
    for qa in QA:
        # get all answer lengths (in characters, after normalization)
        ans_length = np.zeros(5)
        for k, ans in enumerate(qa.answers):
            ans_length[k] = len(utils.normalize_stemming(utils.normalize_alphanumeric(ans)))
        # pick the shortest answer
        shortest.update({qa.qid: np.argmin(ans_length)})
        # pick the longest answer
        longest.update({qa.qid: np.argmax(ans_length)})
        # pick the answer whose length differs most from the mean
        mean_length = np.mean(ans_length)
        different.update({qa.qid: np.argmax(np.abs(ans_length - mean_length))})
    answer_options = {'hasty-shortest': shortest,
                      'hasty-longest': longest,
                      'hasty-different': different}
    return answer_options
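# Sketch of the hasty-student baseline on a single toy QA with five answers, as
# the np.zeros(5) above expects; QAInfo is a hypothetical stand-in for the
# project's QA structure (qid, question, answers) and the text is invented.
from collections import namedtuple
QAInfo = namedtuple('QAInfo', ['qid', 'question', 'answers'])
toy_qa = [QAInfo('test:q1', 'Who wins the fight?',
                 ['Rocky', 'Apollo Creed', 'the champion of the world',
                  'nobody at all', 'both of them'])]
options = answer_length(toy_qa)
print options['hasty-shortest']['test:q1']   # index of the shortest answer
print options['hasty-longest']['test:q1']    # index of the longest answer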