def process_sent(doc, word2vec, vocab, ivocab, word_vector_size, to_return="word2vec", silent=False, encoder_decoder=None, vocab_dict={}):
    document_vector = []
    if to_return == "word2vec":
        document_vector = [process_word(w, word2vec, vocab, ivocab, word_vector_size, to_return, silent=True) for w in doc]
    elif to_return == "skip_thought":
        sentences = punkt_sentences(doc)
        norm_sentences = [normalize.xml_normalize(s) for s in sentences]
        document_vector = [sk.encode(encoder_decoder, norm_sentences)]
    elif to_return == "one_hot":
        # Capture the one-hot encoding so it is actually returned
        document_vector = data_gen.run_onehot(doc, vocab_dict)
    return document_vector
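# Summary of the to_return modes handled above (a reading aid, not part of the original module):
#   'word2vec'     -> one vector per word in doc, looked up via process_word
#   'skip_thought' -> a single skip-thought encoding of the document's normalized sentences (wrapped in a list)
#   'one_hot'      -> a one-hot encoding of doc against vocab_dict, via data_gen.run_onehot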
def build_w2v(trainingdata, min_count=5, window=5, size=100, workers=3, pretrained=False, **kwargs):
    '''
    Fits a Word2Vec topic model based on the training corpus sentences.

    Args:
        trainingdata (list): A list containing the training corpus as parsed JSON text
        min_count (int): ignore all words with total frequency lower than this number
        window (int): maximum distance between the current and predicted word within a sentence
        size (int): dimensionality of the feature vectors
        workers (int): use this many worker threads to train the model (faster training with multicore machines)

    Returns:
        Word2Vec: A pretrained Word2Vec model from Google or a Word2Vec model fit to the training data sentences
    '''

    # Suppress gensim's INFO messages
    logging.getLogger("gensim").setLevel(logging.WARNING)

    # Use Google's pretrained Word2Vec model
    if pretrained:
        # Look at environment variable 'PYTHIA_MODELS_PATH' for user-defined model location
        # If the environment variable is not defined, use the current working directory
        if os.environ.get('PYTHIA_MODELS_PATH') is not None:
            path_to_models = os.environ.get('PYTHIA_MODELS_PATH')
        else:
            path_to_models = os.path.join(os.getcwd(), 'models')

        # Make the directory for the models unless it already exists
        try:
            os.makedirs(path_to_models)
        except OSError as exception:
            if exception.errno != errno.EEXIST:
                raise

        # Look for Google's trained Word2Vec model as a binary or zipped file; report an error and quit if not found
        if os.path.isfile(os.path.join(path_to_models, "GoogleNews-vectors-negative300.bin")):
            w2v_model = gensim.models.Word2Vec.load_word2vec_format(
                os.path.join(path_to_models, "GoogleNews-vectors-negative300.bin"), binary=True)
        elif os.path.isfile(os.path.join(path_to_models, "GoogleNews-vectors-negative300.bin.gz")):
            with gzip.open(os.path.join(path_to_models, "GoogleNews-vectors-negative300.bin.gz"), 'rb') as f_in:
                with open(os.path.join(path_to_models, "GoogleNews-vectors-negative300.bin"), 'wb') as f_out:
                    shutil.copyfileobj(f_in, f_out)
            w2v_model = gensim.models.Word2Vec.load_word2vec_format(
                os.path.join(path_to_models, "GoogleNews-vectors-negative300.bin"), binary=True)
        else:
            print("""Error: Google's pretrained Word2Vec model GoogleNews-vectors-negative300.bin was not found in %s
Set 'pretrained=False' or download/unzip GoogleNews-vectors-negative300.bin.gz
from https://code.google.com/archive/p/word2vec/ into %s""" % (path_to_models, path_to_models), file=sys.stderr)
            quit()

    # Train a Word2Vec model with the corpus
    else:
        sentencearray = []
        for entry in trainingdata:
            sentences = tokenize.punkt_sentences(xml_normalize(entry['body_text']))
            for sentence in sentences:
                words = tokenize.word_punct_tokens(sentence)
                sentencearray.append(words)

        w2v_model = gensim.models.Word2Vec(sentencearray, min_count=min_count, window=window, size=size, workers=workers)

    return w2v_model
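# A minimal usage sketch for build_w2v (hypothetical helper, not part of the original module;
# it assumes each parsed JSON entry exposes its text under 'body_text', as the training branch above expects).
def _build_w2v_example():
    sample_training_data = [
        {'body_text': 'The quick brown fox jumps over the lazy dog. The dog barks back.'},
        {'body_text': 'A second, unrelated document about a fox and a dog.'},
    ]
    # Train a small in-corpus model instead of loading the pretrained Google vectors
    model = build_w2v(sample_training_data, min_count=1, size=50, workers=1)
    return model['dog'][:5]  # first few dimensions of the learned vector for 'dog'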
def run_w2v_matrix(w2v_model, doc, w2v_params, mask_mode):

    # Determine whether only the first and last sentences or all sentences will be used
    if w2v_params.get('mem_w2v_mode', False):
        w2v_mode = w2v_params['mem_w2v_mode']
    else:
        w2v_mode = 'all'

    if w2v_mode == 'all':
        sentences = tokenize.punkt_sentences(doc)
    else:
        sentences = get_first_and_last_sentence(doc)

    normalizedsentences = []
    sentence_mask = []

    for sentence in sentences:
        words = tokenize.word_punct_tokens(sentence)
        if len(sentence_mask) > 0:
            prev_mask = sentence_mask[-1]
        else:
            prev_mask = -1
        sentence_mask.append(prev_mask + len(words))
        normalizedsentences.append(words)

    wordvectorarray = []

    # Look up word vectors in the trained Word2Vec model and build an array of word vectors
    for phrase in normalizedsentences:
        for word in phrase:
            wordvector = None
            try:
                wordvector_ = w2v_model[word]
                wordvector = [float(w) for w in wordvector_]
            except KeyError:
                # Out-of-vocabulary words fall back to a randomly seeded vector
                wordvector = w2v_model.seeded_vector(np.random.rand())
            if wordvector is not None:
                wordvectorarray.append(wordvector)

    if mask_mode == 'sentence':
        mask = sentence_mask
    else:
        mask = np.array([index for index, w in enumerate(wordvectorarray)], dtype=np.int32)

    if len(wordvectorarray) - 1 != mask[-1]:
        print(mask)
        print(np.array(wordvectorarray).shape)
        raise ValueError("Mask does not line up with the word vector array")

    return np.vstack(wordvectorarray), mask
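# Illustration of the sentence mask built above (a reading aid, not part of the original module):
# with sentences of 3 and 4 tokens, the mask holds the index of each sentence's last word
# in the flattened word-vector array.
#
#   sentence tokens : [['a', 'b', 'c'], ['d', 'e', 'f', 'g']]
#   sentence mode   : [2, 6]                     # 0-based index of the final word of each sentence
#   word mode       : [0, 1, 2, 3, 4, 5, 6]      # one entry per word vector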
def get_first_and_last_sentence(doc):
    '''
    Finds the first and last sentence of a document and normalizes them.

    Args:
        doc (str): the text of the document (before any preprocessing)

    Returns:
        array: the first and last sentence after normalizing
    '''
    sentences = tokenize.punkt_sentences(doc)
    first = normalize.xml_normalize(sentences[0])
    last = normalize.xml_normalize(sentences[-1])
    first_and_last = [first, last]
    return first_and_last
def get_first_and_last_sentence(doc):
    '''
    Finds the first and last sentence of a document and normalizes them.

    Args:
        doc (str): the text of the document (before any preprocessing)

    Returns:
        array: the first and last sentence after normalizing
    '''
    sentences = tokenize.punkt_sentences(doc)
    first = normalize.xml_normalize(sentences[0])
    last = normalize.xml_normalize(sentences[-1])

    # Protect against the scenario where the last sentence is mistakenly returned by the parser as empty
    if len(last) == 0:
        i = -2
        while len(last) == 0:
            last = normalize.xml_normalize(sentences[i])
            i -= 1

    first_and_last = [first, last]
    return first_and_last
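# Usage sketch (hypothetical input; the exact strings returned depend on normalize.xml_normalize):
#   get_first_and_last_sentence("First things first. Some filler in the middle. Last but not least.")
#   -> a two-element list: the normalized first sentence and the normalized last sentence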
def test_punkt():
    """Test sentence tokenization"""
    assert tokenize.punkt_sentences("S1. S2. S3! S4!!!") == ["S1.", "S2.", "S3!", "S4!!", "!"]
    assert tokenize.punkt_sentences("S1. S4!!!") == ["S1.", "S4!!", "!"]
def gen_observations(all_clusters, lookup_order, document_data, features, parameters, vocab, full_vocab,
                     encoder_decoder, lda_model, tf_session, w2v_model, hdf5_path=None, dtype=np.float32):
    '''
    Generates observations for each cluster found in the JSON file and calculates the specified features.

    Args:
        all_clusters (set): cluster IDs
        lookup_order (dict): document arrival order
        document_data (array): parsed JSON documents
        features (dict): the specified features to be calculated
        parameters (dict): data structure with run parameters
        vocab (dict): the vocabulary of the data set
        full_vocab (dict): the vocabulary of the data set including stop words and punctuation
        encoder_decoder (???): the encoder/decoder for skipthoughts vectors
        lda_model (sklearn.???): trained LDA model
        tf_session: active TensorFlow session
        w2v_model (gensim.word2vec): trained word2vec model

    Returns:
        data (list): for each observation, the features of the document vs the corpus, which may include
            tfidf sum, cosine similarity, bag of words vectors, skip thoughts, lda, w2v or onehot cnn encoding
        labels (list): the labels for each document, where one is novel and zero is duplicate
    '''

    # Prepare to store results of feature assessments
    data = list()
    labels = list()

    # mem_net_features is used when the mem_net algorithm is run
    # It consists of inputs, labels (answers), input_masks and questions for each entry
    mem_net_features = {}
    inputs = []
    input_masks = []
    questions = []

    # Sentence punctuation delimiters
    punkt = ['.', '?', '!']

    corpus_unprocessed = list()

    # HDF5-related parameters
    hdf5_save_frequency = parameters['hdf5_save_frequency']
    data_key = 'data'
    labels_key = 'labels'

    # Truncate any existing files at the save location, or return early if using existing files
    if hdf5_path is not None:
        if parameters['hdf5_use_existing'] and os.path.isfile(hdf5_path):
            return hdf5_path, hdf5_path
        open(hdf5_path, 'w').close()

    # Create random state
    random_state = np.random.RandomState(parameters['seed'])

    # Iterate through clusters found in the JSON file, generating observations that pair data and label
    for cluster in all_clusters:
        # Determine arrival order in this cluster
        sorted_entries = [x[1] for x in sorted(lookup_order[cluster], key=lambda x: x[0])]
        observations = [document_data[sorted_entries[0]]]
        for index in sorted_entries[1:]:
            next_doc = document_data[index]
            observations.append(next_doc)
            labeled_observation = {'novelty': next_doc['novelty'],
                                   'data': copy.copy(observations)}
            corpus_unprocessed.append(labeled_observation)

    # Resample if necessary
    # If oversampling (with or without replacement), sample up to the larger class size for both classes, with replacement
    # If undersampling, sample down to the smaller class size for both classes, with or without replacement
    if 'resampling' in parameters:
        resampling_parameters = parameters['resampling']
        if resampling_parameters.get('over', False):
            desired_size = None
            resampling_parameters['replacement'] = True
        else:
            desired_size = -np.Inf
        if resampling_parameters.get('replacement', False):
            replacement = True
        else:
            replacement = False
        logger.debug("Replacement: {}, Desired size: {}".format(replacement, desired_size))
        logger.debug("Size of data: {}, Number of clusters: {}".format(len(corpus_unprocessed), len(all_clusters)))
        corpus = sampling.label_sample(corpus_unprocessed, "novelty", replacement, desired_size, random_state)
    else:
        corpus = corpus_unprocessed

    # Featurize each observation
    # Some duplication of effort here because documents will appear multiple times across observations
    clusterids = []
    postids = []
    for case in corpus:
        # Create raw and normalized document arrays
        case_docs_raw = [record['body_text'] for record in case['data']]
        case_docs_normalized = [normalize.xml_normalize(body_text) for body_text in case_docs_raw]
        case_docs_no_stop_words = [normalize.normalize_and_remove_stop_words(body_text) for body_text in case_docs_raw]

        # Create ids for individual data points
        postid = [record['post_id'] for record in case['data']][-1]
        postids.append(postid)
        clusterid = [record['cluster_id'] for record in case['data']][0]
        clusterids.append(clusterid)

        # Pull out query documents
        doc_raw = case_docs_raw[-1]
        doc_normalized = case_docs_normalized[-1]
        doc_no_stop_words = case_docs_no_stop_words[-1]

        # Create lists of background documents
        bkgd_docs_raw = case_docs_raw[:-1]
        bkgd_docs_normalized = case_docs_normalized[:-1]
        bkgd_docs_no_stop_words = case_docs_no_stop_words[:-1]
        bkgd_text_raw = '\n'.join(bkgd_docs_raw)
        bkgd_text_normalized = '\n'.join(bkgd_docs_normalized)
        bkgd_text_no_stop_words = '\n'.join(bkgd_docs_no_stop_words)

        feature_vectors = list()

        if 'mem_net' in features:
            # Get all sentences for the memory network algorithm
            bkgd_sentences_full = tokenize.punkt_sentences(bkgd_text_raw)
            doc_input, doc_questions, doc_masks = gen_mem_net_observations(doc_raw, bkgd_text_raw, bkgd_sentences_full,
                                                                           features['mem_net'], vocab, full_vocab,
                                                                           w2v_model, encoder_decoder)

            # Now add all of the input docs to the primary list
            inputs.append(doc_input)
            questions.append(doc_questions)
            input_masks.append(doc_masks)

        else:
            if 'bow' in features:
                feature_vectors = bow(doc_no_stop_words, bkgd_text_no_stop_words, bkgd_docs_no_stop_words,
                                      vocab, features['bow'], feature_vectors)

            if 'st' in features:
                sentences = []
                for doc in bkgd_docs_raw:
                    for item in get_first_and_last_sentence(doc):
                        sentences.append(item)
                feature_vectors = st(doc_raw, sentences, encoder_decoder, features['st'], feature_vectors)

            if 'lda' in features:
                feature_vectors = lda(doc_no_stop_words, bkgd_text_no_stop_words, vocab, lda_model,
                                      features['lda'], feature_vectors)

            if 'w2v' in features:
                feature_vectors = w2v(doc_normalized, bkgd_docs_normalized, w2v_model, features['w2v'], feature_vectors)

            if 'cnn' in features:
                feature_vectors = run_cnn(normalize.xml_normalize(doc_raw), normalize.xml_normalize(bkgd_text_raw), tf_session)

            if 'wordonehot' in features:
                feature_vectors = wordonehot(doc_raw, bkgd_text_raw, full_vocab, features['wordonehot'], feature_vectors)

            # Save features and label
            feature_vectors = np.concatenate(feature_vectors, axis=0).astype(dtype)

            # Fail catastrophically on zero vector (not sure if we need this)
            #assert not (feature_vectors < 0.0001).all()
            data.append(feature_vectors)

        if case["novelty"]:
            labels.append(1)
        else:
            labels.append(0)

        # Save to HDF5 if desired
        if hdf5_path is not None and len(data) % hdf5_save_frequency == 0:
            with h5py.File(hdf5_path, 'a') as h5:
                data_np = np.array(data)
                labels_np = np.reshape(np.array(labels), (-1, 1))
                add_to_hdf5(h5, data_np, data_key)
                add_to_hdf5(h5, labels_np, labels_key, np.uint8)
                labels = list()
                data = list()

    # Save off any remainder
    if hdf5_path is not None and len(data) > 0:
        with h5py.File(hdf5_path, 'a') as h5:
            data_np = np.array(data)
            labels_np = np.reshape(np.array(labels), (-1, 1))
            add_to_hdf5(h5, data_np, data_key)
            add_to_hdf5(h5, labels_np, labels_key, np.uint8)

    mem_net_features['inputs'] = inputs
    mem_net_features['questions'] = questions
    mem_net_features['input_masks'] = input_masks
    mem_net_features['answers'] = labels

    ids = ["C" + str(clusterid) + "_P" + str(postid) for clusterid, postid in zip(clusterids, postids)]

    if 'mem_net' in features:
        return mem_net_features, labels, ids
    if hdf5_path is not None:
        return hdf5_path, hdf5_path, ids
    else:
        return data, labels, ids
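# Return-shape summary for gen_observations (a reading aid, not part of the original module):
#   'mem_net' in features        -> (mem_net_features dict, labels, ids)
#   hdf5_path given (no mem_net) -> (hdf5_path, hdf5_path, ids); features and labels live in the HDF5 file
#                                   (or just (hdf5_path, hdf5_path) when an existing file is reused)
#   otherwise                    -> (data, labels, ids) held in memory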
def gen_mem_net_observations(raw_doc, raw_corpus, sentences_full, mem_net_params, vocab, full_vocab, w2v_model, encoder_decoder):
    '''
    Generates observations to be fed into the mem_net code

    Args:
        raw_doc (string): the raw document text
        raw_corpus (str): the raw corpus text
        sentences_full (list): list of all sentences in the corpus
        mem_net_params (dict): the specified features to be calculated for mem_net
        vocab (dict): the vocabulary of the data set
        full_vocab (dict): the vocabulary of the data set including stop words and punctuation
        w2v_model: the word2vec model of the data set
        encoder_decoder (???): the encoder/decoder for skipthoughts vectors

    Returns:
        doc_input (array): the corpus data, known in mem_nets as the input
        doc_questions: the document data, known in mem_nets as the question
        doc_masks: the mask for the input data - tells mem_net where the end of each input is;
            this can be per word or at the end of each sentence
    '''

    # Use the specified mask mode where available
    if mem_net_params.get('mask_mode', False):
        mask_mode = mem_net_params["mask_mode"]
    else:
        mask_mode = 'sentence'

    if mem_net_params.get('embed_mode', False):
        embed_mode = mem_net_params['embed_mode']
    else:
        embed_mode = 'word2vec'

    if embed_mode == 'skip_thought':
        from src.featurizers.skipthoughts import skipthoughts as sk
        doc_sentences = tokenize.punkt_sentences(raw_doc)

        # Ensure that the document and corpus are long enough, and if not, make them long enough
        if len(sentences_full) == 1:
            #print("short corpus")
            sentences_full.extend(sentences_full)
        if len(doc_sentences) == 1:
            #print("short doc")
            doc_sentences.extend(doc_sentences)

        corpus_vectors = sk.encode(encoder_decoder, sentences_full)
        doc_vectors = sk.encode(encoder_decoder, doc_sentences)

        # Since each entry is a sentence, we use the index of each entry for the mask
        # We cannot use a word mode in this embedding
        doc_masks = [index for index, w in enumerate(corpus_vectors)]
        doc_questions = doc_vectors
        doc_input = corpus_vectors

    elif embed_mode == 'onehot':
        min_length = None
        max_length = None
        if mem_net_params.get('onehot_min_len', False):
            min_length = mem_net_params['onehot_min_len']
        if mem_net_params.get('onehot_max_len', False):
            max_length = mem_net_params['onehot_max_len']
        onehot_vocab = full_vocab

        # Preprocess and tokenize background documents
        corpus_tokens = tokenize.word_punct_tokens(normalize.xml_normalize(raw_corpus))
        corpus_tokens = strip_to_vocab(corpus_tokens, onehot_vocab)
        corpus_indices = encode_doc(corpus_tokens, onehot_vocab)

        # Get sentence mask indices
        assert {'.', ',', '!', '?'} <= onehot_vocab.keys()  # ensure that you are using a vocabulary with punctuation
        sentence_mask = get_mask(corpus_indices, onehot_vocab, max_length=max_length)

        # One-hot encode documents with masks, and the query document
        corpus_encoded = onehot_encode(corpus_indices, len(onehot_vocab))
        corpus_vectors = run_onehot(corpus_encoded, onehot_vocab, min_length, max_length, already_encoded=True)

        # Tokenize and one-hot encode the query document
        doc_vectors = run_onehot(tokenize.word_punct_tokens(normalize.xml_normalize(raw_doc)),
                                 onehot_vocab, min_length, max_length)

        doc_questions = doc_vectors.T
        doc_input = corpus_vectors.T

        if mask_mode == 'sentence':
            doc_masks = sentence_mask
        else:
            doc_masks = [index for index, w in enumerate(doc_input)]

    elif embed_mode == 'word2vec':
        corpus_vectors, doc_masks = run_w2v_matrix(w2v_model, raw_corpus, mem_net_params, mask_mode)
        doc_vectors, _ = run_w2v_matrix(w2v_model, raw_doc, mem_net_params, mask_mode)

        if len(corpus_vectors) > 0 and len(doc_vectors) > 0:
            doc_questions = doc_vectors
            doc_input = corpus_vectors

    return doc_input, doc_questions, doc_masks
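# A sketch of the mem_net_params keys this function reads (values are illustrative, not
# defaults from the original configuration):
#   mem_net_params = {
#       'embed_mode': 'word2vec',   # or 'skip_thought' / 'onehot'; defaults to 'word2vec'
#       'mask_mode': 'sentence',    # anything else falls back to per-word masks; defaults to 'sentence'
#       'onehot_min_len': 10,       # only used by the 'onehot' embedding
#       'onehot_max_len': 1000,     # only used by the 'onehot' embedding
#       'mem_w2v_mode': 'all',      # read by run_w2v_matrix in the 'word2vec' embedding
#   }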
def gen_observations(all_clusters, lookup_order, document_data, features, parameters, vocab, full_vocab,
                     encoder_decoder, lda_model, tf_session, w2v_model):
    '''
    Generates observations for each cluster found in the JSON file and calculates the specified features.

    Args:
        all_clusters (set): cluster IDs
        lookup_order (dict): document arrival order
        document_data (array): parsed JSON documents
        features (dict): the specified features to be calculated
        parameters (???): data structure with run parameters
        vocab (dict): the vocabulary of the data set
        full_vocab (dict): the vocabulary of the data set including stop words and punctuation
        encoder_decoder (???): the encoder/decoder for skipthoughts vectors
        lda_model (sklearn.???): trained LDA model
        tf_session: active TensorFlow session
        w2v_model (gensim.word2vec): trained word2vec model

    Returns:
        data (list): for each observation, the features of the document vs the corpus, which may include
            tfidf sum, cosine similarity, bag of words vectors, skip thoughts, lda, w2v or onehot cnn encoding
        labels (list): the labels for each document, where one is novel and zero is duplicate
    '''

    # Prepare to store results of feature assessments
    data = list()
    labels = list()

    # mem_net_features is used when the mem_net algorithm is run
    # It consists of inputs, labels (answers), input_masks and questions for each entry
    mem_net_features = {}
    inputs = []
    input_masks = []
    questions = []

    # Sentence punctuation delimiters
    punkt = ['.', '?', '!']

    corpus_unprocessed = list()

    # Create random state
    random_state = np.random.RandomState(parameters['seed'])

    # Iterate through clusters found in the JSON file, generating observations that pair data and label
    for cluster in all_clusters:
        # Determine arrival order in this cluster
        sorted_entries = [x[1] for x in sorted(lookup_order[cluster], key=lambda x: x[0])]
        observations = [document_data[sorted_entries[0]]]
        for index in sorted_entries[1:]:
            next_doc = document_data[index]
            observations.append(next_doc)
            labeled_observation = {'novelty': next_doc['novelty'],
                                   'data': copy.copy(observations)}
            corpus_unprocessed.append(labeled_observation)

    # Resample if necessary
    # If oversampling (with or without replacement), sample up to the larger class size for both classes, with replacement
    # If undersampling, sample down to the smaller class size for both classes, with or without replacement
    if 'resampling' in parameters:
        if 'over' in parameters:
            desired_size = None
            parameters['replacement'] = True
        else:
            desired_size = -np.Inf
        if 'replacement' in parameters:
            replacement = True
        else:
            replacement = False
        corpus = sampling.label_sample(corpus_unprocessed, "novelty", replacement, desired_size, random_state)
    else:
        corpus = corpus_unprocessed

    # Featurize each observation
    # Some duplication of effort here because documents will appear multiple times across observations
    clusterids = []
    postids = []
    for case in corpus:
        # Create raw and normalized document arrays
        case_docs_raw = [record['body_text'] for record in case['data']]
        case_docs_normalized = [normalize.normalize_and_remove_stop_words(body_text) for body_text in case_docs_raw]

        # Create ids for individual data points
        postid = [record['post_id'] for record in case['data']][-1]
        postids.append(postid)
        clusterid = [record['cluster_id'] for record in case['data']][0]
        clusterids.append(clusterid)

        # Pull out query documents
        doc_raw = case_docs_raw[-1]
        doc_normalized = case_docs_normalized[-1]

        # Create lists of background documents
        bkgd_docs_raw = case_docs_raw[:-1]
        bkgd_docs_normalized = case_docs_normalized[:-1]
        bkgd_text_raw = '\n'.join(bkgd_docs_raw)
        bkgd_text_normalized = '\n'.join(bkgd_docs_normalized)

        feature_vectors = list()

        if 'mem_net' in features:
            # Get all sentences for the memory network algorithm
            bkgd_sentences_full = tokenize.punkt_sentences(bkgd_text_raw)
            doc_input, doc_questions, doc_masks = gen_mem_net_observations(doc_raw, bkgd_text_raw, bkgd_sentences_full,
                                                                           features['mem_net'], vocab, full_vocab,
                                                                           w2v_model, encoder_decoder)

            # Now add all of the input docs to the primary list
            inputs.append(doc_input)
            questions.append(doc_questions)
            input_masks.append(doc_masks)

        else:
            if 'bow' in features:
                feature_vectors = bow(doc_normalized, bkgd_text_raw, bkgd_docs_normalized, vocab,
                                      features['bow'], feature_vectors)

            if 'st' in features:
                sentences = []
                for doc in bkgd_docs_raw:
                    for item in get_first_and_last_sentence(doc):
                        sentences.append(item)
                feature_vectors = st(doc_raw, sentences, encoder_decoder, features['st'], feature_vectors)

            if 'lda' in features:
                feature_vectors = lda(doc_normalized, bkgd_text_normalized, vocab, lda_model,
                                      features['lda'], feature_vectors)

            if 'w2v' in features:
                feature_vectors = w2v(doc_raw, bkgd_text_normalized, w2v_model, features['w2v'], feature_vectors)

            if 'cnn' in features:
                feature_vectors = run_cnn(doc_normalized, bkgd_text_normalized, tf_session)

            if 'wordonehot' in features:
                feature_vectors = wordonehot(doc_raw, bkgd_text_raw, full_vocab, features['wordonehot'], feature_vectors)

            # Save features and label
            feature_vectors = np.concatenate(feature_vectors, axis=0)
            data.append(feature_vectors)

        if case["novelty"]:
            labels.append(1)
        else:
            labels.append(0)

    mem_net_features['inputs'] = inputs
    mem_net_features['questions'] = questions
    mem_net_features['input_masks'] = input_masks
    mem_net_features['answers'] = labels

    ids = ["C" + str(clusterid) + "_P" + str(postid) for clusterid, postid in zip(clusterids, postids)]

    if 'mem_net' in features:
        return mem_net_features, labels, ids
    else:
        return data, labels, ids
def gen_observations(all_clusters, lookup_order, document_data, features, parameters, vocab, full_vocab,
                     encoder_decoder, lda_model, tf_session, w2v_model):
    '''
    Generates observations for each cluster found in the JSON file and calculates the specified features.

    Args:
        all_clusters (set): cluster IDs
        lookup_order (dict): document arrival order
        document_data (array): parsed JSON documents
        features (dict): the specified features to be calculated
        parameters (???): data structure with run parameters
        vocab (dict): the vocabulary of the data set
        full_vocab (dict): the vocabulary of the data set including stop words and punctuation
        encoder_decoder (???): the encoder/decoder for skipthoughts vectors
        lda_model (sklearn.???): trained LDA model
        tf_session: active TensorFlow session
        w2v_model (gensim.word2vec): trained word2vec model

    Returns:
        data (list): for each observation, the features of the document vs the corpus, which may include
            tfidf sum, cosine similarity, bag of words vectors, skip thoughts, lda, w2v or onehot cnn encoding
        labels (list): the labels for each document, where one is novel and zero is duplicate
    '''

    # Prepare to store results of feature assessments
    data = list()
    labels = list()

    # mem_net_features is used when the mem_net algorithm is run
    # It consists of inputs, labels (answers), input_masks and questions for each entry
    mem_net_features = {}
    inputs = []
    input_masks = []
    questions = []

    # Sentence punctuation delimiters
    punkt = ['.', '?', '!']

    corpus_unprocessed = list()

    # Create random state
    random_state = np.random.RandomState(parameters['seed'])

    # Iterate through clusters found in the JSON file, generating observations that pair data and label
    for cluster in all_clusters:
        # Determine arrival order in this cluster
        sorted_entries = [x[1] for x in sorted(lookup_order[cluster], key=lambda x: x[0])]
        observations = [document_data[sorted_entries[0]]]
        for index in sorted_entries[1:]:
            next_doc = document_data[index]
            observations.append(next_doc)
            labeled_observation = {'novelty': next_doc['novelty'],
                                   'data': copy.copy(observations)}
            corpus_unprocessed.append(labeled_observation)

    # Resample if necessary
    # If oversampling (with or without replacement), sample up to the larger class size for both classes, with replacement
    # If undersampling, sample down to the smaller class size for both classes, with or without replacement
    if 'resampling' in parameters:
        if 'over' in parameters:
            desired_size = None
            parameters['replacement'] = True
        else:
            desired_size = -np.Inf
        if 'replacement' in parameters:
            replacement = True
        else:
            replacement = False
        corpus = sampling.label_sample(corpus_unprocessed, "novelty", replacement, desired_size, random_state)
    else:
        corpus = corpus_unprocessed

    # Featurize each observation
    # Some duplication of effort here because documents will appear multiple times across observations
    for case in corpus:
        # Create raw and normalized document arrays
        case_docs_raw = [record['body_text'] for record in case['data']]
        case_docs_normalized = [normalize.normalize_and_remove_stop_words(body_text) for body_text in case_docs_raw]

        # Pull out query documents
        doc_raw = case_docs_raw[-1]
        doc_normalized = case_docs_normalized[-1]

        # Create lists of background documents
        bkgd_docs_raw = case_docs_raw[:-1]
        bkgd_docs_normalized = case_docs_normalized[:-1]
        bkgd_text_raw = '\n'.join(bkgd_docs_raw)
        bkgd_text_normalized = '\n'.join(bkgd_docs_normalized)

        feature_vectors = list()

        if 'mem_net' in features:
            # Get all sentences for the memory network algorithm
            bkgd_sentences_full = tokenize.punkt_sentences(bkgd_text_raw)
            doc_input, doc_questions, doc_masks = gen_mem_net_observations(doc_raw, bkgd_text_raw, bkgd_sentences_full,
                                                                           features['mem_net'], vocab, full_vocab,
                                                                           w2v_model, encoder_decoder)

            # Now add all of the input docs to the primary list
            inputs.append(doc_input)
            questions.append(doc_questions)
            input_masks.append(doc_masks)

        else:
            if 'bow' in features:
                feature_vectors = bow(doc_normalized, bkgd_text_raw, bkgd_docs_normalized, vocab,
                                      features['bow'], feature_vectors)

            if 'st' in features:
                sentences = [get_first_and_last_sentence(doc) for doc in bkgd_docs_raw]
                feature_vectors = st(doc_raw, sentences, encoder_decoder, features['st'], feature_vectors)

            if 'lda' in features:
                feature_vectors = lda(doc_normalized, bkgd_text_normalized, vocab, lda_model,
                                      features['lda'], feature_vectors)

            if 'w2v' in features:
                feature_vectors = w2v(doc_raw, bkgd_text_normalized, w2v_model, features['w2v'], feature_vectors)

            if 'cnn' in features:
                feature_vectors = run_cnn(doc_normalized, bkgd_text_normalized, tf_session)

            if 'wordonehot' in features:
                feature_vectors = wordonehot(doc_raw, bkgd_text_raw, full_vocab, features['wordonehot'], feature_vectors)

            # Save features and label
            feature_vectors = np.concatenate(feature_vectors, axis=0)
            data.append(feature_vectors)

        if case["novelty"]:
            labels.append(1)
        else:
            labels.append(0)

    mem_net_features['inputs'] = inputs
    mem_net_features['questions'] = questions
    mem_net_features['input_masks'] = input_masks
    mem_net_features['answers'] = labels

    if 'mem_net' in features:
        return mem_net_features, labels
    else:
        return data, labels