def gen_vocab(corpus_dict, vocab=1000, stem=False, **kwargs):
    '''
    Generates a dictionary of words to be used as the vocabulary in features
    that utilize bag of words.

    Args:
        corpus_dict (OrderedDict): an ordered dictionary of the most frequently
            occurring tokens in the corpus
        vocab (int): the number of words to be used in the vocabulary
        stem (bool): whether to stem tokens before adding them to the vocabulary

    Returns:
        dict: a dictionary of size vocab that maps the most frequent normalized,
            non-stop words in the corpus to integer indices
    '''
    index = 0
    vocabdict = dict()
    for word in corpus_dict:
        if len(vocabdict) < vocab:
            cleantext = normalize_and_remove_stop_words(word, stem)
            if cleantext != '':
                if cleantext not in vocabdict:
                    vocabdict[cleantext] = index
                    index += 1
        else:
            break
    return vocabdict
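# A minimal usage sketch for gen_vocab (hypothetical data). It assumes
# normalize_and_remove_stop_words is importable in this module, as gen_vocab
# itself requires; the tokens and vocabulary size below are made up.
def _example_gen_vocab():
    from collections import Counter, OrderedDict

    # Order tokens by frequency before handing them to gen_vocab
    token_counts = Counter("the cat sat on the mat and the cat slept".split())
    corpus_dict = OrderedDict(token_counts.most_common())

    # Keep at most 100 normalized, non-stop tokens
    vocabdict = gen_vocab(corpus_dict, vocab=100)
    return vocabdict  # e.g. {'cat': 0, 'sat': 1, 'mat': 2, ...}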
def build_lda(trainingdata, vocabdict, topics=40, random_state=0, **kwargs):
    '''
    Fits an LDA topic model based on the corpus vocabulary.

    Args:
        trainingdata (list): a list containing the corpus as parsed JSON text
        vocabdict (dict): a dictionary containing the vocabulary to be used in the LDA model
        topics (int): the number of topics to be used in the LDA model
        random_state (int or np.random.RandomState): seed value or random number generator state

    Returns:
        LatentDirichletAllocation: an LDA model fit to the training data and corpus vocabulary
    '''
    vectorizer = CountVectorizer(analyzer="word", vocabulary=vocabdict)
    trainingdocs = []
    for entry in trainingdata:
        trainingdocs.append(normalize_and_remove_stop_words(entry['body_text']))
    trainingvectors = vectorizer.transform(trainingdocs)
    lda_model = LatentDirichletAllocation(n_topics=topics, random_state=random_state)
    lda_model.fit(trainingvectors)
    return lda_model
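# A minimal usage sketch for build_lda (hypothetical documents). It assumes
# CountVectorizer and LatentDirichletAllocation are imported from
# sklearn.feature_extraction.text and sklearn.decomposition respectively.
# Note that newer scikit-learn releases use n_components in place of the
# n_topics argument used above, so build_lda as written targets older releases.
def _example_build_lda():
    from collections import Counter, OrderedDict

    trainingdata = [
        {'body_text': 'Cats pounce on the toy mouse.'},
        {'body_text': 'The stock market fell sharply today.'},
    ]
    tokens = ' '.join(d['body_text'] for d in trainingdata).split()
    vocabdict = gen_vocab(OrderedDict(Counter(tokens).most_common()), vocab=100)
    lda_model = build_lda(trainingdata, vocabdict, topics=5, random_state=0)
    return lda_model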
def gen_observations(all_clusters, lookup_order, document_data, features, parameters, vocab,
                     full_vocab, encoder_decoder, lda_model, tf_session, w2v_model):
    '''
    Generates observations for each cluster found in the JSON file and calculates
    the specified features.

    Args:
        all_clusters (set): cluster IDs
        lookup_order (dict): document arrival order
        document_data (array): parsed JSON documents
        features (dict): the specified features to be calculated
        parameters (dict): data structure with run parameters
        vocab (dict): the vocabulary of the data set
        full_vocab (dict): the vocabulary of the data set including stop words and punctuation
        encoder_decoder: the encoder/decoder for skip-thought vectors
        lda_model (sklearn.decomposition.LatentDirichletAllocation): trained LDA model
        tf_session: active TensorFlow session
        w2v_model (gensim.models.Word2Vec): trained word2vec model

    Returns:
        data (list): for each observation, the features of the document vs. the corpus,
            which may include tfidf sum, cosine similarity, bag of words vectors,
            skip thoughts, lda, w2v, or one-hot CNN encoding
        labels (list): the labels for each document, where one is novel and zero is duplicate
    '''
    # Prepare to store results of feature assessments
    data = list()
    labels = list()

    # mem_net_features is used when the mem_net algorithm is run
    # It consists of inputs, labels (answers), input_masks and questions for each entry
    mem_net_features = {}
    inputs = []
    input_masks = []
    questions = []

    # Sentence punctuation delimiters
    punkt = ['.', '?', '!']

    corpus_unprocessed = list()

    # Create random state
    random_state = np.random.RandomState(parameters['seed'])

    # Iterate through clusters found in JSON file, generate observations
    # pairing data and label
    for cluster in all_clusters:
        # Determine arrival order in this cluster
        sorted_entries = [x[1] for x in sorted(lookup_order[cluster], key=lambda x: x[0])]
        observations = [document_data[sorted_entries[0]]]
        for index in sorted_entries[1:]:
            next_doc = document_data[index]
            observations.append(next_doc)
            labeled_observation = {'novelty': next_doc['novelty'],
                                   'data': copy.copy(observations)}
            corpus_unprocessed.append(labeled_observation)

    # Resample if necessary
    # If oversampling +/- replacement, sample up to larger class size for both
    # classes, with replacement
    # If undersampling, sample down to smaller class size for both classes with
    # or w/o replacement
    if 'resampling' in parameters:
        if 'over' in parameters:
            desired_size = None
            parameters['replacement'] = True
        else:
            desired_size = -np.Inf
        if 'replacement' in parameters:
            replacement = True
        else:
            replacement = False
        corpus = sampling.label_sample(corpus_unprocessed, "novelty", replacement, desired_size, random_state)
    else:
        corpus = corpus_unprocessed

    # Featurize each observation
    # Some duplication of effort here bc docs will appear multiple times across observations
    for case in corpus:
        # Create raw and normalized document arrays
        case_docs_raw = [record['body_text'] for record in case['data']]
        case_docs_normalized = [normalize.normalize_and_remove_stop_words(body_text)
                                for body_text in case_docs_raw]

        # Pull out query documents
        doc_raw = case_docs_raw[-1]
        doc_normalized = case_docs_normalized[-1]

        # Create lists of background documents
        bkgd_docs_raw = case_docs_raw[:-1]
        bkgd_docs_normalized = case_docs_normalized[:-1]
        bkgd_text_raw = '\n'.join(bkgd_docs_raw)
        bkgd_text_normalized = '\n'.join(bkgd_docs_normalized)

        feature_vectors = list()

        if 'mem_net' in features:
            # Get all sentences for the memory network algorithm
            bkgd_sentences_full = tokenize.punkt_sentences(bkgd_text_raw)
            doc_input, doc_questions, doc_masks = gen_mem_net_observations(
                doc_raw, bkgd_text_raw, bkgd_sentences_full, features['mem_net'],
                vocab, full_vocab, w2v_model, encoder_decoder)
            # Now add all of the input docs to the primary list
            inputs.append(doc_input)
            questions.append(doc_questions)
            input_masks.append(doc_masks)
        else:
            if 'bow' in features:
                feature_vectors = bow(doc_normalized, bkgd_text_raw, bkgd_docs_normalized,
                                      vocab, features['bow'], feature_vectors)
            if 'st' in features:
                sentences = [get_first_and_last_sentence(doc) for doc in bkgd_docs_raw]
                feature_vectors = st(doc_raw, sentences, encoder_decoder,
                                     features['st'], feature_vectors)
            if 'lda' in features:
                feature_vectors = lda(doc_normalized, bkgd_text_normalized, vocab,
                                      lda_model, features['lda'], feature_vectors)
            if 'w2v' in features:
                feature_vectors = w2v(doc_raw, bkgd_text_normalized, w2v_model,
                                      features['w2v'], feature_vectors)
            if 'cnn' in features:
                feature_vectors = run_cnn(doc_normalized, bkgd_text_normalized, tf_session)
            if 'wordonehot' in features:
                feature_vectors = wordonehot(doc_raw, bkgd_text_raw, full_vocab,
                                             features['wordonehot'], feature_vectors)
            # Save features and label
            feature_vectors = np.concatenate(feature_vectors, axis=0)
            data.append(feature_vectors)

        if case["novelty"]:
            labels.append(1)
        else:
            labels.append(0)

    mem_net_features['inputs'] = inputs
    mem_net_features['questions'] = questions
    mem_net_features['input_masks'] = input_masks
    mem_net_features['answers'] = labels

    if 'mem_net' in features:
        return mem_net_features, labels
    else:
        return data, labels
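# A hedged sketch of the `features` and `parameters` arguments the
# gen_observations variant above inspects, assembled from the keys the
# function actually reads. The values are illustrative placeholders, not
# project defaults; each feature entry is passed straight through to the
# corresponding featurizer.
_example_parameters = {
    'seed': 41,           # seeds np.random.RandomState used for resampling
    'resampling': True,   # presence of this key enables label resampling
    'over': True,         # this variant reads 'over'/'replacement' at the top level
}
_example_features = {
    'bow': {},            # bag-of-words features
    'st': {},             # skip-thought vectors
    'lda': {},            # LDA topic distribution
    'w2v': {},            # word2vec features
    'wordonehot': {},     # one-hot word encodings
    # 'cnn' and 'mem_net' are also recognized; 'mem_net' switches the return
    # value to (mem_net_features, labels).
}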
def gen_observations(all_clusters, lookup_order, document_data, features, parameters, vocab,
                     full_vocab, encoder_decoder, lda_model, tf_session, w2v_model,
                     hdf5_path=None, dtype=np.float32):
    '''
    Generates observations for each cluster found in the JSON file and calculates
    the specified features.

    Args:
        all_clusters (set): cluster IDs
        lookup_order (dict): document arrival order
        document_data (array): parsed JSON documents
        features (dict): the specified features to be calculated
        parameters (dict): data structure with run parameters
        vocab (dict): the vocabulary of the data set
        full_vocab (dict): the vocabulary of the data set including stop words and punctuation
        encoder_decoder: the encoder/decoder for skip-thought vectors
        lda_model (sklearn.decomposition.LatentDirichletAllocation): trained LDA model
        tf_session: active TensorFlow session
        w2v_model (gensim.models.Word2Vec): trained word2vec model
        hdf5_path (str): optional path for saving features and labels to HDF5; if given,
            the path is returned in place of the in-memory lists
        dtype (np.dtype): dtype used when concatenating feature vectors

    Returns:
        data (list): for each observation, the features of the document vs. the corpus,
            which may include tfidf sum, cosine similarity, bag of words vectors,
            skip thoughts, lda, w2v, or one-hot CNN encoding
        labels (list): the labels for each document, where one is novel and zero is duplicate
        ids (list): a "C<cluster_id>_P<post_id>" identifier for each observation
    '''
    # Prepare to store results of feature assessments
    data = list()
    labels = list()

    # mem_net_features is used when the mem_net algorithm is run
    # It consists of inputs, labels (answers), input_masks and questions for each entry
    mem_net_features = {}
    inputs = []
    input_masks = []
    questions = []

    # Sentence punctuation delimiters
    punkt = ['.', '?', '!']

    corpus_unprocessed = list()

    # HDF5-related parameters
    hdf5_save_frequency = parameters['hdf5_save_frequency']
    data_key = 'data'
    labels_key = 'labels'

    # Truncate any existing files at save location, or return early if
    # using existing files
    if hdf5_path is not None:
        if parameters['hdf5_use_existing'] and os.path.isfile(hdf5_path):
            return hdf5_path, hdf5_path
        open(hdf5_path, 'w').close()

    # Create random state
    random_state = np.random.RandomState(parameters['seed'])

    # Iterate through clusters found in JSON file, generate observations
    # pairing data and label
    for cluster in all_clusters:
        # Determine arrival order in this cluster
        sorted_entries = [x[1] for x in sorted(lookup_order[cluster], key=lambda x: x[0])]
        observations = [document_data[sorted_entries[0]]]
        for index in sorted_entries[1:]:
            next_doc = document_data[index]
            observations.append(next_doc)
            labeled_observation = {'novelty': next_doc['novelty'],
                                   'data': copy.copy(observations)}
            corpus_unprocessed.append(labeled_observation)

    # Resample if necessary
    # If oversampling +/- replacement, sample up to larger class size for both
    # classes, with replacement
    # If undersampling, sample down to smaller class size for both classes with
    # or w/o replacement
    if 'resampling' in parameters:
        resampling_parameters = parameters['resampling']
        if resampling_parameters.get('over', False):
            desired_size = None
            resampling_parameters['replacement'] = True
        else:
            desired_size = -np.Inf
        if resampling_parameters.get('replacement', False):
            replacement = True
        else:
            replacement = False
        logger.debug("Replacement: {}, Desired size: {}".format(replacement, desired_size))
        logger.debug("Size of data: {}, Number of clusters: {}".format(len(corpus_unprocessed), len(all_clusters)))
        corpus = sampling.label_sample(corpus_unprocessed, "novelty", replacement, desired_size, random_state)
    else:
        corpus = corpus_unprocessed

    # Featurize each observation
    # Some duplication of effort here bc docs will appear multiple times across observations
    clusterids = []
    postids = []
    for case in corpus:
        # Create raw and normalized document arrays
        case_docs_raw = [record['body_text'] for record in case['data']]
        case_docs_normalized = [normalize.xml_normalize(body_text)
                                for body_text in case_docs_raw]
        case_docs_no_stop_words = [normalize.normalize_and_remove_stop_words(body_text)
                                   for body_text in case_docs_raw]

        # Create ids for individual data points
        postid = [record['post_id'] for record in case['data']][-1]
        postids.append(postid)
        clusterid = [record['cluster_id'] for record in case['data']][0]
        clusterids.append(clusterid)

        # Pull out query documents
        doc_raw = case_docs_raw[-1]
        doc_normalized = case_docs_normalized[-1]
        doc_no_stop_words = case_docs_no_stop_words[-1]

        # Create lists of background documents
        bkgd_docs_raw = case_docs_raw[:-1]
        bkgd_docs_normalized = case_docs_normalized[:-1]
        bkgd_docs_no_stop_words = case_docs_no_stop_words[:-1]
        bkgd_text_raw = '\n'.join(bkgd_docs_raw)
        bkgd_text_normalized = '\n'.join(bkgd_docs_normalized)
        bkgd_text_no_stop_words = '\n'.join(bkgd_docs_no_stop_words)

        feature_vectors = list()

        if 'mem_net' in features:
            # Get all sentences for the memory network algorithm
            bkgd_sentences_full = tokenize.punkt_sentences(bkgd_text_raw)
            doc_input, doc_questions, doc_masks = gen_mem_net_observations(
                doc_raw, bkgd_text_raw, bkgd_sentences_full, features['mem_net'],
                vocab, full_vocab, w2v_model, encoder_decoder)
            # Now add all of the input docs to the primary list
            inputs.append(doc_input)
            questions.append(doc_questions)
            input_masks.append(doc_masks)
        else:
            if 'bow' in features:
                feature_vectors = bow(doc_no_stop_words, bkgd_text_no_stop_words,
                                      bkgd_docs_no_stop_words, vocab, features['bow'],
                                      feature_vectors)
            if 'st' in features:
                sentences = []
                for doc in bkgd_docs_raw:
                    for item in get_first_and_last_sentence(doc):
                        sentences.append(item)
                feature_vectors = st(doc_raw, sentences, encoder_decoder,
                                     features['st'], feature_vectors)
            if 'lda' in features:
                feature_vectors = lda(doc_no_stop_words, bkgd_text_no_stop_words, vocab,
                                      lda_model, features['lda'], feature_vectors)
            if 'w2v' in features:
                feature_vectors = w2v(doc_normalized, bkgd_docs_normalized, w2v_model,
                                      features['w2v'], feature_vectors)
            if 'cnn' in features:
                feature_vectors = run_cnn(normalize.xml_normalize(doc_raw),
                                          normalize.xml_normalize(bkgd_text_raw), tf_session)
            if 'wordonehot' in features:
                feature_vectors = wordonehot(doc_raw, bkgd_text_raw, full_vocab,
                                             features['wordonehot'], feature_vectors)
            # Save features and label
            feature_vectors = np.concatenate(feature_vectors, axis=0).astype(dtype)
            # Fail catastrophically on zero vector (not sure if we need this)
            #assert not (feature_vectors < 0.0001).all()
            data.append(feature_vectors)

        if case["novelty"]:
            labels.append(1)
        else:
            labels.append(0)

        # Save to HDF5 if desired
        if hdf5_path is not None and len(data) % hdf5_save_frequency == 0:
            with h5py.File(hdf5_path, 'a') as h5:
                data_np = np.array(data)
                labels_np = np.reshape(np.array(labels), (-1, 1))
                add_to_hdf5(h5, data_np, data_key)
                add_to_hdf5(h5, labels_np, labels_key, np.uint8)
            labels = list()
            data = list()

    # Save off any remainder
    if hdf5_path is not None and len(data) > 0:
        with h5py.File(hdf5_path, 'a') as h5:
            data_np = np.array(data)
            labels_np = np.reshape(np.array(labels), (-1, 1))
            add_to_hdf5(h5, data_np, data_key)
            add_to_hdf5(h5, labels_np, labels_key, np.uint8)

    mem_net_features['inputs'] = inputs
    mem_net_features['questions'] = questions
    mem_net_features['input_masks'] = input_masks
    mem_net_features['answers'] = labels

    ids = ["C" + str(clusterid) + "_P" + str(postid)
           for clusterid, postid in zip(clusterids, postids)]

    if 'mem_net' in features:
        return mem_net_features, labels, ids
    if hdf5_path is not None:
        return hdf5_path, hdf5_path, ids
    else:
        return data, labels, ids
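# add_to_hdf5 is called above but not defined in this module. A minimal sketch
# of what such a helper could look like: it appends a batch of rows to a
# resizable HDF5 dataset, creating the dataset on first use. This is an
# assumption about the helper's behavior, not its actual implementation.
def _example_add_to_hdf5(h5, arr, key, dtype=None):
    if key not in h5:
        # First batch: create a chunked dataset that can grow along the first axis
        maxshape = (None,) + arr.shape[1:]
        h5.create_dataset(key, data=arr, maxshape=maxshape, chunks=True,
                          dtype=dtype if dtype is not None else arr.dtype)
    else:
        dataset = h5[key]
        start = dataset.shape[0]
        dataset.resize(start + arr.shape[0], axis=0)
        dataset[start:] = arr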
def gen_observations(all_clusters, lookup_order, documentData, features, vocab,
                     encoder_decoder, lda_topics):
    '''
    Generates observations for each cluster found in the JSON file and calculates
    the specified features.

    Args:
        all_clusters (set): cluster IDs
        lookup_order (dict): document arrival order
        documentData (array): parsed JSON documents
        features (dict): the specified features to be calculated
        vocab (dict): the vocabulary of the data set
        encoder_decoder: the encoder/decoder for skip-thought vectors
        lda_topics: trained LDA topic model

    Returns:
        data (list): for each observation, the concatenated feature vector for the
            document vs. the rolling corpus
        labels (list): the labels for each document, where one is novel and zero is duplicate
    '''
    # Prepare to store results of feature assessments
    data = list()
    labels = list()

    # Iterate through clusters found in JSON file, do feature assessments,
    # build a rolling corpus from ordered documents for each cluster
    for cluster in all_clusters:
        # Determine arrival order in this cluster
        sortedEntries = [x[1] for x in sorted(lookup_order[cluster], key=lambda x: x[0])]
        first_doc = documentData[sortedEntries[0]]["body_text"]

        # Set corpus to first doc in this cluster and prepare to update corpus
        # with new document vocabulary
        corpus = normalize.normalize_and_remove_stop_words(first_doc)

        # Create a document array for TFIDF
        corpus_array = [corpus]

        # Store a list of sentences in the cluster at each iteration
        sentences = []
        sentences += get_first_and_last_sentence(first_doc)

        for index in sortedEntries[1:]:
            # Find next document in order
            raw_doc = documentData[index]["body_text"]

            # Normalize and remove stop words from doc
            doc = normalize.normalize_and_remove_stop_words(raw_doc)
            corpus_array.append(doc)

            feature = list()
            if 'bow' in features:
                feature = bow(doc, corpus, corpus_array, vocab, features['bow'], feature)
            if 'st' in features:
                feature = st(raw_doc, sentences, encoder_decoder, features['st'], feature)
            if 'lda' in features:
                feature = lda(doc, corpus, vocab, lda_topics, features['lda'], feature)

            # Save feature and label
            feature = np.concatenate(feature, axis=0)
            data.append(feature)
            if documentData[index]["novelty"]:
                labels.append(1)
            else:
                labels.append(0)

            # Update corpus and add newest sentence to sentences vector
            corpus += doc
            sentences += get_first_and_last_sentence(doc)

    return data, labels
def test_empty_string():
    assert normalize.normalize_and_remove_stop_words("") == ""
def test_letters():
    assert normalize.normalize_and_remove_stop_words("19 cats&dogs don't eat?") == "cats dogs don t eat"
def test_lower_case():
    assert normalize.normalize_and_remove_stop_words("Hi BillyBob JOE") == "hi billybob joe"
def test_HTML():
    assert normalize.normalize_and_remove_stop_words("<p> <title=cats> cats pounce </p>") == "cats pounce"
def test_combo():
    assert normalize.normalize_and_remove_stop_words(
        "<p> <title=cats> <body> Cats pounce all the time! <http://catlink.com> is a video of cats JUMPING 10 times!! cool, right? </body></p>"
    ) == "cats pounce time video cats jumping times cool right"
def test_stop_words_text():
    assert normalize.normalize_and_remove_stop_words("the cat has name") == "cat"
def gen_observations(all_clusters, lookup_order, document_data, features, parameters, vocab,
                     full_vocab, encoder_decoder, lda_model, tf_session, w2v_model):
    '''
    Generates observations for each cluster found in the JSON file and calculates
    the specified features.

    Args:
        all_clusters (set): cluster IDs
        lookup_order (dict): document arrival order
        document_data (array): parsed JSON documents
        features (dict): the specified features to be calculated
        parameters (dict): data structure with run parameters
        vocab (dict): the vocabulary of the data set
        full_vocab (dict): the vocabulary of the data set including stop words and punctuation
        encoder_decoder: the encoder/decoder for skip-thought vectors
        lda_model (sklearn.decomposition.LatentDirichletAllocation): trained LDA model
        tf_session: active TensorFlow session
        w2v_model (gensim.models.Word2Vec): trained word2vec model

    Returns:
        data (list): for each observation, the features of the document vs. the corpus,
            which may include tfidf sum, cosine similarity, bag of words vectors,
            skip thoughts, lda, w2v, or one-hot CNN encoding
        labels (list): the labels for each document, where one is novel and zero is duplicate
        ids (list): a "C<cluster_id>_P<post_id>" identifier for each observation
    '''
    # Prepare to store results of feature assessments
    data = list()
    labels = list()

    # mem_net_features is used when the mem_net algorithm is run
    # It consists of inputs, labels (answers), input_masks and questions for each entry
    mem_net_features = {}
    inputs = []
    input_masks = []
    questions = []

    # Sentence punctuation delimiters
    punkt = ['.', '?', '!']

    corpus_unprocessed = list()

    # Create random state
    random_state = np.random.RandomState(parameters['seed'])

    # Iterate through clusters found in JSON file, generate observations
    # pairing data and label
    for cluster in all_clusters:
        # Determine arrival order in this cluster
        sorted_entries = [x[1] for x in sorted(lookup_order[cluster], key=lambda x: x[0])]
        observations = [document_data[sorted_entries[0]]]
        for index in sorted_entries[1:]:
            next_doc = document_data[index]
            observations.append(next_doc)
            labeled_observation = {'novelty': next_doc['novelty'],
                                   'data': copy.copy(observations)}
            corpus_unprocessed.append(labeled_observation)

    # Resample if necessary
    # If oversampling +/- replacement, sample up to larger class size for both
    # classes, with replacement
    # If undersampling, sample down to smaller class size for both classes with
    # or w/o replacement
    if 'resampling' in parameters:
        if 'over' in parameters:
            desired_size = None
            parameters['replacement'] = True
        else:
            desired_size = -np.Inf
        if 'replacement' in parameters:
            replacement = True
        else:
            replacement = False
        corpus = sampling.label_sample(corpus_unprocessed, "novelty", replacement, desired_size, random_state)
    else:
        corpus = corpus_unprocessed

    # Featurize each observation
    # Some duplication of effort here bc docs will appear multiple times across observations
    clusterids = []
    postids = []
    for case in corpus:
        # Create raw and normalized document arrays
        case_docs_raw = [record['body_text'] for record in case['data']]
        case_docs_normalized = [normalize.normalize_and_remove_stop_words(body_text)
                                for body_text in case_docs_raw]

        # Create ids for individual data points
        postid = [record['post_id'] for record in case['data']][-1]
        postids.append(postid)
        clusterid = [record['cluster_id'] for record in case['data']][0]
        clusterids.append(clusterid)

        # Pull out query documents
        doc_raw = case_docs_raw[-1]
        doc_normalized = case_docs_normalized[-1]

        # Create lists of background documents
        bkgd_docs_raw = case_docs_raw[:-1]
        bkgd_docs_normalized = case_docs_normalized[:-1]
        bkgd_text_raw = '\n'.join(bkgd_docs_raw)
        bkgd_text_normalized = '\n'.join(bkgd_docs_normalized)

        feature_vectors = list()

        if 'mem_net' in features:
            # Get all sentences for the memory network algorithm
            bkgd_sentences_full = tokenize.punkt_sentences(bkgd_text_raw)
            doc_input, doc_questions, doc_masks = gen_mem_net_observations(
                doc_raw, bkgd_text_raw, bkgd_sentences_full, features['mem_net'],
                vocab, full_vocab, w2v_model, encoder_decoder)
            # Now add all of the input docs to the primary list
            inputs.append(doc_input)
            questions.append(doc_questions)
            input_masks.append(doc_masks)
        else:
            if 'bow' in features:
                feature_vectors = bow(doc_normalized, bkgd_text_raw, bkgd_docs_normalized,
                                      vocab, features['bow'], feature_vectors)
            if 'st' in features:
                sentences = []
                for doc in bkgd_docs_raw:
                    for item in get_first_and_last_sentence(doc):
                        sentences.append(item)
                feature_vectors = st(doc_raw, sentences, encoder_decoder,
                                     features['st'], feature_vectors)
            if 'lda' in features:
                feature_vectors = lda(doc_normalized, bkgd_text_normalized, vocab,
                                      lda_model, features['lda'], feature_vectors)
            if 'w2v' in features:
                feature_vectors = w2v(doc_raw, bkgd_text_normalized, w2v_model,
                                      features['w2v'], feature_vectors)
            if 'cnn' in features:
                feature_vectors = run_cnn(doc_normalized, bkgd_text_normalized, tf_session)
            if 'wordonehot' in features:
                feature_vectors = wordonehot(doc_raw, bkgd_text_raw, full_vocab,
                                             features['wordonehot'], feature_vectors)
            # Save features and label
            feature_vectors = np.concatenate(feature_vectors, axis=0)
            data.append(feature_vectors)

        if case["novelty"]:
            labels.append(1)
        else:
            labels.append(0)

    mem_net_features['inputs'] = inputs
    mem_net_features['questions'] = questions
    mem_net_features['input_masks'] = input_masks
    mem_net_features['answers'] = labels

    ids = ["C" + str(clusterid) + "_P" + str(postid)
           for clusterid, postid in zip(clusterids, postids)]

    if 'mem_net' in features:
        return mem_net_features, labels, ids
    else:
        return data, labels, ids
def gen_observations(all_clusters, lookup_order, documentData, filename, features, vocab,
                     encoder_decoder, lda):
    '''
    Generates observations for each cluster found in the JSON file and calculates
    the specified features.

    Args:
        all_clusters (set): cluster IDs
        lookup_order (dict): document arrival order
        documentData (array): parsed JSON documents
        filename (str): the name of the corpus file
        features (namedtuple): the specified features to be calculated
        vocab (dict): the vocabulary of the data set
        encoder_decoder: the encoder/decoder for skip-thought vectors
        lda: trained LDA topic model

    Returns:
        list: for each observation, a namedtuple with the corpus name, cluster_id, post_id,
            novelty, cosine similarity, tfidf sum, bag of words vectors, skip thoughts and
            LDA vector (scores are None if the feature is unwanted)
    '''
    # Prepare to store results of feature assessments
    postScores = []
    postTuple = namedtuple('postScore',
                           'corpus,cluster_id,post_id,novelty,bagwordsScore,tfidfScore,bog,skipthoughts,ldavector')

    # Iterate through clusters found in JSON file, do feature assessments,
    # build a rolling corpus from ordered documents for each cluster
    for cluster in all_clusters:
        # Determine arrival order in this cluster
        sortedEntries = [x[1] for x in sorted(lookup_order[cluster], key=lambda x: x[0])]
        first_doc = documentData[sortedEntries[0]]["body_text"]

        # Set corpus to first doc in this cluster and prepare to update corpus
        # with new document vocabulary
        corpus = normalize.normalize_and_remove_stop_words(first_doc)

        # Create a document array for TFIDF
        corpus_array = [corpus]

        # Store a list of sentences in the cluster at each iteration
        sentences = []
        sentences += get_first_and_last_sentence(first_doc)

        # Use filename as corpus name if corpus name was not defined in JSON
        try:
            corpusName = documentData[sortedEntries[0]]["corpus"]
        except KeyError:
            corpusName = basename(filename)

        for index in sortedEntries[1:]:
            # Find next document in order
            raw_doc = documentData[index]["body_text"]

            # Normalize and remove stop words from doc
            doc = normalize.normalize_and_remove_stop_words(raw_doc)
            corpus_array.append(doc)

            similarityScore = None
            tfidfScore = None
            bog = None
            skipthoughts = None
            ldavector = None

            if features.tfidf_sum:
                tfidfScore = tfidf_sum(doc, corpus_array, vocab)

            if features.cos_similarity:
                bagwordsVectors = bag_of_words_vectors(doc, corpus, vocab)
                similarityScore = 1 - spatial.distance.cosine(bagwordsVectors[0], bagwordsVectors[1])

            if features.bag_of_words:
                bagwordsVectors = bag_of_words_vectors(doc, corpus, vocab)
                bog = np.concatenate(bagwordsVectors, axis=0)

            if features.skipthoughts:
                skipthoughts = skipthoughts_vectors(raw_doc, sentences, encoder_decoder)

            # Add newest sentence to sentences vector
            sentences += get_first_and_last_sentence(doc)

            if features.lda:
                doclda = run_lda(lda, doc, vocab)
                corpuslda = run_lda(lda, corpus, vocab)
                ldavector = np.concatenate([doclda, corpuslda], axis=0)

            # Save results in namedtuple and add to array
            postScore = postTuple(corpusName, cluster, documentData[index]["post_id"],
                                  documentData[index]["novelty"], similarityScore, tfidfScore,
                                  bog, skipthoughts, ldavector)
            postScores.append(postScore)

            # Update corpus
            corpus += doc

    return postScores
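# A minimal sketch of consuming the namedtuples returned by the variant above,
# e.g. to build a feature matrix and label vector for a downstream classifier.
# Field names come from the postScore namedtuple; the filtering shown here is
# illustrative, and np is assumed to be the module's numpy import.
def _example_collect_scores(postScores):
    rows = []
    labels = []
    for score in postScores:
        # Skip observations where the bag-of-words feature was not computed
        if score.bog is None:
            continue
        rows.append(score.bog)
        labels.append(1 if score.novelty else 0)
    return np.array(rows), np.array(labels)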