def process_sent(doc, word2vec, vocab, ivocab, word_vector_size,
                 to_return="word2vec", silent=False, encoder_decoder=None,
                 vocab_dict=None):
    """Encode a document as word2vec, skip-thought, or one-hot vectors."""
    if vocab_dict is None:  # avoid a mutable default argument
        vocab_dict = {}
    document_vector = []
    if to_return == "word2vec":
        # one embedding per token in the document
        document_vector = [process_word(w, word2vec, vocab, ivocab,
                                        word_vector_size, to_return, silent=True)
                           for w in doc]
    elif to_return == "skip_thought":
        # split into sentences, normalize, and encode with skip-thoughts
        sentences = punkt_sentences(doc)
        norm_sentences = [normalize.xml_normalize(s) for s in sentences]
        document_vector = [sk.encode(encoder_decoder, norm_sentences)]
    elif to_return == "one_hot":
        # assign the result so the encoding is actually returned
        document_vector = data_gen.run_onehot(doc, vocab_dict)
    return document_vector
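# Minimal, self-contained sketch of the "word2vec" branch above: one vector per
# token, looked up in a toy embedding table. toy_embeddings and lookup() are
# illustrative assumptions, not part of the project API.
import numpy as np

toy_embeddings = {"hello": np.array([0.1, 0.2]), "world": np.array([0.3, 0.4])}

def lookup(word, embeddings, size=2):
    # unknown words fall back to a zero vector of the embedding size
    return embeddings.get(word, np.zeros(size))

doc = ["hello", "unseen", "world"]
document_vector = [lookup(w, toy_embeddings) for w in doc]
assert len(document_vector) == len(doc)  # one vector per token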
def transform_doc(self, doc, corpus):
    """Encode a document and a reference corpus, then run both through the network."""
    hot_doc = data_gen.run_onehot(doc, self.vocab_dict,
                                  self.doc_length, self.doc_length)
    hot_corpus = data_gen.run_onehot(corpus, self.vocab_dict,
                                     self.doc_length, self.doc_length)
    hot_docs = [hot_doc, hot_corpus]
    fake_labels = [self.hot_fake_label, self.hot_fake_label]
    # Run the network for both docs at once: the input expects a list, and one
    # batched run is likely faster. The labels aren't used here, so placeholder
    # values are fine.
    connected_layers = self.session.run(self.connected_layer,
                                        feed_dict={self.x: hot_docs,
                                                   self.y: fake_labels})
    return connected_layers
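# Illustrative-only sketch of the batching idea in transform_doc: stacking both
# encoded documents and running the network once matches two separate runs. The
# toy weight matrix W stands in for the real connected layer and is an
# assumption, not the project's model.
import numpy as np

rng = np.random.default_rng(0)
W = rng.normal(size=(4, 3))                # toy "connected layer" weights
hot_doc = rng.integers(0, 2, size=4)       # toy encoded document
hot_corpus = rng.integers(0, 2, size=4)    # toy encoded corpus

batched = np.stack([hot_doc, hot_corpus]) @ W        # one batched run
separate = np.stack([hot_doc @ W, hot_corpus @ W])   # two separate runs
assert np.allclose(batched, separate)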
def prep_data(self, in_data, vocab, min_length, max_length):
    """One-hot encode documents and their cluster labels."""
    documents = []
    labels = []
    for entry in in_data:
        text = entry["body_text"]
        documents.append(data_gen.run_onehot(text, vocab, min_length, max_length))
        labels.append(entry["cluster_id"])
    # encode the labels in a dictionary mapping each unique label to an index
    unique_labels = np.unique(labels)
    unique_label_dict = {u_c: i for i, u_c in enumerate(unique_labels)}
    n_classes = len(unique_labels)
    hot_labels = []
    for c in labels:
        cluster_vect = np.zeros(n_classes, dtype=int)
        cluster_vect[unique_label_dict[c]] = 1
        hot_labels.append(cluster_vect.tolist())
    return documents, hot_labels, n_classes
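# Self-contained sketch of the label encoding done in prep_data: np.unique with
# return_inverse maps each label to an index, and np.eye builds the same
# hot_labels the loop above produces. The labels list is illustrative.
import numpy as np

labels = ["a", "c", "a", "b"]
unique_labels, label_indices = np.unique(labels, return_inverse=True)
n_classes = len(unique_labels)
hot_labels = np.eye(n_classes, dtype=int)[label_indices].tolist()
assert hot_labels == [[1, 0, 0], [0, 0, 1], [1, 0, 0], [0, 1, 0]]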
def prep_news_data(self, vocab, min_length, max_length):
    """One-hot encode the 20 Newsgroups corpus and its labels."""
    from sklearn.datasets import fetch_20newsgroups
    newsgroups = fetch_20newsgroups()
    documents = [data_gen.run_onehot(normalize.xml_normalize(text),
                                     vocab, min_length, max_length)
                 for text in newsgroups.data]
    labels = newsgroups.target
    # encode the labels in a dictionary mapping each unique label to an index
    unique_labels = np.unique(labels)
    unique_label_dict = {u_c: i for i, u_c in enumerate(unique_labels)}
    n_classes = len(unique_labels)
    hot_labels = []
    for c in labels:
        cluster_vect = np.zeros(n_classes, dtype=int)
        cluster_vect[unique_label_dict[c]] = 1
        hot_labels.append(cluster_vect.tolist())
    return documents, hot_labels, n_classes
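# Usage sketch for the data source above: fetch_20newsgroups returns a Bunch
# whose .data is a list of raw post strings and whose .target is an array of
# integer class ids (the corpus is downloaded on first use).
from sklearn.datasets import fetch_20newsgroups

newsgroups = fetch_20newsgroups(subset="train")
print(len(newsgroups.data), newsgroups.target[:5])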
def test_onehot():
    """Test one-hot document generation"""
    doc = ["hello", "you", "wanton", "civet", ",", "you"]
    vocab = {"hello": 0, "you": 1, "civet": 2, ",": 3}
    doc_onehot = data_gen.run_onehot(doc, vocab)
    doc_onehot_minlength = data_gen.run_onehot(doc, vocab, min_length=10)
    doc_onehot_maxlength = data_gen.run_onehot(doc, vocab, max_length=2)
    doc_onehot_encoded = np.array([[1., 0., 0., 0., 0.],
                                   [0., 1., 0., 0., 1.],
                                   [0., 0., 1., 0., 0.],
                                   [0., 0., 0., 1., 0.]], dtype=np.float32)
    # encoding is correct
    assert (doc_onehot == doc_onehot_encoded).all()
    # minimum length correctly enforced
    assert doc_onehot_minlength.shape == (4, 10)
    # maximum length correctly enforced
    assert doc_onehot_maxlength.shape == (4, 2)
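# Minimal sketch of the behavior test_onehot exercises, inferred from its
# assertions (the real implementation lives in data_gen.run_onehot): rows index
# vocab entries, columns index in-vocab tokens, out-of-vocabulary tokens like
# "wanton" are skipped, min_length zero-pads columns, max_length truncates.
import numpy as np

def onehot_sketch(doc, vocab, min_length=None, max_length=None):
    cols = [vocab[w] for w in doc if w in vocab]   # skip OOV tokens
    mat = np.zeros((len(vocab), len(cols)), dtype=np.float32)
    for j, row in enumerate(cols):
        mat[row, j] = 1.0
    if min_length is not None and mat.shape[1] < min_length:
        mat = np.pad(mat, ((0, 0), (0, min_length - mat.shape[1])))
    if max_length is not None:
        mat = mat[:, :max_length]
    return mat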