Example #1
def process_sent(doc,
                 word2vec,
                 vocab,
                 ivocab,
                 word_vector_size,
                 to_return="word2vec",
                 silent=False,
                 encoder_decoder=None,
                 vocab_dict=None):
    """Encode a tokenized document as word2vec, skip-thought, or one-hot vectors."""
    if vocab_dict is None:  # avoid a shared mutable default argument
        vocab_dict = {}
    document_vector = []

    if to_return == "word2vec":
        document_vector = [
            process_word(w,
                         word2vec,
                         vocab,
                         ivocab,
                         word_vector_size,
                         to_return,
                         silent=True) for w in doc
        ]
    elif to_return == "skip_thought":
        sentences = punkt_sentences(doc)
        norm_sentences = [normalize.xml_normalize(s) for s in sentences]
        document_vector = [sk.encode(encoder_decoder, norm_sentences)]
    elif to_return == "one_hot":
        # run_onehot returns the one-hot matrix for the document.
        document_vector = data_gen.run_onehot(doc, vocab_dict)

    return document_vector
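
For illustration, a hypothetical call on the one-hot path, the only branch that needs no pretrained model. The None placeholders are safe here because this branch reads only doc and vocab_dict:

# Hypothetical usage sketch; word2vec, vocab, ivocab and word_vector_size
# are unused on the one-hot path, so placeholders are passed for them.
tokens = ["hello", "world", "hello"]
vocab_dict = {"hello": 0, "world": 1}
doc_vector = process_sent(tokens, None, None, None, 0,
                          to_return="one_hot", vocab_dict=vocab_dict)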
Example #2
    def transform_doc(self, doc, corpus):
        hot_doc = data_gen.run_onehot(doc, self.vocab_dict,
                                      self.doc_length, self.doc_length)
        hot_corpus = data_gen.run_onehot(corpus, self.vocab_dict,
                                         self.doc_length, self.doc_length)

        hot_docs = [hot_doc, hot_corpus]
        fake_labels = [self.hot_fake_label, self.hot_fake_label]
        # Run the network on both documents at once: the input placeholder
        # expects a batch, and a single run is likely faster. The labels are
        # never used here, so placeholder values are fine.
        connected_layers = self.session.run(self.connected_layer,
                                            feed_dict={self.x: hot_docs,
                                                       self.y: fake_labels})
        return connected_layers
Example #3
    def prep_data(self, in_data, vocab, min_length, max_length):
        documents = []
        labels = []

        for entry in in_data:
            text = entry["body_text"]
            documents.append(
                data_gen.run_onehot(text, vocab, min_length, max_length))
            labels.append(entry["cluster_id"])

        # Map each unique label to a column index.
        unique_labels = np.unique(labels)
        unique_label_dict = {u_c: i for i, u_c in enumerate(unique_labels)}

        # One-hot encode the labels.
        hot_labels = []
        n_classes = len(unique_labels)
        for c in labels:
            cluster_vect = np.zeros(n_classes, dtype=int)
            cluster_vect[unique_label_dict[c]] = 1
            hot_labels.append(cluster_vect.tolist())

        return documents, hot_labels, n_classes
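
The label-encoding loops above can be collapsed into two NumPy calls. A minimal, equivalent sketch with toy labels, assuming the same list-of-lists output is wanted:

import numpy as np

labels = ["ham", "spam", "ham", "eggs"]
# return_inverse yields, for each label, its index into unique_labels.
unique_labels, indices = np.unique(labels, return_inverse=True)
n_classes = len(unique_labels)
# Row i of the identity matrix is the one-hot vector for class i.
hot_labels = np.eye(n_classes, dtype=int)[indices].tolist()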
Example #4
    def prep_news_data(self, vocab, min_length, max_length):
        from sklearn.datasets import fetch_20newsgroups
        newsgroups = fetch_20newsgroups()

        documents = [
            data_gen.run_onehot(normalize.xml_normalize(text), vocab,
                                min_length, max_length)
            for text in newsgroups.data
        ]
        labels = newsgroups.target

        # Map each unique label to a column index.
        unique_labels = np.unique(labels)
        unique_label_dict = {u_c: i for i, u_c in enumerate(unique_labels)}

        # One-hot encode the labels.
        hot_labels = []
        n_classes = len(unique_labels)
        for c in labels:
            cluster_vect = np.zeros(n_classes, dtype=int)
            cluster_vect[unique_label_dict[c]] = 1
            hot_labels.append(cluster_vect.tolist())

        return documents, hot_labels, n_classes
Example #5
def test_onehot():
    """Test one-hot document generation"""

    doc = ["hello", "you", "wanton", "civet", ",", "you"]
    vocab = {"hello": 0, "you": 1, "civet": 2, ",": 3}

    doc_onehot = data_gen.run_onehot(doc, vocab)
    doc_onehot_minlength = data_gen.run_onehot(doc, vocab, min_length=10)
    doc_onehot_maxlength = data_gen.run_onehot(doc, vocab, max_length=2)
    doc_onehot_encoded = np.array([[1., 0., 0., 0., 0.], [0., 1., 0., 0., 1.],
                                   [0., 0., 1., 0., 0.], [0., 0., 0., 1., 0.]],
                                  dtype=np.float32)

    # encoding is correct
    assert (doc_onehot == doc_onehot_encoded).all()
    # minimum length correctly enforced
    assert doc_onehot_minlength.shape == (4, 10)
    # maximum length correctly enforced
    assert doc_onehot_maxlength.shape == (4, 2)
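
The test pins down run_onehot's contract: one row per vocabulary entry, one column per in-vocabulary token, with min_length zero-padding and max_length truncating columns. Below is a minimal sketch consistent with the assertions above; the real data_gen.run_onehot may differ internally:

import numpy as np

def run_onehot(doc, vocab, min_length=None, max_length=None):
    """Hypothetical re-implementation: column j one-hot encodes token j."""
    # Out-of-vocabulary tokens (e.g. "wanton" above) are skipped entirely.
    cols = [vocab[w] for w in doc if w in vocab]
    if max_length is not None:
        cols = cols[:max_length]  # truncate long documents
    width = len(cols)
    if min_length is not None:
        width = max(width, min_length)  # zero-pad short documents
    onehot = np.zeros((len(vocab), width), dtype=np.float32)
    onehot[cols, np.arange(len(cols))] = 1.0
    return onehot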