Example #1
# Test of the hash-based one_hot encoder; `text` is the Keras preprocessing text
# module (assumption: a TF/Keras version whose one_hot supports `analyzer`).
import numpy as np
from tensorflow.keras.preprocessing import text

def test_one_hot():
    sample_text = 'The cat sat on the mat.'
    encoded = text.one_hot(sample_text, 5)
    assert len(encoded) == 6
    assert np.max(encoded) <= 4
    assert np.min(encoded) >= 0

    sample_text = 'The-cat-sat-on-the-mat'
    encoded2 = text.one_hot(sample_text,
                            5,
                            analyzer=lambda t: t.lower().split('-'))
    assert encoded == encoded2
    assert len(encoded) == 6
    assert np.max(encoded) <= 4
    assert np.min(encoded) >= 0
Example #2
def fetch():
    # `create_connection` and `create_dictionary_database` are helper functions
    # defined elsewhere in the original project; `one_hot` (Keras) and
    # `CountVectorizer` (scikit-learn) are imported there as well.
    i = 0
    content = ''
    list_of_reviews = []
    conn = create_connection("preprocessed_data")
    c = conn.cursor()
    data = c.execute("""SELECT review FROM movie_reviews LIMIT 5""")

    for review in data:
        content = content + ' ' + review[0]
        list_of_reviews.append(review[0])
        i += 1
        if i % 250 == 0: print("%s reviews have been fetched." % (i))
    tk = one_hot(content, split=" ", n=100000)
    print(tk)
    print("####################")
    vectorizer = CountVectorizer(min_df=0)
    vectorizer.fit(list_of_reviews)
    print(vectorizer.vocabulary_)
    print(vectorizer.transform(list_of_reviews).toarray())
    content = content.split()
    dictionary = set(content)
    conn.close()

    create_dictionary_database()

    connection = create_connection("dictionary")
    cur = connection.cursor()
    i = 0
    for word in dictionary:
        cur.execute("""INSERT INTO dictionary (word) VALUES (?)""", (word, ))
        i += 1
        if i % 500 == 0: print("%s words are in dictionary." % (i))
    connection.commit()
    connection.close()
Example #3
def one_hot(input_text,
            n,
            filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
            lower=True,
            split=' '):
  r"""One-hot encodes a text into a list of word indexes of size `n`.

  This function receives as input a string of text and returns a
  list of encoded integers each corresponding to a word (or token)
  in the given input string.

  Arguments:
      input_text: Input text (string).
      n: int. Size of vocabulary.
      filters: list (or concatenation) of characters to filter out, such as
        punctuation. Default:
        ```
        '!"#$%&()*+,-./:;<=>?@[\]^_`{|}~\t\n
        ```,
        includes basic punctuation, tabs, and newlines.
      lower: boolean. Whether to set the text to lowercase.
      split: str. Separator for word splitting.

  Returns:
      List of integers in `[1, n]`. Each integer encodes a word
      (uniqueness is not guaranteed, since the mapping is hash-based).
  """
  return text.one_hot(input_text, n, filters=filters, lower=lower, split=split)
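To make the docstring above concrete, here is a short usage sketch. It is an illustration only (not part of the original example) and assumes the keras_preprocessing package is installed; because the encoding is hash-based, the exact integers depend on `n` and distinct words may collide.

# Illustrative usage sketch (assumption: keras_preprocessing is installed).
from keras_preprocessing.text import one_hot

sample = 'The quick brown fox jumped over the lazy dog.'
encoded = one_hot(sample, 50)
print(encoded)
# Nine integers in [1, 50]: both occurrences of "the" hash to the same index,
# and unrelated words can collide too, since each index is a hash of the word
# reduced modulo the vocabulary size.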
Example #4
def prep_1(text):
    # NOTE: the incoming `text` argument is immediately overwritten with a fixed
    # sample sentence, so whatever is passed in is ignored.
    text = "The quick brown fox jumped over the lazy dog."

    list_unique_words = list(set(text_to_word_sequence(text)))
    print(f"docs: {list_unique_words[:100]}")

    vocab_size = len(list_unique_words)
    print(f"vocab_size: {vocab_size}")

    oh_encoding = one_hot(text, n=round(vocab_size * 1.3))
    print(f"oh_encoding: {oh_encoding}")

    hashed_doc = hashing_trick(text,
                               n=round(vocab_size * 1.3),
                               hash_function='md5')
    print(f"hashed_doc: {hashed_doc}")

    return oh_encoding
Example #5
def prueba_1():
	docs = ['Well done!',
			'Good work',
			'Great effort',
			'nice work',
			'Excellent!',
			'Weak',
			'Poor effort!',
			'not good',
			'poor work',
			'Could have done better.']
	# define class labels
	labels = np.array([1,1,1,1,1,0,0,0,0,0])

	# integer encode the documents
	vocab_size = 50
	encoded_docs = [one_hot(d, vocab_size) for d in docs]
	print(encoded_docs)

	# pad documents to a max length of 4 words
	max_length = 4
	padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
	print(padded_docs)

	# define the model
	model = Sequential()
	model.add(Embedding(vocab_size, 8, input_length=max_length))
	model.add(Flatten())
	model.add(Dense(1, activation='sigmoid'))
	# compile the model
	model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
	# summarize the model
	print(model.summary())

	# fit the model
	model.fit(padded_docs, labels, epochs=50, verbose=0)
	# evaluate the model
	loss, accuracy = model.evaluate(padded_docs, labels, verbose=0)
	print('Accuracy: %f' % (accuracy*100))
Example #6
# `url` (a list of URL strings), `siz` and `label1` are defined earlier in the
# original script
url = url[0:siz]


# encode the string labels: "bad" -> 0, anything else -> 1
def label_encode(label):
    enclabel = []
    for i in label:
        if i == "bad":
            enclabel.append(0)
        else:
            enclabel.append(1)
    return enclabel


# one hot encode
encoded_docs = [one_hot(d, 20 * len(url)) for d in url]
leng = []
for i in encoded_docs:
    leng.append(len(i))
print(max(leng))
padded_docs = pad_sequences(encoded_docs, maxlen=max(leng), padding='post')

label = label_encode(label1)
label = label[0:siz]
la = label
label = np_utils.to_categorical(label, 2)

model = Sequential()
# input_dim must cover the one_hot vocabulary (indices go up to 20 * len(url)),
# not just the number of samples
model.add(Embedding(20 * len(url), 32, input_length=max(leng)))
input_array = padded_docs
model.compile('rmsprop', 'mse')
Example #7
# print(one_hot_labels)
# quit()

# gensim text-cleaning pipeline; preprocess_string and the strip_* /
# remove_stopwords / stem_text helpers come from gensim.parsing.preprocessing
CUSTOM_FILTERS = [
    lambda x: x.lower(), strip_multiple_whitespaces, strip_punctuation,
    remove_stopwords, stem_text
]
ppdocs = list()

for doc in _docs:
    word_list = preprocess_string(doc, CUSTOM_FILTERS)
    ppdocs.append(' '.join(word_list))

vocab_size = 4000
max_length = 30
encoded_docs = [one_hot(d, vocab_size) for d in ppdocs]
padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
# print(padded_docs[0])
# X_train = np.array(encoded_docs)
# print(encoded_docs)
# quit()
# now develop the model
input_dim = max_length

model = Sequential()
# model.add(Dense(512, input_shape=(input_dim,)))
model.add(Embedding(vocab_size, 100, input_length=max_length))
model.add(Dropout(rate=0.5))
model.add(Flatten())
# model.add(LSTM(128))
model.add(Activation("relu"))
Example #8
def test_one_hot():
    sample_text = 'The cat sat on the mat.'
    encoded = text.one_hot(sample_text, 5)
    assert len(encoded) == 6
    assert np.max(encoded) <= 4
    assert np.min(encoded) >= 0
Example #9
def get_padded_sentences(data_set_to_pad, vocab_size, longest_sentence):
    # `join_tokens` is a helper defined elsewhere in the original project
    sentences = join_tokens(data_set_to_pad)
    encoded_sentences = [one_hot(sentence, vocab_size) for sentence in sentences]
    padded_sentences = pad_sequences(encoded_sentences, maxlen=longest_sentence, padding='post')
    return padded_sentences
Example #10
from keras_preprocessing.text import text_to_word_sequence
from keras_preprocessing.text import Tokenizer
from keras_preprocessing.text import one_hot

text = "Hei, dette er noe testtext"

tronder_file = open("TextInput/rawText.txt", "r", encoding="utf-8")
tronder_text = tronder_file.read()
tronder_file.close()

# the vocabulary size n is set to the character count of the file, a convenient
# upper bound on the number of distinct words
one_hot_result = one_hot(tronder_text, len(tronder_text))
ttws_result = text_to_word_sequence(tronder_text)

print(ttws_result)
print(one_hot_result)
print(len(ttws_result))
print(len(one_hot_result))