Example #1
def main():
    text_tweets = pd.read_csv('../data/tweets_data.csv', delimiter='\t')

    X = text_tweets.Text.values

    print('All data has been loaded')

    SENTENCE_LENGTH = 167
    NUM = 100000

    tokenizer = Tokenizer(num_words=NUM)
    tokenizer.fit_on_texts(X)

    X_seq = get_sequences(tokenizer, X, SENTENCE_LENGTH)

    print('Input data has been tokenized')

    with open('../output/model.pkl', 'rb') as file:
        model = pickle.load(file)

    model.load_weights('../output/cnn-frozen-embeddings-37.hdf5')

    print('Model has been loaded')

    classes = np.array(['anger', 'happiness', 'love', 'neutral', 'sadness'])
    predictions = model.predict(X_seq)
    predicted_ix = predictions.argmax(axis=1)

    text_tweets['class_prediction'] = pd.Series(classes[predicted_ix])

    text_tweets.to_csv('../output/predictions.csv', sep='\t')

    print('Predictions have been saved')
Example #2
def process_data():
    path = '/content/drive/My Drive/Colab Notebooks/alexa_toy.json'
    # path = os.getcwd() + '/alexa_toy.json'

    with open(path) as f:
        data = json.load(f)

    # extract text and label
    text, label = [], []
    for k, v in data.items():
        for x in v['content']:
            text.append(x['message'].lower())
            label.append(x['sentiment'])

    # convert labels to index
    index, label_id = 0, {}
    for x in np.unique(label):
        label_id[x] = index
        index += 1
    label = [label_id[x] for x in label]

    # process text (for convenience, used keras tools)
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(text)
    text = tokenizer.texts_to_sequences(text)
    text = pad_sequences(text, maxlen=50)

    train_x, test_x, train_y, test_y = train_test_split(text, label, test_size=0.05, shuffle=False, random_state=42)
    print ('training size : {} \t test size : {}'.format(len(train_y), len(test_y)))
    return train_x, test_x, train_y, test_y, tokenizer
Example #3
    def createModel(self, text):
        self.embeddings_index = {}
        f = open(os.path.join(GLOVE_DIR, 'glove.840B.300d.txt'),
                 encoding='utf')
        for line in f:
            values = line.split()
            word = ''.join(values[:-300])
            #word = values[0]
            coefs = np.asarray(values[-300:], dtype='float32')
            self.embeddings_index[word] = coefs
        f.close()

        print('Found %s word vectors.' % len(self.embeddings_index))
        tokenizer = Tokenizer(num_words=self.MAX_NB_WORDS, lower=False)
        tokenizer.fit_on_texts(text)

        self.word_index = tokenizer.word_index
        pickle.dump(self.word_index, open("../Models/DeId/word_index.pkl",
                                          'wb'))

        self.embedding_matrix = np.zeros(
            (len(self.word_index) + 1, self.EMBEDDING_DIM))
        print(self.embedding_matrix.shape)
        for word, i in self.word_index.items():
            embedding_vector = self.embeddings_index.get(word)
            if embedding_vector is not None:
                # words not found in embedding index will be all-zeros.
                self.embedding_matrix[i] = embedding_vector

        self.embedding_layer = Embedding(len(self.word_index) + 1,
                                         self.EMBEDDING_DIM,
                                         weights=[self.embedding_matrix],
                                         input_length=70,
                                         trainable=True)
        self.model = Sequential()
        self.model.add(self.embedding_layer)
        self.model.add(
            Bidirectional(
                LSTM(150,
                     dropout=0.3,
                     recurrent_dropout=0.6,
                     return_sequences=True))
        )  #{'sum', 'mul', 'concat', 'ave', None}
        self.model.add(
            Bidirectional(
                LSTM(60,
                     dropout=0.2,
                     recurrent_dropout=0.5,
                     return_sequences=True)))
        self.model.add(
            SeqSelfAttention(attention_activation='sigmoid',
                             attention_width=12))
        self.model.add(TimeDistributed(Dense(
            9,
            activation='softmax')))  # a dense layer as suggested by neuralNer
        self.model.compile(loss="categorical_crossentropy",
                           optimizer='rmsprop',
                           metrics=['accuracy'])
        self.model.summary()
        pass
Example #4
class DLModel(BenchmarkedModel):
    def __init__(self):
        super().__init__()
        max_features = 1024

        model = Sequential()
        model.add(Embedding(max_features, output_dim=256))
        model.add(LSTM(128))
        model.add(Dropout(0.5))
        model.add(Dense(1, activation="sigmoid"))

        model.compile(loss="binary_crossentropy",
                      optimizer="rmsprop",
                      metrics=["accuracy"])
        self.clf = model
        self.vectorizer = Tokenizer()

    def fit(self, data, labels):
        self.vectorizer.fit_on_texts(data)
        processed_data = self.vectorizer.texts_to_matrix(data, mode="count")

        self.clf.fit(processed_data, labels, batch_size=16, epochs=10)

    def predict(self, data):
        processed_data = self.vectorizer.texts_to_matrix(data, mode="count")
        return self.clf.predict(processed_data)
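
A minimal usage sketch for DLModel, assuming the Keras layers, Tokenizer, and the BenchmarkedModel base class are importable in this module; the texts and labels below are made up for illustration.

# Hypothetical usage only; the toy data is illustrative.
texts = ["great product, works perfectly", "broke after one day, very poor"]
labels = [1, 0]

model = DLModel()
model.fit(texts, labels)                 # fits the Tokenizer, then trains the network
scores = model.predict(["works great"])  # sigmoid scores in [0, 1]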
Example #5
def prepare_data(data_set, length=None):
    #tokenize the data set
    bodies_tokenizer, headlines_tokenizer = (Tokenizer(), Tokenizer())

    #find the max length of each dataset
    bodies_max_length = 0
    headlines_max_length = 0
    if not length:
        bodies_max_length = data_set['articleBody'].map(lambda x: len(x.split())).max()
        headlines_max_length = data_set['Headline'].map(lambda x: len(x.split())).max()
    else:
        bodies_max_length = length[0]
        headlines_max_length = length[1]
    
    #fit the tokenizer on the data set
    bodies_tokenizer.fit_on_texts(data_set['articleBody'])
    headlines_tokenizer.fit_on_texts(data_set['Headline'])

    #convert the texts to sequences
    bodies_sequences = bodies_tokenizer.texts_to_sequences(data_set['articleBody'])
    headlines_sequences = headlines_tokenizer.texts_to_sequences(data_set['Headline'])

    #pad the data to be the max length
    bodies_sequences = pad_sequences(bodies_sequences, maxlen=bodies_max_length, padding='post', truncating='post')
    headlines_sequences = pad_sequences(headlines_sequences, maxlen=headlines_max_length, padding='post', truncating='post')

    
    return bodies_sequences, headlines_sequences, bodies_tokenizer.word_index, headlines_tokenizer.word_index, data_set['Stance']
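
A small illustrative call to prepare_data, assuming pandas, Tokenizer and pad_sequences are imported as in the surrounding examples; the tiny DataFrame and its column values are made up.

import pandas as pd

df = pd.DataFrame({
    'articleBody': ['the quick brown fox jumps over the lazy dog', 'a short body'],
    'Headline': ['fox jumps over dog', 'short headline'],
    'Stance': ['agree', 'unrelated'],
})

bodies, heads, body_index, head_index, stances = prepare_data(df)
print(bodies.shape, heads.shape)  # padded to the longest body / headline in df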
Example #6
    def train(self, X, y=None):
        X, y = self.augment_instances(X, y)

        #X_text = self.text_repr_model.fit_transform(X[:, self.args.TEXT_COL])

        X_text = X[:, self.args.TEXT_COL]

        self.max_features = 4000
        self.tokenizer = Tokenizer(num_words=self.max_features)
        self.tokenizer.fit_on_texts(X_text)
        X_text = self.tokenizer.texts_to_sequences(X_text)
        # round-trip through the tokenizer so only the top `max_features` words are kept
        X_text = self.tokenizer.sequences_to_texts(X_text)

        self.text_rep_model = self.build_fit_w2v(X_text)

        X_text = self.transform_text_to_w2v(self.text_rep_model, X_text)

        X_all_feats = self.augment_features(X_text, X)

        pca = PCA(n_components=self.num_clusters,
                  random_state=self.args.random_state)
        pca.fit(X_all_feats)

        model = KMeans(init=pca.components_,
                       n_clusters=self.num_clusters,
                       n_init=1,
                       random_state=self.args.random_state)
        model.fit(X_all_feats)

        self.clf_model = model
Example #7
def predict():
    test = pd.read_csv(os.path.join(data_path, 'test.csv'))
    # print(test.head())
    test = test.reindex(np.random.permutation(test.index))
    # take the hashes after shuffling so they stay aligned with the predictions
    test_hash = test['unique_hash']
    test = test[['text', 'drug']]

    test['text_comb'] = test['text'] + test['drug']
    test.text_comb = test.text_comb.apply(remove_stopwords)
    # print(test.head())

    tk = Tokenizer(num_words=NB_WORDS,
                   filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
                   lower=True,
                   split=" ")
    tk.fit_on_texts(test.text_comb)

    test_seq = tk.texts_to_sequences(test.text_comb)
    test_oh = one_hot_seq(test_seq)

    reg_model = models.load_model('./data/reg_model.h5')
    prediction = reg_model.predict_classes(test_oh)

    submission = pd.DataFrame({
        'unique_hash': test_hash,
        'sentiment': prediction,
    })
    submission.to_csv('./data/dl_submission.csv', index=False)
Example #8
def main():
    reviews_df = get_data()
    print(reviews_df)
    # tokenize all content
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(reviews_df['Review'])
    num_encoder_tokens = len(tokenizer.word_index) + 1

    # create training and testing vars
    X_train, X_test, y_train, y_test = train_test_split(reviews_df['Review'], reviews_df['Numeric_Label'],
                                                        test_size=0.2, shuffle=True)

    max_review_length = 500
    X_train = get_encoded_padded_content(tokenizer, X_train, max_review_length)
    X_test = get_encoded_padded_content(tokenizer, X_test, max_review_length)
    # print(X_train)
    # print(y_train)
    # create the model
    embedding_vector_length = 32
    model = Sequential()
    model.add(Embedding(num_encoder_tokens, embedding_vector_length, input_length=max_review_length))
    model.add(LSTM(100))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    print(model.summary())
    model_history = model.fit(X_train, y_train, epochs=3, batch_size=4, validation_split=0.2)
    print(model_history.history)

    # Final evaluation of the model
    scores = model.evaluate(X_test, y_test, verbose=0)
    print("Accuracy: %.2f%%" % (scores[1] * 100))
    pass
Example #9
    def __init__(self, master=None):
        super().__init__(master)
        self.r = sr.Recognizer()
        self.lexical = LexicalAnalysis.LexicalAnalysis()
        self.master = master
        self.pack()
        self.create_widgets()
        self.running = False
        self.text = ""
        self.text_sequence = None

        self.stemmer = WordNetLemmatizer()

        df = pandas.read_csv("D:\\PycharmProjects\\ThesisWork\\Data\\EmotionDetection\\%_by_Emo_Full_Data_data (1).csv")

        df['Tweet'] = df['Tweet'].apply(self.clean)

        MAX_NB_WORDS = 50000
        # Max number of words in each tweet.
        self.MAX_SEQUENCE_LENGTH = 250
        self.tokenizer = Tokenizer(num_words=MAX_NB_WORDS, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~', lower=True)
        self.tokenizer.fit_on_texts(df['Tweet'].values)
        # Integer replacement
        X = self.tokenizer.texts_to_sequences(df['Tweet'].values)

        X = pad_sequences(X, maxlen=self.MAX_SEQUENCE_LENGTH)
        # Gets categorical values for the labels
        Y = pandas.get_dummies(df['Emotion']).values

        self.neuralNetwork = NeuralNetwork.NeuralNetwork(X.shape[1], 4)
        self.neuralNetwork.fit(X, Y)
Example #10
def make_dictionaries(file_path,
                      src_dict_path=None,
                      tgt_dict_path=None,
                      encoding="utf-8",
                      min_freq=5,
                      **kwargs):
    if not os.path.isdir(file_path):
        sents, chunks = _parse_data(open(file_path, 'r', encoding=encoding))
    else:
        sents, chunks = _parse_data_from_dir(file_path)

    src_tokenizer = Tokenizer(**kwargs)
    tgt_tokenizer = Tokenizer(**kwargs)

    src_tokenizer.fit_on_texts(sents)
    tgt_tokenizer.fit_on_texts(chunks)

    src_sub = sum(map(lambda x: x[1] < min_freq, src_tokenizer.word_counts.items()))
    tgt_sub = sum(map(lambda x: x[1] < min_freq, tgt_tokenizer.word_counts.items()))

    src_tokenizer.num_words = len(src_tokenizer.word_index) - src_sub
    tgt_tokenizer.num_words = len(tgt_tokenizer.word_index) - tgt_sub

    if src_dict_path is not None:
        save_dictionary(src_tokenizer, src_dict_path, encoding=encoding)
    if tgt_dict_path is not None:
        save_dictionary(tgt_tokenizer, tgt_dict_path, encoding=encoding)

    return src_tokenizer, tgt_tokenizer
Example #11
def data_preprocess(text_data, text_label):
    text_sentence = []
    temp = []
    for i in text_data:
        k = jieba.lcut(i)
        text_sentence.append(k)
        temp += [j for j in k if j not in temp]  # collect unique tokens for the vocabulary
    # build the vocabulary
    tokenizer = Tokenizer(num_words=len(temp))
    tokenizer.fit_on_texts(text_sentence)
    # convert the texts to integer sequences
    text_sentence = tokenizer.texts_to_sequences(text_sentence)
    text_sentence = pad_sequences(text_sentence, maxlen=64, padding='post')
    # labels
    text_label = to_categorical(text_label)
    # load the pretrained word vectors
    embedding_matrix = np.zeros((len(tokenizer.word_index) + 1, 60),
                                dtype=np.float32)
    file = open('wiki.zh.text.vector', encoding='utf-8')
    file = file.readlines()
    for text in file:
        text = text.split()
        if text[0] in temp:
            embedding_matrix[tokenizer.word_index[text[0]]] = text[1:]

    return text_sentence, text_label, tokenizer, embedding_matrix
Example #12
def define_sequences(raw_string, seq_length):
    '''
    Encodes the input string into a format readable by the RNN (converts letters to their corresponding integers).

    Input:
        raw_string = the data as a string
        seq_length = integer, number of preceding "letters" the RNN uses to predict the next one
    Output:
        X = input data for the RNN, format (N, seq_length, 1)
        y = labels for the RNN (as a one-hot mask), format (N, #unique symbols)
        c_indices = dictionary of the symbols present in the input text
    '''
    tokenizer = Tokenizer(char_level=True)
    tokenizer.fit_on_texts(raw_string)
    c_indices = tokenizer.word_index

    X = []
    y = []

    for i in range(len(raw_string) - seq_length):

        inputseq = raw_string[i:i + seq_length]
        outputseq = raw_string[i + seq_length]

        X.append([c_indices[char] for char in inputseq])
        y.append(c_indices[outputseq])

    return X, y, c_indices
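
The docstring describes X as (N, seq_length, 1) and y as a one-hot mask, while the function itself returns plain Python lists; a minimal sketch of that conversion, assuming numpy (np) and keras.utils.to_categorical are imported elsewhere.

# Sketch only; the input string is made up.
X, y, c_indices = define_sequences("hello world, hello rnn", seq_length=5)

X = np.reshape(X, (len(X), 5, 1)) / float(len(c_indices))  # (N, seq_length, 1), normalised character indices
y = to_categorical(y)                                      # one-hot labels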
Example #13
def prueba_2():
	cantidad_twits=10
	# define class twits
	test = load_test()
	twits = preprocesing(test[:cantidad_twits, 0])
	print(f"\ntwiters:\n{twits}")
	# define class labels
	labels = test[:cantidad_twits, 1].astype('float32')
	print(f"\nlabels:\n{labels}")
	# prepare tokenizer
	t = Tokenizer()
	t.fit_on_texts(twits)
	vocab_size = len(t.word_index) + 1
	# integer encode the documents
	encoded_twits = t.texts_to_sequences(twits)
	print(f"\nencoded_twits:\n{encoded_twits}")
	# pad documents to the length of the longest tweet
	max_length = max(len(seq) for seq in encoded_twits)
	padded_twits = pad_sequences(encoded_twits, maxlen=max_length, padding='post')
	print(f"\npadded_twits:\n{padded_twits}")

	# load the whole embedding into memory
	embeddings_index = dict()
	f = open('fasttext.es.300.txt')
	for line in f:
		values = line.split()
		word = values[0]
		coefs = np.asarray(values[1:], dtype='float32')
		embeddings_index[word] = coefs
	f.close()
	print('Loaded %s word vectors.' % len(embeddings_index))

	# create a weight matrix for words in training docs
	embedding_matrix = np.zeros((vocab_size, 300))
	for word, i in t.word_index.items():
		embedding_vector = embeddings_index.get(word)
		if embedding_vector is not None:
			embedding_matrix[i] = embedding_vector

	# define model
	model = Sequential()
	e = Embedding(vocab_size, 300, weights=[embedding_matrix], input_length=max_length, trainable=False)
	model.add(e)
	model.add(Flatten())
	model.add(Dense(1, activation='sigmoid'))
	# compile the model
	model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
	# summarize the model
	print(model.summary())
	# fit the model
	model.fit(padded_twits, labels, epochs=50, verbose=0)
	# evaluate the model
	loss, accuracy = model.evaluate(padded_twits, labels, verbose=0)
	print('Accuracy: %f' % (accuracy * 100))
Example #14
def create_keras_tokenizer(captions):
    """
    функция используется для создания tokenizer, и его обучения на наборе описаний
    """
    list_of_captions = captions_to_list(captions)
    keras_tokenizer = Tokenizer()
    keras_tokenizer.fit_on_texts(list_of_captions)
    return keras_tokenizer
Example #15
def tokenizer(text):
    max_num_words = 8000
    tokenize = Tokenizer(num_words=max_num_words)
    tokenize.fit_on_texts(text)
    # print(tokenize.word_index)
    vocab_size = len(tokenize.word_index) + 1
    text2int = tokenize.texts_to_sequences(text)
    max_ln = np.max([len(cap) for cap in text2int])
    return [tokenize, text2int, vocab_size, max_ln]
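
An illustrative call, assuming numpy and Tokenizer are imported as in the other examples; the captions are made up.

captions = ["a dog runs on the beach", "a cat sleeps on the sofa"]
tokenize, text2int, vocab_size, max_ln = tokenizer(captions)
print(vocab_size, max_ln)  # vocabulary size (+1 for padding) and length of the longest caption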
Example #16
    def __init__(self, articles: Articles, max_article_length: int):
        self.tokenizer = Tokenizer()
        self.tokenizer.fit_on_texts(articles.title_and_summary())
        self.max_article_length: int = max_article_length

        self.sequences = self.transform_to_sequences(articles)
        self.voc_size = len(
            self.tokenizer.word_index) + 1  # +1 because we pad with 0.
        self.document_count = self.tokenizer.document_count
Example #17
    def get_tokenized_data(self, max_sentence_len):
        sents, is_intent = self.get_data()
        # token_list = (data['sentence'].apply(get_tokens))
        token_list = [get_tokens(sent) for sent in sents]
        tokenizer = Tokenizer()
        tokenizer.fit_on_texts(token_list)
        X, Y = self.get_netio(is_intent, token_list, max_sentence_len,
                              tokenizer)
        return X, Y, tokenizer
Example #18
def predict(input_sentence):
    # sentence = "A lot of good things are happening. We are respected again throughout the world, and that's a great thing"
    max_features = 2000
    tokenizer = Tokenizer(num_words=max_features, split=' ')
    # wrap the single sentence in a list so it is treated as one document
    tokenizer.fit_on_texts([input_sentence])
    X = tokenizer.texts_to_sequences([input_sentence])
    X = pad_sequences(X, maxlen=28)
    sentiment = model.predict(X, batch_size=1, verbose=2)[0]
    print(sentiment)
Example #19
def preprocess_text(text, padding='post'):
    # tokenize
    tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE)
    tokenizer.fit_on_texts(text)
    sequences = tokenizer.texts_to_sequences(text)

    # padding
    sequences = pad_sequences(sequences, padding=padding)

    return sequences, len(tokenizer.word_index)
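
An illustrative call, assuming MAX_VOCAB_SIZE is defined at module level and Tokenizer/pad_sequences are imported; the documents are made up.

docs = ["keras makes text preprocessing easy",
        "a tokenizer maps words to integer ids"]
padded, vocab_size = preprocess_text(docs)
print(padded.shape, vocab_size)  # every sequence padded to the longest document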
Example #20
def get_data_as_one_hot(num_words, data_location='data/data', labels_location='data/labels'):
    data, labels = read_data_and_labels(data_location, labels_location)

    tokenizer = Tokenizer(num_words=num_words)
    tokenizer.fit_on_texts(data)
    one_hot = tokenizer.texts_to_matrix(data, mode='binary')
    encoded_labels = np.asarray(labels).astype('float32')

    print('Returning encoded text, labels and tokenizer')
    return one_hot, encoded_labels, tokenizer
Example #21
    def initialize(self, query_manager, emb_path):
        self.tokenizer = Tokenizer()
        self.word_embedding = WordEmbedding(embfile=emb_path)

        self.query_list = query_manager.query_list

        query_list = [query.query for query in self.query_list]
        self.tokenizer.fit_on_texts(query_list)
        self.tokenizer.fit_on_texts(query_manager.corpus)
        self.word_embedding.create_embedding_matrix(self.tokenizer)
Example #22
    def predict_intent(self, text):
        prepared_text = DataHandler.get_preprocessed_message(text)
        tokenizer = Tokenizer(num_words=vocabulary_size)
        tokenizer.fit_on_texts(prepared_text)
        X_temp = tokenizer.texts_to_sequences(prepared_text)
        X = pad_sequences(X_temp, padding='post', maxlen=max_input_length)
        print(X)
        result = self.intent_classifier.predict_classes(X)
        print("Res = " + str(result))
        print(most_common(result).item())
        return most_common(result).item()
Example #23
def get_data_as_padded_sequences(num_words, max_length, data_location='data/data', labels_location='data/labels'):
    data, labels = read_data_and_labels(data_location, labels_location)

    tokenizer = Tokenizer(num_words=num_words)
    tokenizer.fit_on_texts(data)
    sequences = tokenizer.texts_to_sequences(data)
    sequences = pad_sequences(sequences, maxlen=max_length)

    encoded_labels = np.asarray(labels).astype('float32')

    return sequences, encoded_labels, tokenizer
Example #24
def run_tokenizer(train, test):
    logger.info('Fitting tokenizer')
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(
        list(train['comment_text']) + list(test['comment_text']))
    # X_train = tokenizer.texts_to_sequences(list(train['comment_text']))
    # X_test = tokenizer.texts_to_sequences(list(test['comment_text']))
    # X_train = pad_sequences(X_train, maxlen=MAX_LEN)
    # X_test = pad_sequences(X_test, maxlen=MAX_LEN)
    # word_index = tokenizer.word_index
    return tokenizer  # X_train, X_test, word_index
Example #25
    def __init__(self,
                 wine_dataset: WineDataSet,
                 max_len,
                 topn_varieties: int = 7,
                 balance_class=False):
        filter_list = wine_dataset.varieties_count(
        )['variety'][:topn_varieties].tolist()
        filtered_df = wine_dataset.data[wine_dataset.data['variety'].isin(
            filter_list)]

        if balance_class:
            aux_df = deepcopy(filtered_df)
            d = aux_df.groupby('variety')
            d = d.apply(
                lambda x: x.sample(d.size().min()).reset_index(drop=True))
            d = d.reset_index(drop=True)
            filtered_df = d
            del aux_df, d

        wine_embeddings_filter = filtered_df.index.values

        self._varieties_list = from_array(filter_list)
        self._wine_embeddings_filter = from_array(wine_embeddings_filter)

        self._variety2index = {
            variety: index
            for index, variety in enumerate(filter_list)
        }
        self._index2variety = {
            index: variety
            for index, variety in enumerate(filter_list)
        }
        # self._X = wine_embeddings[wine_embeddings_filter].compute()

        self._X = deepcopy(filtered_df['description_cleaned'].tolist())
        tokenizer = Tokenizer()
        tokenizer.fit_on_texts(self._X)
        self._X, self._X_tokenizer = tokenizer.texts_to_sequences(
            self._X), tokenizer
        self._X = pad_sequences(self._X,
                                maxlen=84,
                                padding="pre",
                                truncating="post")

        self._index2word = self._X_tokenizer.index_word
        self._word2index = self._X_tokenizer.word_index

        self._index2word.update({0: 'pad'})
        self._word2index.update({'pad': 0})

        self._Y = deepcopy(filtered_df['variety'])
        self._Y.replace(self._variety2index, inplace=True)
        self._Y = np.array(self._Y.tolist())
Example #26
    def closure(mu):
        (x_train, y_train), (_, _) = imdb.load_data()
        tokenizer = Tokenizer(num_words=5000)
        tokenizer.fit_on_sequences(x_train)
        x_train = tokenizer.sequences_to_matrix(x_train, "tfidf")
        # Note: svd_solver=full is needed on GPU server
        x_train = PCA(n_components=100, svd_solver='full').fit_transform(x_train)
        ds = {"data": x_train, "target": y_train}

        # Apply noise and return
        res = preprocess_and_noise(dataset=ds, mu=mu)
        return res
Example #27
    def createModel(self, text):
        self.embeddings_index = {}
        f = open(os.path.join(GLOVE_DIR, 'glove.6B.300d.txt'))
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            self.embeddings_index[word] = coefs
        f.close()

        print('Found %s word vectors.' % len(self.embeddings_index))
        tokenizer = Tokenizer(num_words=self.MAX_NB_WORDS, lower=False)
        tokenizer.fit_on_texts(text)

        self.word_index = tokenizer.word_index

        self.embedding_matrix = np.zeros(
            (len(self.word_index) + 1, self.EMBEDDING_DIM))
        print(self.embedding_matrix.shape)
        for word, i in self.word_index.items():
            embedding_vector = self.embeddings_index.get(word)
            if embedding_vector is not None:
                # words not found in embedding index will be all-zeros.
                self.embedding_matrix[i] = embedding_vector

        self.embedding_layer = Embedding(len(self.word_index) + 1,
                                         self.EMBEDDING_DIM,
                                         weights=[self.embedding_matrix],
                                         input_length=70,
                                         trainable=False)
        self.model = Sequential()
        self.model.add(self.embedding_layer)
        self.model.add(
            Bidirectional(
                LSTM(200,
                     dropout=0.3,
                     recurrent_dropout=0.7,
                     return_sequences=True))
        )  #{'sum', 'mul', 'concat', 'ave', None}
        # self.model.add(TimeDistributed(Bidirectional(LSTM(60, dropout=0.2, recurrent_dropout=0.5, return_sequences=True))))
        #self.model.add(TimeDistributed(Dense(50, activation='relu')))
        self.model.add(TimeDistributed(Dense(
            9,
            activation='softmax')))  # a dense layer as suggested by neuralNer
        #crf = CRF(17, sparse_target=True)
        #self.model.add(crf)
        #self.model.compile(loss=crf_loss, optimizer='adam', metrics=[crf_viterbi_accuracy])
        self.model.compile(loss="categorical_crossentropy",
                           optimizer='rmsprop',
                           metrics=['accuracy'])
        self.model.summary()
        pass
Example #28
def topXInSet(outFile, x):
    with open('reducedCombined(no gov).txt', 'r', encoding='utf-8') as file, \
            open(outFile, 'w', encoding='utf-8') as target:
        f = file.readlines()

        random.shuffle(f)

        tk = Tokenizer()
        tk.fit_on_texts(f)

        tfList = []

        start = perf_counter()

        # x = 6 -> 9000 words
        # x = 8 -> 7000 words
        # x = 10 -> 5500 words
        # x = 15 -> 3500 words
        # x = 20 -> 2750 words
        stopNum = round(1 / x * (len(tk.word_index)))

        for n in range(len(f)):
            keep = True
            for word in f[n].split():
                if keep is True:
                    for num, entry in enumerate(list(tk.word_index.keys())):
                        word = ''.join(
                            c for c in word if c not in '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n').lower()
                        if entry == word:
                            break
                        if num == stopNum:
                            keep = False
                            break
            tfList.append(keep)

        stop = perf_counter()

        i = 0
        for tf in tfList:
            if tf:
                i = i + 1

        print("Trues:", i)
        print("Time to finish:", stop - start)

        for x in range(len(f)):
            if tfList[x] is True:
                target.write(f[x])

        print("\n" + str(stopNum))
        print(len(f))
        print(len(tfList))
Example #29
def tokenize_http_status(data):

    if config.SAVE:
        tokenizer = Tokenizer(num_words=20, filters='', oov_token=0)
        tokenizer.fit_on_texts(data.astype(str))
        save_tokenizer(tokenizer, "status")

    if not config.SAVE:
        tokenizer = load_tokenizer("status")

    data = tokenizer.texts_to_sequences(data.astype(str))
    data = numpy.array(data)
    return data
Example #30
    def run(self):
        self.dataset.load()

        X = self.dataset.X_train_labeled['moment'].values
        X = np.append(X,
                      self.dataset.X_train_unlabeled['moment'].values,
                      axis=0)
        X = np.append(X, self.dataset.X_test['moment'].values, axis=0)

        tokenizer = Tokenizer()
        tokenizer.fit_on_texts(X)

        self.build_embedding(tokenizer.word_index)
Example #31
def tokenlize_text(max_num_words, max_seq_length, x_train):
    """Tokenlize text.

    Vectorize a text corpus by transform each text in texts to a sequence of integers.

    Args:
        max_num_words: Int, max number of words in the dictionary.
        max_seq_length: Int, the length of each text sequence, padding if shorter, trim is longer.
        x_train: List contains text data.

    Returns:
        x_train: Tokenlized input data.
        word_index: Dictionary contains word with tokenlized index.
    """
    from keras_preprocessing.sequence import pad_sequences
    from keras_preprocessing.text import Tokenizer
    print("tokenlizing texts...")
    tokenizer = Tokenizer(num_words=max_num_words)
    tokenizer.fit_on_texts(x_train)
    sequences = tokenizer.texts_to_sequences(x_train)
    word_index = tokenizer.word_index
    x_train = pad_sequences(sequences, maxlen=max_seq_length)
    print("data readed and convert to %d length sequences" % max_seq_length)
    return x_train, word_index
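
A brief usage sketch with a toy corpus; the values shown in the comments are illustrative only.

corpus = ["deep learning for text", "text classification with keras"]
x, word_index = tokenlize_text(max_num_words=1000, max_seq_length=10, x_train=corpus)
print(x.shape)          # (2, 10): both sequences padded to length 10
print(len(word_index))  # size of the fitted vocabulary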