def get_model():
    print "Reading text file..."
    dir_path = os.getcwd() + text_file
    txt_file = UFile(dir_path)
    structure_obj = structure.Structure(txt_file.text)
    word_list = structure_obj.prepare_pure_list_of_words()
    # reserve a vocabulary slot for out-of-vocabulary words
    word_list.append(unknown)
    vocabulary = sorted(list(set(word_list)))
    word_to_int, int_to_word = equivalent_word_to_int(vocabulary)
    structure_obj.generate_tags_dict()
    tags_dict = collections.OrderedDict(sorted(structure_obj.tags.items()))
    tag_to_int, int_to_tag = equivalent_tag_to_int(tags_dict)
    semantic_vector_obj = sv.SemanticVector(structure_obj)
    word2vec = prepare_word_2_vec(semantic_vector_obj)
    print "Start Modeling..."
    embedding_matrix = prepare_embedding(word_list, word2vec, word_to_int)
    nb_classes = len(vocabulary)
    model = modeling(embedding_matrix, len(word_list), 0.05, nb_classes)
    print len(word2vec.wv.vocab)
    train_X, train_y = prepare_multi_layer_train_sequence(
        tag_to_int, word_to_int, word_list, structure_obj.sentences_obj,
        len(vocabulary), is_sparse=True)
    test_X = prepare_test_sequences(tag_to_int, word_to_int)
    train_model(model, train_X, train_y, 1, 128, int_to_word, test_X,
                structure_obj, tag_to_int)
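
# equivalent_word_to_int (and its tag counterpart) are not defined in this
# excerpt. A minimal sketch, mirroring the inline dict construction used in
# tags_model further down:
def equivalent_word_to_int(vocabulary):
    # forward and reverse lookup tables over the sorted vocabulary
    word_to_int = dict((w, i) for i, w in enumerate(vocabulary))
    int_to_word = dict((i, w) for i, w in enumerate(vocabulary))
    return word_to_int, int_to_word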
def model(self):
    struct = structure.Structure(self.file.text)
    seq_length = 15
    word_list = struct.prepare_pure_list_of_words()
    # compute the vocabulary size
    vocabulary = sorted(list(set(word_list)))
    vocab_length = len(vocabulary)
    struct.generate_tags_dict()
    # semantic modeling
    semantic = StructureModel.semantic_model(struct)
    # tags modeling
    tag_dict, tag_model = StructureModel.tags_model(struct, seq_length)
    # data preparation: tags
    tag_list = struct.tagged_text.split()
    tag_set = sorted(list(set(tag_list)))
    tags_array, tags_to_int, int_to_tags, tagsX, tagsY = \
        StructureModel.data_preparation(tag_set, seq_length, tag_list)
    # data preparation: words
    words_array, words_to_int, int_to_words, wordsX, wordsY = \
        StructureModel.data_preparation(vocabulary, seq_length, word_list)
    nb_patterns = len(wordsX)
    print 'nb_patt', nb_patterns
    # word modeling
    word_model = StructureModel.word_model(struct, seq_length, vocab_length,
                                           nb_patterns)
    model = StructureModel.combine_model(struct, tag_model, word_model,
                                         seq_length, vocab_length)
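
# StructureModel.data_preparation is only called in this excerpt, never
# defined. A minimal sketch (a classmethod in the real code base), assuming
# it builds the same sliding-window patterns that tags_model builds for tags:
# lookup tables plus seq_length-long input windows with the following item
# as the target:
def data_preparation(item_set, seq_length, item_list):
    item_to_int = dict((c, i) for i, c in enumerate(item_set))
    int_to_item = dict((i, c) for i, c in enumerate(item_set))
    dataX = []
    dataY = []
    for i in range(0, len(item_list) - seq_length):
        # window of seq_length items predicts the item that follows it
        dataX.append([item_to_int[item]
                      for item in item_list[i:i + seq_length]])
        dataY.append(item_to_int[item_list[i + seq_length]])
    return item_set, item_to_int, int_to_item, dataX, dataY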
def get_model():
    print "Reading text file..."
    txt_file = UFile(text_file)
    chars, char_to_int, int_to_char = discover_characters(txt_file.text)
    structure_obj = structure.Structure(txt_file.text)
    word_list = structure_obj.prepare_pure_list_of_words()
    vocabulary = sorted(list(set(word_list)))
    word_to_int, int_to_word = equivalent_word_to_int(vocabulary)
    semantic_vector_obj = sv.SemanticVector(structure_obj)
    word2vec = prepare_word_2_vec(semantic_vector_obj)
    print "Start Modeling..."
    embedding_matrix = prepare_embedding(word_list, word2vec, word_to_int)
    nb_classes = len(vocabulary)
    model = word2vec_model(embedding_matrix, len(word_list), 0.05, nb_classes,
                           len(chars))
    print len(word2vec.wv.vocab)
    train_X, train_y = prepare_train_sequences_for_sparse(
        word_to_int, word_list, structure_obj.sentences_obj)
    # character-level targets replace the word-level train_y prepared above
    train_y = generate_sequence_character(chars, char_to_int,
                                          structure_obj.sentences_obj,
                                          len(train_X))
    train_model(model, train_X, train_y, 1, 128, int_to_word, word2vec,
                word_to_int)
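
# discover_characters is not shown here. A minimal sketch, assuming it is the
# character-level analogue of the word lookup tables above:
def discover_characters(text):
    # sorted set of every character in the corpus, with index mappings
    chars = sorted(list(set(text)))
    char_to_int = dict((c, i) for i, c in enumerate(chars))
    int_to_char = dict((i, c) for i, c in enumerate(chars))
    return chars, char_to_int, int_to_char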
def model(self):
    struct = structure.Structure(self.file.text)
    seq_length = 7
    word_list = struct.prepare_pure_list_of_words()
    # compute the vocabulary size
    vocabulary = sorted(list(set(word_list)))
    vocab_length = len(vocabulary)
    struct.generate_tags_dict()
    # semantic modeling
    semantic = StructureModel.semantic_model(struct, seq_length, w2v_size)
    StructureModel.word_model(struct, seq_length, semantic.model, word_list,
                              vocabulary, vocab_length)
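
# StructureModel.semantic_model is not shown in this excerpt; w2v_size is a
# module-level constant here. A loose sketch, assuming it trains a gensim
# Word2Vec (pre-4.0 API, hence the `size` argument, consistent with this
# Python 2 code base) and exposes it as `.model`, as the call site above
# expects. The function name and token handling are assumptions:
from gensim.models import Word2Vec

def semantic_model_sketch(struct, seq_length, w2v_size):
    # one flat token stream; the real code may split per sentence instead
    sentences = [struct.prepare_pure_list_of_words()]
    semantic = sv.SemanticVector(struct)
    semantic.model = Word2Vec(sentences, size=w2v_size, window=seq_length,
                              min_count=1)
    return semantic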
def load_test_data(cls, seq_length, word_to_int):
    test_data_file = UFile('test_hafez.txt')
    test_data_structure = structure.Structure(test_data_file.text)
    test_data_word_list = test_data_structure.prepare_pure_list_of_words()
    dataX = []
    for i in range(0, len(test_data_word_list) - seq_length):
        seq_in = test_data_word_list[i:i + seq_length]
        tempX = []
        for word in seq_in:
            # map out-of-vocabulary words to index 0
            if word in word_to_int:
                tempX.append(word_to_int[word])
            else:
                tempX.append(0)
        dataX.append(tempX)
    return dataX
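
# Usage sketch: feeding one test window to a trained model, reshaped and
# normalized the same way tags_model treats its patterns below. `model`,
# `seq_length`, and `vocab_length` are assumed to exist in the calling scope:
#
#     dataX = StructureModel.load_test_data(seq_length, word_to_int)
#     x = numpy.reshape(dataX[0], (1, seq_length, 1)) / float(vocab_length)
#     prediction = model.predict(x, verbose=0)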
def prepare_test_sequences(tag2int, word2int):
    print "preparing test sequences"
    txt_file = UFile(os.getcwd() + test_text_file)
    structure_obj = structure.Structure(txt_file.text)
    word_list = structure_obj.prepare_pure_list_of_words()
    structure_obj.generate_tags_dict()
    vocabulary = sorted(list(set(word_list)))
    test_tagX = prepare_tag_train_sequences(tag2int,
                                            structure_obj.sentences_obj)
    test_word_X, test_word_Y = prepare_train_sequences(
        word2int, word_list, structure_obj.sentences_obj, len(vocabulary),
        is_test=True)
    return [test_tagX, test_word_X]
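
# Usage sketch: the [tags, words] pair returned above lines up with a
# two-input Keras model; `combined_model` is an assumed name, not defined in
# this excerpt:
#
#     test_tagX, test_word_X = prepare_test_sequences(tag_to_int, word_to_int)
#     predictions = combined_model.predict(
#         [numpy.array(test_tagX), numpy.array(test_word_X)])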
def get_model():
    print "Reading text file..."
    dir_path = os.getcwd() + text_file
    txt_file = UFile(dir_path)
    structure_obj = structure.Structure(txt_file.text)
    word_list = structure_obj.prepare_pure_list_of_words()
    vocabulary = sorted(list(set(word_list)))
    word_to_int, int_to_word = equivalent_word_to_int(vocabulary)
    semantic_vector_obj = sv.SemanticVector(structure_obj)
    word2vec = prepare_word_2_vec(semantic_vector_obj)
    print "Start Modeling..."
    embedding_matrix = prepare_embedding(word_list, word2vec, word_to_int)
    nb_classes = len(vocabulary)
    model = word2vec_model(embedding_matrix, len(word_list), 0.05, nb_classes)
    print len(word2vec.wv.vocab)
    train_X, train_y = prepare_train_sequences_for_sparse(
        word_to_int, word_list, structure_obj.sentences_obj)
    train_model(model, train_X, train_y, 1, 128, int_to_word, word2vec,
                word_to_int)
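
# prepare_embedding is not defined in this excerpt. A common sketch for
# building an embedding matrix from gensim vectors, assuming rows are indexed
# by word_to_int and the vector size (100 here) matches the trained word2vec
# model; rows for words missing from the word2vec vocabulary stay zero:
def prepare_embedding_sketch(word_list, word2vec, word_to_int, w2v_size=100):
    embedding_matrix = numpy.zeros((len(word_to_int), w2v_size))
    for word, i in word_to_int.items():
        if word in word2vec.wv.vocab:  # pre-4.0 gensim vocabulary check
            embedding_matrix[i] = word2vec.wv[word]
    return embedding_matrix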
def tags_model(cls, structure, seq_length, word2vec):
    # report the average sentence length, for reference
    total = 0
    for t in structure.sentences_obj:
        total += t.sentence_len
    avg = total / len(structure.sentences_obj)
    print "average length of sentence", avg
    tags_dict = collections.OrderedDict(sorted(structure.tags.items()))
    tags_len = len(tags_dict)
    word_list = structure.prepare_pure_list_of_words()
    vocabulary = sorted(list(set(word_list)))
    word_to_int = dict((c, i) for i, c in enumerate(vocabulary))
    int_to_word = dict((i, c) for i, c in enumerate(vocabulary))
    tag_to_int = dict((c, i) for i, c in enumerate(tags_dict))
    int_to_tag = dict((i, c) for i, c in enumerate(tags_dict))
    dataX = []
    wordsX = []
    dataY = []
    tagged_text = structure.tagged_text.split()
    n_tags_in_text = len(tagged_text)
    # sliding window: seq_length tags (and their words) predict the next tag
    for i in range(0, n_tags_in_text - seq_length, 1):
        seq_in = tagged_text[i:i + seq_length]
        word_in = word_list[i:i + seq_length]
        seq_out = tagged_text[i + seq_length]
        dataX.append([tag_to_int[tag] for tag in seq_in])
        wordsX.append([word_to_int[word] for word in word_in])
        dataY.append(tag_to_int[seq_out])
    n_patterns = len(dataX)
    # reshape X to be [samples, time steps, features]
    X = numpy.reshape(dataX, (n_patterns, seq_length, 1))
    # normalize
    X = X / float(tags_len)
    # one hot encode the output variable
    y = np_utils.to_categorical(dataY)
    print y.shape
    # define the stacked GRU model
    tag_model = Sequential()
    nn = 16
    tag_model.add(GRU(nn * 4, return_sequences=True,
                      input_shape=(X.shape[1], X.shape[2])))
    tag_model.add(Dropout(0.02))
    tag_model.add(GRU(nn * 3, return_sequences=True))
    tag_model.add(Dropout(0.02))
    tag_model.add(GRU(nn * 2, return_sequences=True))
    tag_model.add(Dropout(0.02))
    tag_model.add(GRU(nn, return_sequences=False))
    tag_model.add(Dropout(0.02))
    # softmax output to pair with categorical cross-entropy; no dropout
    # after the output layer
    tag_model.add(Dense(y.shape[1], activation='softmax'))
    tag_model.compile(loss='categorical_crossentropy', optimizer='rmsprop')
    # train briefly, then sample a tag from the model
    for rn in range(1):
        print rn
        tag_model.fit(X, y, nb_epoch=1, batch_size=512)
        # pick a random seed pattern
        start = numpy.random.randint(0, len(dataX) - 1)
        pattern = dataX[start]
        word_pattern = wordsX[start]
        print "Seed:"
        print "\"", ' '.join([int_to_word[value]
                              for value in word_pattern]), "\""
        rs = []
        for i in range(1):
            x = numpy.reshape(pattern, (1, len(pattern), 1))
            x = x / float(tags_len)
            prediction = tag_model.predict(x, verbose=0)
            print prediction
            print 'size: ', len(prediction)
            # temperature sampling instead of a plain argmax
            index = StructureModel.sample(prediction[0], 2.0)
            result = int_to_tag[index]
            # the last five words of the seed give the lexical context
            word_window = [int_to_word[value] for value in word_pattern[-5:]]
            StructureModel.find_nearest_word(word2vec, word_window, result,
                                             structure)
            rs.append(index)
            pattern.append(index)
            pattern = pattern[1:len(pattern)]
    print "\nDone."
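
# StructureModel.sample is called with a temperature of 2.0 above but not
# defined in this excerpt. A standard temperature-sampling sketch (a
# classmethod on StructureModel in the real code base):
def sample(preds, temperature=1.0):
    # rescale the predicted distribution in log space, then draw one index;
    # higher temperatures flatten the distribution and add variety
    preds = numpy.asarray(preds).astype('float64')
    preds = numpy.log(preds + 1e-8) / temperature
    exp_preds = numpy.exp(preds)
    preds = exp_preds / numpy.sum(exp_preds)
    probas = numpy.random.multinomial(1, preds, 1)
    return numpy.argmax(probas)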