className2Id['I_PERSON'] = 6
num_categories = len(className2Id)

print('Loading data... (Train)')
(X1, y_train) = deepctxt_util.load_sequence_data_x_y('./data/train.cleaned.tsv', className2Id)
y_train = encode_category_vector.one_hot_category(y_train, num_categories)
print('Done')

print('Loading data... (Test)')
(X3, y_test) = deepctxt_util.load_sequence_data_x_y('./data/test.cleaned.tsv', className2Id)
y_test = encode_category_vector.one_hot_category(y_test, num_categories)
print('Done')

print('Converting data... (Train)')
X_train = tokenizer.texts_to_sequences(X1, maxlen)
print('Done')

print('Converting data... (Test)')
X_test = tokenizer.texts_to_sequences(X3, maxlen)
print('Done')

print(len(X_train), 'y_train sequences')
print(len(X_test), 'y_test sequences')

print("Pad sequences (samples x time)")
X_train = sequence.pad_sequences(X_train, maxlen=maxlen)
Y_train = sequence.pad_sequences(y_train, maxlen=maxlen)
X_test = sequence.pad_sequences(X_test, maxlen=maxlen)
Y_test = sequence.pad_sequences(y_test, maxlen=maxlen)
print('X_train shape:', X_train.shape)
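For context, `encode_category_vector.one_hot_category` is a project-local helper whose implementation is not shown in these fragments. A minimal sketch of what it presumably does — turning each per-token sequence of integer label ids into one-hot vectors — written as an assumption, not the actual implementation:

import numpy as np

def one_hot_category(label_seqs, num_categories):
    # Hypothetical re-implementation (assumption): each sequence of integer
    # label ids becomes a (seq_len, num_categories) array of one-hot rows,
    # matching the per-token targets the tagger is trained against.
    encoded = []
    for seq in label_seqs:
        one_hot = np.zeros((len(seq), num_categories), dtype='int32')
        for i, label_id in enumerate(seq):
            one_hot[i, label_id] = 1
        encoded.append(one_hot)
    return encoded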
print('Loading data... (Test)')
(x_test, y_test_class) = deepctxt_util.load_sequence_raw_data_x_y('./data/test.cleaned.tsv')

# Map each token's tag name to its integer id before one-hot encoding.
y_test = []
for class_name_array in y_test_class:
    y = []
    for class_name in class_name_array:
        class_id = className2Id[class_name]
        y.append(class_id)
    y_test.append(y)
y_test = encode_category_vector.one_hot_category(y_test, num_categories)
print('Done')

print('Converting data... (Test)')
x_test = tokenizer.texts_to_sequences(x_test, maxlen)
print('Done')

print(len(x_test), 'y_test sequences')

#print("Pad sequences (samples x time)")
X_test = sequence.pad_sequences(x_test, maxlen=maxlen)
Y_test = sequence.pad_sequences(y_test, maxlen=maxlen)
#print('X_test shape:', X_test.shape)

print('Load model...')
with open('./query_ner_birnn_lstm_glove_100.15b.json', 'r') as f:
    model_string = f.read()
model = model_from_json(model_string)
model.load_weights('./query_ner_birnn_lstm_glove_100.15b.h5')
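With the tagger loaded, per-token predictions can be mapped back to tag names by inverting className2Id. A minimal sketch, assuming the model emits one probability distribution over tags per time step for the left-padded inputs above (variable usage is illustrative, not part of the original script):

import numpy as np

# Invert the tag map so predicted ids can be printed as tag names.
id2className = dict((v, k) for (k, v) in className2Id.items())

pred = model.predict(X_test[:1])        # assumed shape: (1, maxlen, num_categories)
tag_ids = np.argmax(pred[0], axis=-1)   # most likely tag id at each time step
tags = [id2className[int(i)] for i in tag_ids]
print(tags)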
#tokenizer.load('./glove.42B.300d.txt')
print('Done')

print('Loading model')
with open("./coarse_type_model_lstm_glove_100b.json", "r") as f:
    json_string = f.readline()
model = model_from_json(json_string)
print('Done')

print('Compile model')
model.compile(loss='categorical_crossentropy', optimizer='adam')
print('Done')

print('Loading weights')
model.load_weights('./coarse_type_model_lstm_glove_100b.h5')
print('Done')

idx2type = {0: "DESCRIPTION", 1: "NUMERIC", 2: "ENTITY", 3: "PERSON", 4: "LOCATION"}

while True:
    print("===============================================")
    query = raw_input('Enter query: ')
    X1 = [query]
    X2 = tokenizer.texts_to_sequences(X1, maxlen)
    X = sequence.pad_sequences(X2, maxlen=maxlen)
    pred = model.predict_proba(X, batch_size=1)
    idx = np.argmax(pred[0])
    print("Type=" + idx2type[idx])
    print(pred)
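Inside the loop, `pred[0]` holds one probability per coarse type, so the demo can show more than the single best guess. A small sketch that ranks all five types by probability — an addition for illustration, not part of the original demo:

# Rank all coarse types by predicted probability, best first.
ranked = sorted(enumerate(pred[0]), key=lambda p: p[1], reverse=True)
for idx, prob in ranked:
    print("%-12s %.4f" % (idx2type[idx], prob))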
print('Loading tokenizer')
tokenizer.load('./glove.6B.100d.txt')
#tokenizer.load('./glove.42B.300d.txt')
print('Done')

max_features = tokenizer.n_symbols
vocab_dim = tokenizer.vocab_dim

print('Loading data... (Test)')
#(X2, y_test) = deepctxt_util.load_raw_data_x_y(path='./raw_data/bing_query.tsv', y_shift=0)
(X2, y_test) = deepctxt_util.load_raw_data_x_y(path='./raw_data/person_birthday_deep_learning_eval_rawquery_cleaned.tsv', y_shift=0)
print('Done')

print('Converting data... (Test)')
X_test = tokenizer.texts_to_sequences(X2, maxlen)
print('Done')

print(len(X_test), 'y_test sequences')

nb_classes = np.max(y_test) + 1
Y_test = np_utils.to_categorical(y_test, nb_classes)
print('Y_test shape:', Y_test.shape)

print("Pad sequences (samples x time)")
X_test = sequence.pad_sequences(X_test, maxlen=maxlen)
print('X_test shape:', X_test.shape)

print('Load model...')
with open("./coarse_type_model_lstm_glove_100b.json", "r") as f:
    json_string = f.readline()
model = model_from_json(json_string)
print('Done')

print('Compile model')
model.compile(loss='categorical_crossentropy', optimizer='adam')
print('Done')

print('Loading weights')
model.load_weights('./coarse_type_model_lstm_glove_100b.h5')
print('Done')
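The fragment stops after loading the weights. A minimal sketch of the evaluation step that would naturally follow, assuming the same old-Keras API used elsewhere in these scripts (predict_proba plus argmax, compared against the integer labels in y_test):

import numpy as np

# Score the padded test queries and compare the argmax class
# against the integer labels loaded from the eval TSV.
pred = model.predict_proba(X_test, batch_size=128)
pred_classes = np.argmax(pred, axis=-1)
accuracy = np.mean(pred_classes == np.asarray(y_test))
print('Test accuracy: %.4f' % accuracy)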