def main(unused_argv):
    """Train the news CNN on stemmed descriptions and log its test accuracy.

    Reads DATA_SET_FILE, shuffles the rows, takes the first
    TRAINING_DATA_SIZE rows for training and the remainder for testing,
    saves the vocabulary artefacts, then fits and scores a `learn.Estimator`.

    Args:
        unused_argv: Ignored.
    """
    if REMOVE_PREVIOUS_MODEL:
        # Start from an empty model directory. The existence check keeps the
        # first run (no directory yet) from failing inside rmtree.
        if os.path.exists(MODEL_OUTPUT_DIR):
            shutil.rmtree(MODEL_OUTPUT_DIR)
        os.mkdir(MODEL_OUTPUT_DIR)

    # Prepare training and testing data.
    df = pd.read_csv(DATA_SET_FILE, header=None)

    # DataFrame.sample returns a new frame; the result has to be kept for the
    # random shuffle to take effect.
    df = df.sample(frac=1).reset_index(drop=True)

    train_df = df[0:TRAINING_DATA_SIZE]
    test_df = df.drop(train_df.index)

    # Column 2 - news description, column 0 - class.
    x_train = train_df[2]
    x_test = test_df[2]
    y_train = train_df[0]
    y_test = test_df[0]

    # Tokenize sentences.
    x_train = [word_tokenize(s) for s in x_train.tolist()]
    x_test = [word_tokenize(s) for s in x_test.tolist()]

    # Stem words (stemWords is defined elsewhere in this module).
    x_train = stemWords(x_train)
    x_test = stemWords(x_test)

    # Build the vocabulary on the training set only, then reuse it for the
    # test set so each word keeps the same index.
    vocab_processor = learn.preprocessing.VocabularyProcessor(
        MAX_DOCUMENT_LENGTH)
    x_train = np.array(list(vocab_processor.fit_transform(x_train)))
    x_test = np.array(list(vocab_processor.transform(x_test)))

    n_words = len(vocab_processor.vocabulary_)
    LOGGER.debug('Total words: %d', n_words)

    # Persist n_words (needed by news_cnn_model) and the vocabulary processor
    # so the same word indices can be used later.
    with open(VARS_FILE, 'wb') as f:  # pickle needs binary mode
        pickle.dump(n_words, f)
    vocab_processor.save(VOCAB_PROCESSOR_SAVE_FILE)

    # Build model.
    classifier = learn.Estimator(
        model_fn=news_cnn_model.generate_cnn_model(N_CLASSES, n_words),
        model_dir=MODEL_OUTPUT_DIR)

    # Train.
    classifier.fit(x_train, y_train, steps=STEPS)

    # Evaluate on the held-out rows.
    y_predicted = [
        p['class'] for p in classifier.predict(x_test, as_iterable=True)
    ]
    score = metrics.accuracy_score(y_test, y_predicted)
    LOGGER.info('Accuracy: %f', score)
def main(unused_argv):
    """Train the news-topic CNN on titles and print its test accuracy.

    The first 400 rows of DATA_SET_FILE are used for training and the rest
    for testing. The vocabulary size and the fitted vocabulary processor are
    saved to VARS_FILE and VOCAB_PROCESSOR_SAVE_FILE.

    Args:
        unused_argv: Ignored.
    """
    if REMOVE_PREVIOUS_MODEL:
        # Remove the old model; a missing directory is not an error.
        if os.path.exists(MODEL_OUTPUT_DIR):
            shutil.rmtree(MODEL_OUTPUT_DIR)
        os.mkdir(MODEL_OUTPUT_DIR)

    # Load the training and testing data.
    # Row format of the data set: topic, title, description, source.
    df = pd.read_csv(DATA_SET_FILE, header=None)
    train_df = df[0:400]
    test_df = df.drop(train_df.index)

    # x is the news title, y is the news topic (class).
    x_train = train_df[1]
    y_train = train_df[0]
    x_test = test_df[1]
    y_test = test_df[0]

    # Process vocabulary.
    vocab_processor = learn.preprocessing.VocabularyProcessor(
        MAX_DOCUMENT_LENGTH)

    # After fit_transform, x_train looks like:
    # array([[   1,    2,    3, ...,    0,    0,    0],
    #        [   1,    8,    9, ...,    0,    0,    0],
    #        [  17,    1,   18, ...,    0,    0,    0],
    #        ...,
    #        [2112, 2113, 1417, ...,    0,    0,    0],
    #        [2120,   49, 2121, ...,    0,    0,    0],
    #        [2123, 1895, 2124, ...,    0,    0,    0]])
    # Each title is converted from a string into a sequence of word ids, so
    # the 400 training titles become 400 id sequences.
    x_train = np.array(list(vocab_processor.fit_transform(x_train)))
    x_test = np.array(list(vocab_processor.transform(x_test)))

    # Number of unique words (2127 on the original author's data set).
    n_words = len(vocab_processor.vocabulary_)
    print('Total words: %d' % n_words)

    # Save the unique-word count and the vocab_processor to files.
    with open(VARS_FILE, 'wb') as f:  # pickle needs binary mode
        pickle.dump(n_words, f)
    vocab_processor.save(VOCAB_PROCESSOR_SAVE_FILE)

    # Build the model.
    classifier = learn.Estimator(
        model_fn=news_cnn_model.generate_cnn_model(N_CLASSES, n_words),
        model_dir=MODEL_OUTPUT_DIR)

    # Train.
    classifier.fit(x_train, y_train, steps=STEPS)

    # Predict on the test set to estimate generalisation performance.
    y_predicted = [
        p['class'] for p in classifier.predict(x_test, as_iterable=True)
    ]
    score = metrics.accuracy_score(y_test, y_predicted)
    print('Accuracy: {0:f}'.format(score))
def main(unused_argv):
    """Train the news CNN on column 2 of the data set and print test accuracy.

    The first 700 rows of DATA_SET_FILE are used for training and the rest
    for testing. The vocabulary size and the fitted vocabulary processor are
    saved to VARS_FILE and VOCAB_PROCESSOR_SAVE_FILE.

    Args:
        unused_argv: Ignored.
    """
    if REMOVE_PREVIOUS_MODEL:
        if os.path.exists(MODEL_OUTPUT_DIR):
            # Remove old model.
            shutil.rmtree(MODEL_OUTPUT_DIR)
        os.mkdir(MODEL_OUTPUT_DIR)

    # Prepare training and testing data.
    df = pd.read_csv(DATA_SET_FILE, header=None)

    train_df = df[0:700]
    # x - text column 2, y - class (column 0).
    # NOTE(review): the original comment called column 2 the "news title";
    # confirm against the data set layout.
    x_train = train_df[2]
    # y_train holds one label per entry in x_train.
    y_train = train_df[0]

    test_df = df.drop(train_df.index)
    x_test = test_df[2]
    y_test = test_df[0]

    # Process vocabulary.
    vocab_processor = learn.preprocessing.VocabularyProcessor(
        MAX_DOCUMENT_LENGTH)

    # fit_transform: the vocabulary is learned from x_train.
    # x_train becomes [entries in x_train, MAX_DOCUMENT_LENGTH].
    x_train = np.array(list(vocab_processor.fit_transform(x_train)))
    # transform: x_test is mapped with the vocabulary learned from x_train.
    # x_test becomes [entries in x_test, MAX_DOCUMENT_LENGTH].
    x_test = np.array(list(vocab_processor.transform(x_test)))

    # Number of entries in the vocabulary built from x_train.
    n_words = len(vocab_processor.vocabulary_)
    print('Total words: %d' % n_words)

    # Save n_words and vocab_processor.
    with open(VARS_FILE, 'wb') as f:  # pickle needs binary mode
        pickle.dump(n_words, f)
    vocab_processor.save(VOCAB_PROCESSOR_SAVE_FILE)

    # Build model.
    classifier = learn.Estimator(
        model_fn=news_cnn_model.generate_cnn_model(N_CLASSES, n_words),
        model_dir=MODEL_OUTPUT_DIR)

    # Train.
    classifier.fit(x_train, y_train, steps=STEPS)

    # Evaluate model.
    y_predicted = [
        p['class'] for p in classifier.predict(x_test, as_iterable=True)
    ]
    score = metrics.accuracy_score(y_test, y_predicted)
    print('Accuracy: {0:f}'.format(score))
def main(unused_argv):
    """Train the topic model on news titles and print its test accuracy.

    Reads DATA_SET_FILE (with a header row), maps the 'class' column through
    CLASS_ENCODING, fills missing titles with 'Untitled', and splits the rows
    into train/test by TRAIN_TEST_SPLIT. The vocabulary size and the fitted
    vocabulary processor are saved for later use.

    Args:
        unused_argv: Ignored.
    """
    if REMOVE_PREVIOUS_MODEL:
        # Remove old model; tolerate a missing directory on the first run.
        if os.path.exists(MODEL_OUTPUT_DIR):
            shutil.rmtree(MODEL_OUTPUT_DIR)
        os.mkdir(MODEL_OUTPUT_DIR)

    # Prepare training and testing data.
    df = pd.read_csv(DATA_SET_FILE)
    df['class'] = df['class'].map(CLASS_ENCODING)
    # Assign the result back: an in-place fillna on a selected column is a
    # chained assignment and is not guaranteed to update df itself.
    df['title'] = df['title'].fillna('Untitled')

    num_train = int(len(df) * TRAIN_TEST_SPLIT)
    train_df = df[0:num_train]
    test_df = df.drop(train_df.index)

    # x - news title, y - class.
    # Only the title is used to train the topic model.
    x_train = train_df['title']
    y_train = train_df['class']
    x_test = test_df['title']
    y_test = test_df['class']

    # Process vocabulary.
    vocab_processor = learn.preprocessing.VocabularyProcessor(
        MAX_DOCUMENT_LENGTH)
    x_train = np.array(list(vocab_processor.fit_transform(x_train)))
    x_test = np.array(list(vocab_processor.transform(x_test)))

    n_words = len(vocab_processor.vocabulary_)
    print('Total words: %d' % n_words)

    # Save n_words and vocab_processor.
    with open(VARS_FILE, 'wb') as f:  # needs to be opened in binary mode
        pickle.dump(n_words, f)
    vocab_processor.save(VOCAB_PROCESSOR_SAVE_FILE)

    # Build model: an Estimator driven by the custom model_fn.
    classifier = learn.Estimator(
        model_fn=news_cnn_model.generate_cnn_model(N_CLASSES, n_words),
        model_dir=MODEL_OUTPUT_DIR)

    # Train.
    classifier.fit(x_train, y_train, steps=STEPS)

    # Evaluate model.
    y_predicted = [
        p['class'] for p in classifier.predict(x_test, as_iterable=True)
    ]
    score = metrics.accuracy_score(y_test, y_predicted)
    print('Accuracy: {0:f}'.format(score))
def main(unused_argv):
    """Train the news CNN on titles and print its test accuracy.

    The first 400 rows of DATA_SET_FILE are used for training and the rest
    for testing. The vocabulary size and the fitted vocabulary processor are
    saved to VARS_FILE and VOCAB_PROCESSOR_SAVE_FILE.

    Args:
        unused_argv: Ignored.
    """
    if REMOVE_PREVIOUS_MODEL:
        # Remove old model. Guarded so a first run without an existing model
        # directory does not fail inside rmtree.
        if os.path.exists(MODEL_OUTPUT_DIR):
            shutil.rmtree(MODEL_OUTPUT_DIR)
        os.mkdir(MODEL_OUTPUT_DIR)

    # Prepare training and testing data.
    df = pd.read_csv(DATA_SET_FILE, header=None)
    train_df = df[0:400]
    test_df = df.drop(train_df.index)

    # x - news title (column 1), y - class (column 0).
    x_train = train_df[1]
    y_train = train_df[0]
    x_test = test_df[1]
    y_test = test_df[0]

    # Process vocabulary: fit on the training titles, reuse for the test set.
    vocab_processor = learn.preprocessing.VocabularyProcessor(
        MAX_DOCUMENT_LENGTH)
    x_train = np.array(list(vocab_processor.fit_transform(x_train)))
    x_test = np.array(list(vocab_processor.transform(x_test)))

    n_words = len(vocab_processor.vocabulary_)
    print('Total words: %d' % n_words)

    # Save n_words and vocab_processor.
    with open(VARS_FILE, 'wb') as f:  # needs to be opened in binary mode
        pickle.dump(n_words, f)
    vocab_processor.save(VOCAB_PROCESSOR_SAVE_FILE)

    # Build model.
    classifier = learn.Estimator(
        model_fn=news_cnn_model.generate_cnn_model(N_CLASSES, n_words),
        model_dir=MODEL_OUTPUT_DIR)

    # Train.
    classifier.fit(x_train, y_train, steps=STEPS)

    # Evaluate model.
    y_predicted = [
        p['class'] for p in classifier.predict(x_test, as_iterable=True)
    ]
    score = metrics.accuracy_score(y_test, y_predicted)
    print('Accuracy: {0:f}'.format(score))
def loadModel():
    """Rebuild the module-level `classifier` from MODEL_DIR and evaluate it once.

    Replaces the global `classifier` with a fresh `learn.Estimator` and runs
    a single `evaluate` over the first 400 rows of the labeled news CSV,
    encoded with the module-level `vocab_processor`.
    """
    global classifier
    classifier = learn.Estimator(
        model_fn=news_cnn_model.generate_cnn_model(N_CLASSES, n_words),
        model_dir=MODEL_DIR)

    df = pd.read_csv('../data/labeled_news.csv', header=None)
    train_df = df[0:400]
    x_train = np.array(list(vocab_processor.transform(train_df[1])))
    y_train = train_df[0]

    # NOTE(review): this evaluate call looks like a warm-up of the restored
    # estimator and not a real measurement - confirm.
    classifier.evaluate(x_train, y_train)

    # Function-call form: same output on Python 2, and it parses on Python 3.
    print('Model update')
def loadModel():
    """Recreate the global `classifier` from MODEL_DIR and run one evaluation.

    The evaluation over the first 400 labeled rows exists only because a
    restored Estimator has to go through evaluate or predict at least once
    before it works.
    """
    global classifier
    cnn_model_fn = news_cnn_model.generate_cnn_model(N_CLASSES, n_words)
    classifier = learn.Estimator(model_fn=cnn_model_fn, model_dir=MODEL_DIR)

    labeled = pd.read_csv('../data/labeled_news.csv', header=None)

    # We have to call evaluate or predict at least once to make the restored
    # Estimator work.
    warmup_rows = labeled[0:400]
    titles = warmup_rows[1]
    encoded_titles = np.array(list(vocab_processor.transform(titles)))
    labels = warmup_rows[0]
    classifier.evaluate(encoded_titles, labels)

    print("Model updated!")
def loadModel():
    """Rebuild the module-level `classifier` from MODEL_DIR and score it once.

    Replaces the global `classifier` with an `SKCompat`-wrapped Estimator and
    calls `score` on the first row of CSV_FILE, encoded with the module-level
    `vocab_processor`.
    """
    global classifier
    classifier = estimator.SKCompat(estimator.Estimator(
        model_fn=news_cnn_model.generate_cnn_model(N_CLASSES, n_words),
        model_dir=MODEL_DIR))

    df = pd.read_csv(CSV_FILE, header=None)
    # A single row is enough for the call below.
    train_df = df[0:1]
    x_train = np.array(list(vocab_processor.transform(train_df[1])), dtype=int)
    y_train = np.array(train_df[0], dtype=int)

    # NOTE(review): this score call looks like a warm-up of the restored
    # estimator and not a real measurement - confirm.
    classifier.score(x_train, y_train)

    # Function-call form: same output on Python 2, and it parses on Python 3.
    print('Model updated')
def loadModel():
    """Rebuild the module-level `classifier` from MODEL_DIR and evaluate it once.

    Replaces the global `classifier` with a fresh `learn.Estimator` and runs
    `evaluate` on the first row of `labeled_news.csv`, which is looked up one
    directory above this file.
    """
    global classifier
    classifier = learn.Estimator(
        model_fn=news_cnn_model.generate_cnn_model(N_CLASSES, n_words),
        model_dir=MODEL_DIR)

    # Load the labeled data relative to this file, not the working directory.
    csv_path = os.path.join(os.path.dirname(__file__), '..', 'labeled_news.csv')
    df = pd.read_csv(csv_path, header=None)

    # A single row is enough for the call below.
    train_df = df[0:1]
    x_train = np.array(list(vocab_processor.transform(train_df[1])))
    y_train = train_df[0]

    # NOTE(review): this evaluate call looks like a warm-up of the restored
    # estimator and not a real measurement - confirm.
    classifier.evaluate(x_train, y_train)

    # Function-call form: same output on Python 2, and it parses on Python 3.
    print("Model updated.")
def loadModel():
    """Rebuild the module-level `classifier` from MODEL_DIR and evaluate it once.

    Replaces the global `classifier` with a fresh `learn.Estimator`, runs
    `evaluate` on the first `testing_index_end` rows of DATA_FILE, then
    reports the update on stdout and through `logging`.
    """
    global classifier
    classifier = learn.Estimator(
        model_fn=news_cnn_model.generate_cnn_model(
            number_classes, n_words, learning_rate),
        model_dir=MODEL_DIR)

    # Prepare the rows used for the evaluation call.
    df = pd.read_csv(DATA_FILE, header=None)

    # TODO: fix this until https://github.com/tensorflow/tensorflow/issues/5548 is solved.
    # We have to call evaluate or predict at least once to make the restored
    # Estimator work.
    test_df = df[0:testing_index_end]
    x_test = np.array(list(vocab_processor.transform(test_df[1])))
    y_test = test_df[0]
    classifier.evaluate(x_test, y_test)

    # Function-call form: same output on Python 2, and it parses on Python 3.
    print("Model updated.")
    logging.info('news_topic_modeling: model updated')
def loadModel():
    """Reload the global `classifier` from MODEL_DIR and run one evaluation.

    A new `learn.Estimator` is assigned to the module-level `classifier`,
    evaluated on the first `testing_index_end` rows of DATA_FILE, and the
    update is announced on stdout and via `logging`.
    """
    global classifier
    model_fn = news_cnn_model.generate_cnn_model(
        number_classes, n_words, learning_rate)
    classifier = learn.Estimator(model_fn=model_fn, model_dir=MODEL_DIR)

    # Prepare the rows used for the evaluation call.
    df = pd.read_csv(DATA_FILE, header=None)

    # TODO: fix this until https://github.com/tensorflow/tensorflow/issues/5548 is solved.
    # We have to call evaluate or predict at least once to make the restored
    # Estimator work.
    test_df = df[0:testing_index_end]
    x_test = np.array(list(vocab_processor.transform(test_df[1])))
    y_test = test_df[0]
    classifier.evaluate(x_test, y_test)

    # print() behaves the same as the old statement form on Python 2 and is
    # also valid Python 3.
    print("Model updated.")
    logging.info('news_topic_modeling: model updated')
def loadModel():
    """Recreate the global `classifier` from MODEL_DIR and run one evaluation.

    The evaluation over the first 400 labeled rows is the workaround
    described in the TODO below, not a measurement.
    """
    global classifier
    cnn_model_fn = news_cnn_model.generate_cnn_model(N_CLASSES, n_words)
    classifier = learn.Estimator(model_fn=cnn_model_fn, model_dir=MODEL_DIR)

    # Load the labeled data used for the evaluation call.
    labeled = pd.read_csv('../data/labeled_news.csv', header=None)

    # TODO: fix this until https://github.com/tensorflow/tensorflow/issues/5548 is solved.
    # We have to call evaluate or predict at least once to make the restored
    # Estimator work.
    head_rows = labeled[0:400]
    titles = head_rows[1]
    encoded_titles = np.array(list(vocab_processor.transform(titles)))
    labels = head_rows[0]
    classifier.evaluate(encoded_titles, labels)

    print("Model update.")
def main(unused_argv):
    """Train the news CNN on titles and print its test accuracy.

    The first 400 rows of DATA_SET_FILE are used for training and the rest
    for testing. The vocabulary size and the fitted vocabulary processor are
    saved to VARS_FILE and VOCAB_PROCESSOR_SAVE_FILE.

    Args:
        unused_argv: Ignored.
    """
    if REMOVE_PREVIOUS_MODEL:
        # Remove old model; tolerate a missing directory on the first run.
        if os.path.exists(MODEL_OUTPUT_DIR):
            shutil.rmtree(MODEL_OUTPUT_DIR)
        os.mkdir(MODEL_OUTPUT_DIR)

    # Prepare training and testing data.
    df = pd.read_csv(DATA_SET_FILE, header=None)
    train_df = df[0:400]
    test_df = df.drop(train_df.index)

    # x - news title (column 1), y - class (column 0).
    x_train = train_df[1]
    y_train = train_df[0]
    x_test = test_df[1]
    y_test = test_df[0]

    # Process vocabulary: fit on the training titles, reuse for the test set.
    vocab_processor = learn.preprocessing.VocabularyProcessor(MAX_DOCUMENT_LENGTH)
    x_train = np.array(list(vocab_processor.fit_transform(x_train)))
    x_test = np.array(list(vocab_processor.transform(x_test)))

    n_words = len(vocab_processor.vocabulary_)
    print('Total words: %d' % n_words)

    # Save n_words and vocab_processor.
    # pickle writes bytes, so the file must be opened in binary mode.
    with open(VARS_FILE, 'wb') as f:
        pickle.dump(n_words, f)
    vocab_processor.save(VOCAB_PROCESSOR_SAVE_FILE)

    # Build model.
    classifier = learn.Estimator(
        model_fn=news_cnn_model.generate_cnn_model(N_CLASSES, n_words),
        model_dir=MODEL_OUTPUT_DIR)

    # Train.
    classifier.fit(x_train, y_train, steps=STEPS)

    # Evaluate model.
    y_predicted = [
        p['class'] for p in classifier.predict(x_test, as_iterable=True)
    ]
    score = metrics.accuracy_score(y_test, y_predicted)
    print('Accuracy: {0:f}'.format(score))
def loadModel():
    """Rebuild the module-level `classifier` from MODEL_DIR and evaluate it once.

    Replaces the global `classifier` with a fresh `learn.Estimator` and runs
    `evaluate` over the first 400 rows of the labeled news CSV, encoded with
    the module-level `vocab_processor`.
    """
    global classifier
    classifier = learn.Estimator(
        model_fn=news_cnn_model.generate_cnn_model(N_CLASSES, n_words),
        model_dir=MODEL_DIR)

    # Prepare the rows used for the evaluation call.
    df = pd.read_csv('../data/labeled_news.csv', header=None)

    # TODO: fix this until https://github.com/tensorflow/tensorflow/issues/5548 is solved.
    # We have to call evaluate or predict at least once to make the restored
    # Estimator work.
    train_df = df[0:400]
    x_train = np.array(list(vocab_processor.transform(train_df[1])))
    y_train = train_df[0]
    classifier.evaluate(x_train, y_train)

    # Function-call form: same output on Python 2, and it parses on Python 3.
    print("Model update.")
def main(unused_argv):
    """Train the news CNN on titles and print the model's test accuracy.

    The first 400 rows of DATA_SET_FILE are used for training and the rest
    for testing. The vocabulary size and the fitted vocabulary processor are
    saved to VARS_FILE and VOCAB_PROCESSOR_SAVE_FILE.

    Args:
        unused_argv: Ignored.
    """
    if REMOVE_PREVIOUS_MODEL:
        print("Removing previous model...")
        # Guarded so a first run without an existing model directory does not
        # fail inside rmtree.
        if os.path.exists(MODEL_OUTPUT_DIR):
            shutil.rmtree(MODEL_OUTPUT_DIR)
        os.mkdir(MODEL_OUTPUT_DIR)

    df = pd.read_csv(DATA_SET_FILE, header=None)
    train_df = df[0:400]
    test_df = df.drop(train_df.index)

    # x - title (column 1).
    x_train = train_df[1]
    x_test = test_df[1]

    # y - classes (column 0).
    y_train = train_df[0]
    y_test = test_df[0]

    # Fit the vocabulary on the training titles, reuse it for the test set.
    vocab = learn.preprocessing.VocabularyProcessor(MAX_DOCUMENT_LENGTH)
    x_train = np.array(list(vocab.fit_transform(x_train)))
    x_test = np.array(list(vocab.transform(x_test)))

    n_words = len(vocab.vocabulary_)
    print("Total words: %d" % n_words)

    # Save the vocabulary size and the vocabulary processor.
    with open(VARS_FILE, 'wb') as f:
        pickle.dump(n_words, f)
    vocab.save(VOCAB_PROCESSOR_SAVE_FILE)

    # Build model.
    model = learn.Estimator(
        model_fn=news_cnn_model.generate_cnn_model(N_CLASSES, n_words),
        model_dir=MODEL_OUTPUT_DIR)

    # Train model.
    model.fit(x_train, y_train, steps=STEPS)

    # Evaluate model.
    y_predict = [p['class'] for p in model.predict(x_test, as_iterable=True)]
    model_score = metrics.accuracy_score(y_test, y_predict)
    print("Accuracy of the model: {0:f}".format(model_score))
def loadModel():
    """Rebuild the module-level `classifier` from MODEL_DIR and evaluate it once.

    Replaces the global `classifier` with a fresh `learn.Estimator`, then
    performs a dummy `evaluate` on the training portion of
    '../data/labeled_news1.csv' (the first TRAIN_TEST_SPLIT fraction of rows).
    """
    global classifier
    classifier = learn.Estimator(
        model_fn=news_cnn_model.generate_cnn_model(N_CLASSES, n_words),
        model_dir=MODEL_DIR)

    # TODO: fix this until https://github.com/tensorflow/tensorflow/issues/5548 is solved.
    # We have to call evaluate or predict at least once to make the restored
    # Estimator work. Everything below is just a dummy evaluate.
    df = pd.read_csv('../data/labeled_news1.csv')
    df['class'] = df['class'].map(CLASS_ENCODING)
    # Assign the result back: an in-place fillna on a selected column is a
    # chained assignment and is not guaranteed to update df itself.
    df['title'] = df['title'].fillna('Untitled')

    num_train = int(len(df) * TRAIN_TEST_SPLIT)
    train_df = df[0:num_train]
    x_train = train_df['title']
    y_train = train_df['class']

    # Process vocabulary.
    # NOTE(review): this fits a brand-new, function-local vocabulary processor
    # instead of reusing a saved one; the word ids only match the trained
    # model if this data and split equal the ones used for training - confirm.
    vocab_processor = learn.preprocessing.VocabularyProcessor(MAX_DOCUMENT_LENGTH)
    x_train = np.array(list(vocab_processor.fit_transform(x_train)))

    classifier.evaluate(x_train, y_train)
    print("Model update.")
def main(unused_argv):
    """Train the news CNN on titles and print its test accuracy.

    The first 400 rows of DATA_SET_FILE are used for training and the rest
    for testing. The vocabulary size and the fitted vocabulary processor are
    saved to VARS_FILE and VOCAB_PROCESSOR_SAVE_FILE.

    Args:
        unused_argv: Ignored.
    """
    if REMOVE_PREVIOUS_MODEL:
        # Remove old model; tolerate a missing directory on the first run.
        if os.path.exists(MODEL_OUTPUT_DIR):
            shutil.rmtree(MODEL_OUTPUT_DIR)
        os.mkdir(MODEL_OUTPUT_DIR)

    # Prepare training and testing data.
    df = pd.read_csv(DATA_SET_FILE, header=None)
    train_df = df[0:400]
    test_df = df.drop(train_df.index)

    # x - news title (column 1), y - class (column 0).
    x_train = train_df[1]
    y_train = train_df[0]
    x_test = test_df[1]
    y_test = test_df[0]

    # NOTE: a disabled Naive Bayes experiment (CountVectorizer +
    # MultinomialNB, dated 07/11/2017) used to sit here as a bare string
    # literal. It never executed and was dropped; recover it from version
    # control if needed.

    # Process vocabulary: fit on the training titles, reuse for the test set.
    vocab_processor = learn.preprocessing.VocabularyProcessor(
        MAX_DOCUMENT_LENGTH)
    x_train = np.array(list(vocab_processor.fit_transform(x_train)))
    x_test = np.array(list(vocab_processor.transform(x_test)))

    n_words = len(vocab_processor.vocabulary_)
    print('Total words: %d' % n_words)

    # Save n_words and vocab_processor.
    # pickle writes bytes, so the file must be opened in binary mode.
    with open(VARS_FILE, 'wb') as f:
        pickle.dump(n_words, f)
    vocab_processor.save(VOCAB_PROCESSOR_SAVE_FILE)

    # Build model.
    classifier = learn.Estimator(
        model_fn=news_cnn_model.generate_cnn_model(N_CLASSES, n_words),
        model_dir=MODEL_OUTPUT_DIR)

    # Train.
    classifier.fit(x_train, y_train, steps=STEPS)

    # Evaluate model.
    y_predicted = [
        p['class'] for p in classifier.predict(x_test, as_iterable=True)
    ]
    score = metrics.accuracy_score(y_test, y_predicted)
    print('Accuracy: {0:f}'.format(score))
def main(unused_argv):
    """Train an SKCompat-wrapped news CNN on titles and score it on the test rows.

    The first 3300 rows of DATA_SET_FILE are used for training and the rest
    for testing. The vocabulary size and the fitted vocabulary processor are
    saved to VARS_FILE and VOCAB_PROCESSOR_SAVE_FILE.

    Args:
        unused_argv: Ignored.
    """
    if REMOVE_PREVIOUS_MODEL:
        # Remove old model; tolerate a missing directory on the first run.
        if os.path.exists(MODEL_OUTPUT_DIR):
            shutil.rmtree(MODEL_OUTPUT_DIR)
        os.mkdir(MODEL_OUTPUT_DIR)

    # Prepare training and testing data.
    df = pd.read_csv(DATA_SET_FILE, header=None)
    train_df = df[0:3300]
    test_df = df.drop(train_df.index)

    # x - news title (column 1), y - class (column 0).
    x_train = train_df[1]
    # Strip every character outside the 7-bit ASCII range.
    x_train = x_train.str.replace('[^\x00-\x7F]', '')
    # NOTE(review): x_test below is not cleaned the same way - confirm
    # whether that asymmetry is intended.

    # NOTE: a disabled tokenise / stop-word / Porter-stemming experiment on
    # column 2 used to sit here as a bare string literal. It never executed
    # and was dropped; recover it from version control if needed.

    y_train = np.array(train_df[0], dtype=int)
    x_test = test_df[1]
    y_test = np.array(test_df[0], dtype=int)

    # Process vocabulary: fit on the training titles, reuse for the test set.
    vocab_processor = learn.preprocessing.VocabularyProcessor(
        MAX_DOCUMENT_LENGTH)
    x_train = np.array(list(vocab_processor.fit_transform(x_train)))
    x_test = np.array(list(vocab_processor.transform(x_test)))

    n_words = len(vocab_processor.vocabulary_)
    print('Total words: %d' % n_words)

    # Save n_words and vocab_processor.
    # pickle writes bytes, so the file must be opened in binary mode.
    with open(VARS_FILE, 'wb') as f:
        pickle.dump(n_words, f)
    vocab_processor.save(VOCAB_PROCESSOR_SAVE_FILE)

    # Build model.
    classifier = estimator.SKCompat(
        estimator.Estimator(
            model_fn=news_cnn_model.generate_cnn_model(N_CLASSES, n_words),
            model_dir=MODEL_OUTPUT_DIR,
            config=learn.RunConfig(save_checkpoints_secs=10,
                                   save_summary_steps=10)))

    # Set up logging for predictions.
    tensors_to_log = {"prob": "softmax_tensor"}
    logging_hook = tf.train.LoggingTensorHook(tensors=tensors_to_log,
                                              every_n_iter=100)

    # Train.
    classifier.fit(x_train,
                   y_train,
                   batch_size=BATCH,
                   steps=STEPS,
                   monitors=[logging_hook])

    # Configure the accuracy metric.
    eval_metrics = {
        "accuracy": learn.MetricSpec(metric_fn=tf.metrics.accuracy,
                                     prediction_key="class")
    }

    # Evaluate the model. The result is kept in a local but not used further.
    eval_results = classifier.score(x=x_test, y=y_test, metrics=eval_metrics)
def main(unused_argv):
    """Normalise the news text, train the CNN on it and print test accuracy.

    Each row's columns 1, 2 and 3 are joined, stripped of punctuation,
    lower-cased, tokenised, stop-word filtered and stemmed; the result
    overwrites column 1. The first `training_index_end` rows are used for
    training and the rest for testing.

    Args:
        unused_argv: Ignored.
    """
    if REMOVE_PREVIOUS_MODEL:
        # Remove old model; tolerate a missing directory on the first run.
        if os.path.exists(MODEL_OUTPUT_DIR):
            shutil.rmtree(MODEL_OUTPUT_DIR)
        os.mkdir(MODEL_OUTPUT_DIR)

    # Prepare training and testing data.
    df = pd.read_csv(DATA_SET_FILE, header=None)
    # DataFrame.sample returns a new frame; keep it so the shuffle actually
    # happens, and renumber the rows so the label-based loop below still
    # visits 0..len(df)-1.
    df = df.sample(frac=1).reset_index(drop=True)

    stemmer = SnowballStemmer('english')
    # Load the stop-word list once; the original re-read it for every token.
    stop_words = set(stopwords.words('english'))

    for i in range(0, len(df)):
        print(i)  # progress indicator
        df_line = str(df.loc[i, 1]) + ' ' + str(df.loc[i, 2]) + ' ' + str(
            df.loc[i, 3])
        # NOTE: translate(None, ...) and str.decode below are Python 2 byte
        # string APIs; this block is Python 2 code.
        df_line = df_line.translate(None, string.punctuation)
        df_line = df_line.lower()
        df_filtered_tokens = [
            stemmer.stem(word.decode('utf-8'))
            for word in word_tokenize(df_line)
            if word not in stop_words
        ]
        df.loc[i, 1] = ' '.join(df_filtered_tokens)

    train_df = df[0:training_index_end]
    test_df = df.drop(train_df.index)

    # x - normalised text now stored in column 1, y - class (column 0).
    x_train = train_df[1]
    y_train = train_df[0]
    x_test = test_df[1]
    y_test = test_df[0]

    # Process vocabulary: fit on the training text, reuse for the test set.
    vocab_processor = learn.preprocessing.VocabularyProcessor(
        max_document_length, min_frequency, None, None)
    x_train = np.array(list(vocab_processor.fit_transform(x_train)))
    x_test = np.array(list(vocab_processor.transform(x_test)))

    n_words = len(vocab_processor.vocabulary_)
    print('Total words: %d' % n_words)

    # Save n_words and vocab_processor.
    # pickle writes bytes, so the file must be opened in binary mode.
    with open(VARS_FILE, 'wb') as f:
        pickle.dump(n_words, f)
    vocab_processor.save(VOCAB_PROCESSOR_SAVE_FILE)

    # Build model.
    classifier = learn.Estimator(
        model_fn=news_cnn_model.generate_cnn_model(
            number_classes, n_words, learning_rate),
        model_dir=MODEL_OUTPUT_DIR)

    # Train.
    classifier.fit(x_train, y_train, steps=steps)

    # Evaluate model.
    y_predicted = [
        p['class'] for p in classifier.predict(x_test, as_iterable=True)
    ]
    score = metrics.accuracy_score(y_test, y_predicted)
    print('Accuracy: {0:f}'.format(score))
def main(unused_argv):
    """Train the news CNN on stemmed descriptions and log its test accuracy.

    The first 400 rows of DATA_SET_FILE are used for training and the rest
    for testing. Descriptions are tokenised, stop-word filtered and stemmed
    before the vocabulary is built.

    Args:
        unused_argv: Ignored.
    """

    def normalize(token_lists):
        """Stem and re-join each token list, skipping stop words."""
        # NOTE(review): the stop-word test runs on the token before it is
        # lower-cased, so capitalised stop words pass through - confirm
        # whether that is intended before changing it.
        return [
            ' '.join(
                stemmer.stem(w.lower()) for w in tokens if w not in stop_words)
            for tokens in token_lists
        ]

    if REMOVE_PREVIOUS_MODEL:
        # Remove old model; tolerate a missing directory on the first run.
        if os.path.exists(MODEL_OUTPUT_DIR):
            shutil.rmtree(MODEL_OUTPUT_DIR)
        os.mkdir(MODEL_OUTPUT_DIR)

    # Prepare training and testing data.
    df = pd.read_csv(DATA_SET_FILE, header=None)
    train_df = df[0:400]
    test_df = df.drop(train_df.index)

    # x - news description (column 2), y - class (column 0).
    x_train = train_df[2]
    y_train = train_df[0]
    x_test = test_df[2]
    y_test = test_df[0]

    # Tokenize sentences.
    x_train = [word_tokenize(sentence) for sentence in x_train.tolist()]
    x_test = [word_tokenize(sentence) for sentence in x_test.tolist()]

    # Stem words; both splits go through the same normalisation.
    x_train = normalize(x_train)
    x_test = normalize(x_test)

    # Process vocabulary: fit on the training text, reuse for the test set.
    vocab_processor = learn.preprocessing.VocabularyProcessor(
        MAX_DOCUMENT_LENGTH)
    x_train = np.array(list(vocab_processor.fit_transform(x_train)))
    x_test = np.array(list(vocab_processor.transform(x_test)))

    n_words = len(vocab_processor.vocabulary_)
    LOGGER.debug('Total words: %d', n_words)

    # Save n_words and vocab_processor.
    with open(VARS_FILE, 'wb') as f:  # needs to be opened in binary mode
        pickle.dump(n_words, f)
    vocab_processor.save(VOCAB_PROCESSOR_SAVE_FILE)

    # Build model.
    classifier = learn.Estimator(
        model_fn=news_cnn_model.generate_cnn_model(N_CLASSES, n_words),
        model_dir=MODEL_OUTPUT_DIR)

    # Train.
    classifier.fit(x_train, y_train, steps=STEPS)

    # Evaluate model.
    y_predicted = [
        p['class'] for p in classifier.predict(x_test, as_iterable=True)
    ]
    score = metrics.accuracy_score(y_test, y_predicted)
    LOGGER.info('Accuracy: %f', score)
def main(unused_argv):
    """Normalise the news text, train the CNN on it and print test accuracy.

    For every row, columns 1, 2 and 3 are concatenated, stripped of
    punctuation, lower-cased, tokenised, stop-word filtered and stemmed, and
    the result replaces column 1. The first `training_index_end` rows are
    used for training and the rest for testing.

    Args:
        unused_argv: Ignored.
    """
    if REMOVE_PREVIOUS_MODEL:
        # Remove old model; tolerate a missing directory on the first run.
        if os.path.exists(MODEL_OUTPUT_DIR):
            shutil.rmtree(MODEL_OUTPUT_DIR)
        os.mkdir(MODEL_OUTPUT_DIR)

    # Prepare training and testing data.
    df = pd.read_csv(DATA_SET_FILE, header=None)
    # The shuffled frame must be assigned back (sample is not in-place). The
    # index is reset so df.loc[i, ...] below still addresses rows
    # 0..len(df)-1.
    df = df.sample(frac=1).reset_index(drop=True)

    stemmer = SnowballStemmer('english')
    # Hoisted out of the token loop: the stop-word list does not change
    # between tokens, and a set gives constant-time lookups.
    stop_words = set(stopwords.words('english'))

    for i in range(0, len(df)):
        print(i)  # progress indicator
        df_line = ' '.join(
            [str(df.loc[i, 1]), str(df.loc[i, 2]), str(df.loc[i, 3])])
        # NOTE: translate(None, ...) and str.decode below are Python 2 byte
        # string APIs; this block is Python 2 code.
        df_line = df_line.translate(None, string.punctuation)
        df_line = df_line.lower()
        df_filtered_tokens = [
            stemmer.stem(word.decode('utf-8'))
            for word in word_tokenize(df_line)
            if word not in stop_words
        ]
        df.loc[i, 1] = ' '.join(df_filtered_tokens)

    train_df = df[0:training_index_end]
    test_df = df.drop(train_df.index)

    # x - normalised text now stored in column 1, y - class (column 0).
    x_train = train_df[1]
    y_train = train_df[0]
    x_test = test_df[1]
    y_test = test_df[0]

    # Process vocabulary: fit on the training text, reuse for the test set.
    vocab_processor = learn.preprocessing.VocabularyProcessor(
        max_document_length, min_frequency, None, None)
    x_train = np.array(list(vocab_processor.fit_transform(x_train)))
    x_test = np.array(list(vocab_processor.transform(x_test)))

    n_words = len(vocab_processor.vocabulary_)
    print('Total words: %d' % n_words)

    # Save n_words and vocab_processor.
    # pickle writes bytes, so the file must be opened in binary mode.
    with open(VARS_FILE, 'wb') as f:
        pickle.dump(n_words, f)
    vocab_processor.save(VOCAB_PROCESSOR_SAVE_FILE)

    # Build model.
    classifier = learn.Estimator(
        model_fn=news_cnn_model.generate_cnn_model(
            number_classes, n_words, learning_rate),
        model_dir=MODEL_OUTPUT_DIR)

    # Train.
    classifier.fit(x_train, y_train, steps=steps)

    # Evaluate model.
    y_predicted = [
        p['class'] for p in classifier.predict(x_test, as_iterable=True)
    ]
    score = metrics.accuracy_score(y_test, y_predicted)
    print('Accuracy: {0:f}'.format(score))
def loopFunction(steps, docLength, iteration):
    """Train and score the news CNN once for a given step count and document length.

    The first 1100 rows of DATA_SET_FILE are used for training and the rest
    for testing (and for the validation monitor attached during training).

    Args:
        steps: Number of training steps passed to `classifier.fit`.
        docLength: Maximum document length given to the VocabularyProcessor.
        iteration: Not used by the active code; only the disabled CSV-logging
            snippet at the end refers to it.
    """
    if REMOVE_PREVIOUS_MODEL:
        if os.path.exists(MODEL_OUTPUT_DIR):
            # Remove old model.
            shutil.rmtree(MODEL_OUTPUT_DIR)
        os.mkdir(MODEL_OUTPUT_DIR)

    # Prepare training and testing data.
    df = pd.read_csv(DATA_SET_FILE, header=None)

    # x - news title (column 1), y - class (column 0).
    # Training data.
    train_df = df[0:1100]
    x_train = train_df[1]
    # y_train holds one label per entry in x_train.
    y_train = train_df[0]

    # Testing data.
    test_df = df.drop(train_df.index)
    x_test = test_df[1]
    y_test = test_df[0]

    # Process vocabulary.
    vocab_processor = learn.preprocessing.VocabularyProcessor(docLength)

    # fit_transform: the vocabulary is learned from x_train.
    # x_train becomes [entries in x_train, docLength].
    x_train = np.array(list(vocab_processor.fit_transform(x_train)))
    # transform: x_test is mapped with the vocabulary learned from x_train.
    # x_test becomes [entries in x_test, docLength].
    x_test = np.array(list(vocab_processor.transform(x_test)))

    # Number of entries in the vocabulary built from x_train.
    n_words = len(vocab_processor.vocabulary_)
    # print('Total words: %d' % n_words)

    # Save n_words and vocab_processor.
    # pickle writes bytes, so the file must be opened in binary mode.
    with open(VARS_FILE, 'wb') as f:
        pickle.dump(n_words, f)
    vocab_processor.save(VOCAB_PROCESSOR_SAVE_FILE)

    # Set up logging for predictions.
    # NOTE(review): logging_hook is built but never handed to fit() below -
    # confirm whether it should be added to `monitors`.
    tensors_to_log = {"opt": "softmax"}
    logging_hook = tf.train.LoggingTensorHook(tensors=tensors_to_log,
                                              every_n_iter=20)

    # Validation monitor: logs the metrics on the test rows during training.
    # https://www.tensorflow.org/get_started/monitors
    validation_metrics = {
        "accuracy":
            MetricSpec(metric_fn=tf.contrib.metrics.streaming_accuracy,
                       prediction_key=learn.PredictionKey.CLASSES),
        "precision":
            MetricSpec(metric_fn=tf.contrib.metrics.streaming_precision,
                       prediction_key=learn.PredictionKey.CLASSES)
    }
    validation_monitor = tf.contrib.learn.monitors.ValidationMonitor(
        x_test, y_test, every_n_steps=11, metrics=validation_metrics)

    # Build model; checkpoints are written every 10 steps.
    classifier = learn.SKCompat(
        learn.Estimator(
            model_fn=news_cnn_model.generate_cnn_model(N_CLASSES, n_words),
            model_dir=MODEL_OUTPUT_DIR,
            config=learn.RunConfig(save_checkpoints_secs=None,
                                   save_checkpoints_steps=10)))

    # Train.
    classifier.fit(x_train, y_train, steps=steps, monitors=[validation_monitor])

    # Evaluate model.
    prediction = classifier.predict(x_test)
    y_predicted = prediction['classes']
    score = metrics.accuracy_score(y_test, y_predicted)

    # with open('night_test.csv','ab') as f:
    #     writer=csv.writer(f, delimiter=',')
    #     writer.writerow([iteration, steps, docLength, score])
    print('Accuracy: {0:f}'.format(score))
def main(unused_argv): if REMOVE_PREVIOUS_MODEL: # Remove old model shutil.rmtree(MODEL_OUTPUT_DIR) os.mkdir(MODEL_OUTPUT_DIR) # Prepare training and testing data df = pd.read_csv(DATA_SET_FILE, header=None) # df = df.sample(frac=1).reset_index(drop=True) train_df = df[0:1800] test_df = df.drop(train_df.index) # x - news title + source, y - class x_train = train_df[1] + ' ' + train_df[3].astype(str).map(str.strip) y_train = train_df[0] x_test = test_df[1] + ' ' + test_df[3].astype(str).map(str.strip) y_test = list(test_df[0]) # Process vocabulary vocab_processor = learn.preprocessing.VocabularyProcessor( MAX_DOCUMENT_LENGTH) x_train = np.array(list(vocab_processor.fit_transform(x_train))) x_test = np.array(list(vocab_processor.transform(x_test))) n_words = len(vocab_processor.vocabulary_) print('Total words: %d' % n_words) # Saving n_words and vocab_processor for serving: with open(VARS_FILE, 'wb') as f: # needs to be opened in binary mode. pickle.dump(n_words, f) vocab_processor.save(VOCAB_PROCESSOR_SAVE_FILE) ''' Version 2 ''' # Build model model_fn = news_cnn_model.generate_cnn_model(N_CLASSES, n_words) classifier = tf.estimator.Estimator(model_fn=model_fn) # Train train_input_fn = tf.estimator.inputs.numpy_input_fn( x={WORDS_FEATURE: x_train}, y=y_train, batch_size=BATCH_SIZE, num_epochs=None, shuffle=True) classifier.train(input_fn=train_input_fn, steps=STEPS) # Predict predict_input_fn = tf.estimator.inputs.numpy_input_fn( x={WORDS_FEATURE: x_test}, num_epochs=1, shuffle=False) y_predicted = list(classifier.predict(input_fn=predict_input_fn)) predicted_classes = [p['class'] for p in y_predicted] # Evaluate w/o 'sports' for i in range(30): print(predicted_classes[i]) count = 0 sports = 0 for i in range(len(predicted_classes)): if (y_test[i] == 6): sports += 1 continue if (predicted_classes[i] == y_test[i]): count += 1 total = len(predicted_classes) - sports print('Augmented Accuracy: {0:f}'.format(float(count) / total)) # Evaluate. 
test_input_fn = tf.estimator.inputs.numpy_input_fn( x={WORDS_FEATURE: x_test}, y=np.array(y_test), num_epochs=1, shuffle=False) scores = classifier.evaluate(input_fn=test_input_fn) print('Overall Accuracy: {0:f}'.format(scores['accuracy'])) # Export export_dir = classifier.export_savedmodel(MODEL_OUTPUT_DIR, serving_input_receiver_fn) print('Model exported to %s' % export_dir) with open(EXPORT_DIR_FILE, 'wb') as f: # needs to be opened in binary mode. pickle.dump(export_dir, f) ''' Version 1 ''' '''