def classify(self):
    # Classifies unknown forum posts
    if not self.fit:
        print("Fitting must be performed before classifying")
        return
    vectorizer = Vectorizer(self.dictionary.dictionary)
    input_file = input(
        "Enter the name of the .txt file containing the unknown posts (including the file ending): "
    )
    try:
        with open(input_file, "r") as file:
            vectors = vectorizer.vectorize(self.preprocessor.preprocess(file))
    except FileNotFoundError:
        if input("File not found. Press enter to try again or type 'm' and "
                 "press enter to return to the menu.").lower() == "m":
            return
        self.classify()
        return
    with open("result.txt", "w") as result_file:
        for line in self.classifier.classify(vectors):
            result_file.write(label_list[line] + "\n")
    print(
        "Result saved in result.txt. "
        "The predicted label of each post is printed on the corresponding "
        "line of the document."
    )
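# classify() turns each numeric prediction into a name via label_list. A
# minimal sketch of that assumed module-level mapping (the real names and
# order are defined elsewhere in the repo and must match the classifier):
label_list = ["category_0", "category_1", "category_2"]  # hypothetical labels

def predictions_to_labels(predictions):
    # One label per prediction, in the same order as the input posts.
    return [label_list[p] for p in predictions]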
def preprocess_and_fit(self):
    # Preprocesses the data, indexes all words, vectorizes the posts and
    # finally trains and evaluates the classifier
    processed = []
    processed_test = []
    for category in self.categories:
        processed.append(
            self.preprocessor.preprocess('training' + str(category) + ".txt"))
        processed_test.append(
            self.preprocessor.preprocess('testing' + str(category) + ".txt"))

    # Word indexing
    for category in processed:  # indexes all words into the dictionary
        self.dictionary.index_words(category)
    print("Words indexed. Dictionary size:",
          len(self.dictionary.dictionary), "words")

    # Vectorization
    vectorizer = Vectorizer(
        self.dictionary.dictionary)  # initializes the vectorizer with the dictionary
    vector_start = time.time()
    print("Vectorizing...")
    training_vectors = []
    testing_vectors = []
    for category in processed:
        training_vectors.append(vectorizer.vectorize(category))
    for category in processed_test:
        testing_vectors.append(vectorizer.vectorize(category))
    vector_time = time.time() - vector_start
    print("Vectorization completed in", "%.2f" % vector_time, "seconds")

    # Training and evaluation
    self.classifier.train(training_vectors)
    self.fit = True
    self.classifier.evaluate(testing_vectors)
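# A rough sketch (an assumption, not the repo's Vectorizer) of the kind of
# bag-of-words vectorization preprocess_and_fit() appears to rely on: the
# dictionary maps each indexed word to a position, and every post becomes a
# count vector over that vocabulary.
def bag_of_words_sketch(post_tokens, dictionary):
    vector = [0] * len(dictionary)
    for token in post_tokens:
        index = dictionary.get(token)
        if index is not None:
            vector[index] += 1
    return vector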
for article, score in relevance_sorted_articles:
    print("Id: ", article.id_num)
    print("Link: ", article.link)
    print("Description: ", article.description)
    print(article.title, ":", score)
    print('\n\n\n')
    sorted_article_list.append(article)
return sorted_article_list


# ==================FINDING TRENDING ARTICLES=================
trending_articles = findTrending(PICKELED_RECENT_ARTICLES_ALL_TOPICS)

vectorizer = Vectorizer()
vectorized_trending_articles = vectorizer.vectorize(trending_articles)
setArticleVectors(trending_articles, vectorized_trending_articles)

# for article in trending_articles:
#     print(article.description, article.vector)

dimensions = str(len(vectorized_trending_articles)) + " x " + str(len(vectorized_trending_articles[0]))
print("Term document matrix with " + dimensions + ":\n", vectorized_trending_articles)

# ==================KMEANS STARTS HERE=======================
# print("Calculating kmeans...")
# kmeans_calculator = KMeansClusterer()
# predicted_clustering = kmeans_calculator.calculateKMeans(vectorized_trending_articles, 30)
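# setArticleVectors() is defined elsewhere; from the call above and the
# commented-out loop that reads article.vector, it is assumed to attach each
# row of the term-document matrix to its article, roughly like this sketch:
def set_article_vectors_sketch(articles, vectors):
    for article, vector in zip(articles, vectors):
        article.vector = vector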
x, y, c = [], [], []
for token in tokens:
    if token in dictioanry.valences:
        x.append(np.random.rand())
        y.append(np.random.rand())
        c.append(rgb(np.mean(dictioanry.valences[token])))

plt.scatter(x, y, c=c, alpha=0.8)
plt.show()


if __name__ == "__main__":
    preprocessor = Preprocessor(['train.tsv', 'test.tsv'], Cruncher())
    dictionary = Dictioanry('..\\..\\lexica')
    vectorizer = Vectorizer()
    labels, vectors = vectorizer.vectorize(preprocessor, dictionary)
    visualizer = Visualizer(preprocessor)
    for method in Visualizer.supported_methods:
        visualizer.visualize(method=method, dictionary=dictionary,
                             model=vectorizer.underlying)
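# rgb() above is defined elsewhere and is assumed to map a mean valence score
# to a matplotlib colour. A hypothetical sketch, assuming valences are
# normalised to [0, 1], that fades from red (low) to green (high):
def rgb_sketch(valence):
    valence = min(max(valence, 0.0), 1.0)
    return (1.0 - valence, valence, 0.0)  # (red, green, blue) tuple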
def evaluation(filenames, dictionary_root='../../lexica',
               cruncher_type='lemmatizer', vectorizer_type='word2vec',
               metrics=['f1-score', 'accuracy-score']):
    if not isinstance(filenames, list):
        raise ValueError("'" + str(filenames) + "' is not an instance of 'list'")

    beg = time.time()

    vectorizer = Vectorizer(vectorizer_type)

    try:
        labels, vectors = vectorizer.vectorize(filenames, dictionary_root)
    except Exception:
        preprocessor = Preprocessor(filenames, Cruncher(cruncher_type))
        dictionary = Dictioanry(dictionary_root) if dictionary_root else None
        labels, vectors = vectorizer.vectorize(preprocessor, dictionary)

    test_ids, test_labels, test_vectors = [], [], []
    train_ids, train_labels, train_vectors = [], [], []

    for id, label in labels.items():
        if label == 'unknown':
            test_ids.append(id)
            test_labels.append(label)
            test_vectors.append(vectors[id])
        else:
            train_ids.append(id)
            train_labels.append(label)
            train_vectors.append(vectors[id])

    evaluator = Evaluator()

    for classifing in ['knn', 'rrb', 'svm']:
        if classifing != 'rrb':
            classifier = Classifier(train_vectors, train_labels, classifing)
            predictions = classifier.predict(test_vectors)
        else:
            classifier = RoundRobin(train_labels, train_vectors, test_vectors)
            predictions = classifier.classify()

        for metric in metrics:
            value = evaluator.evaluate(dict(zip(test_ids, predictions)), metric)
            print('<LOG>: The performance of', "'" + classifing + "'",
                  'according to the',
                  ("'" + metric + "'").ljust(max(map(len, metrics)) + 2),
                  'metric is', '{0:.6f}'.format(value))

    end = time.time()

    print('\n\nElapsed time:', '{0:.6f}'.format(end - beg), 'seconds',
          file=sys.stderr)
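# A minimal usage sketch of evaluation(). The .tsv filenames mirror the ones
# used in the visualizer's __main__ block above and are assumptions here; the
# defaults are kept for the cruncher, vectorizer and metrics.
if __name__ == '__main__':
    evaluation(['train.tsv', 'test.tsv'])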
class MetaModel:
    """
    We wrap the keras model in our own metaclass that handles text loading,
    and provides convenient train and sample functions.
    """

    def __init__(self):
        self.train_model = None
        self.sample_model = None
        self.seeds = None
        self.vectorizer = None

    # Read in our data and validation texts
    def _load_data(self, data_dir, word_tokens, pristine_input,
                   pristine_output, batch_size, seq_length, seq_step):
        try:
            with open(os.path.join(data_dir, 'input.txt')) as input_file:
                text = input_file.read()
        except FileNotFoundError:
            print_red("No input.txt in data_dir")
            sys.exit(1)

        skip_validate = True
        try:
            with open(os.path.join(data_dir, 'validate.txt')) as validate_file:
                text_val = validate_file.read()
            skip_validate = False
        except FileNotFoundError:
            pass  # Validation text is optional

        # Find some good default seed strings in our source text.
        self.seeds = find_random_seeds(text)
        # Include our validation texts with our vectorizer
        all_text = text if skip_validate else '\n'.join([text, text_val])

        self.vectorizer = Vectorizer(all_text, word_tokens,
                                     pristine_input, pristine_output)

        data = self.vectorizer.vectorize(text)
        x, y = shape_for_stateful_rnn(data, batch_size, seq_length, seq_step)
        print('x.shape:', x.shape)
        print('y.shape:', y.shape)

        if skip_validate:
            return x, y, None, None

        data_val = self.vectorizer.vectorize(text_val)
        x_val, y_val = shape_for_stateful_rnn(data_val, batch_size,
                                              seq_length, seq_step)
        print('x_val.shape:', x_val.shape)
        print('y_val.shape:', y_val.shape)
        return x, y, x_val, y_val

    # Builds the underlying keras model
    def _build_models(self, batch_size, embedding_size, rnn_size, num_layers):
        model = Sequential()
        model.add(Embedding(self.vectorizer.vocab_size, embedding_size,
                            batch_input_shape=(batch_size, None)))
        for _ in range(num_layers):
            model.add(LSTM(rnn_size, stateful=True, return_sequences=True))
            model.add(Dropout(0.2))
        model.add(TimeDistributed(
            Dense(self.vectorizer.vocab_size, activation='softmax')))

        # With sparse_categorical_crossentropy we can leave the labels as
        # integers instead of one-hot vectors
        model.compile(loss='sparse_categorical_crossentropy',
                      optimizer='rmsprop', metrics=['accuracy'])
        model.summary()

        # Keep a separate model with batch_size 1 for sampling
        self.train_model = model
        config = model.get_config()
        config['layers'][0]['config']['batch_input_shape'] = (1, None)
        self.sample_model = Sequential.from_config(config)
        self.sample_model.trainable = False

    def update_sample_model_weights(self):
        """Sync training and sampling model weights"""
        self.sample_model.set_weights(self.train_model.get_weights())

    def train(self, data_dir, word_tokens, pristine_input, pristine_output,
              batch_size, seq_length, seq_step, embedding_size, rnn_size,
              num_layers, num_epochs, live_sample):
        """Train the model"""
        print_green('Loading data...')
        load_start = time.time()
        x, y, x_val, y_val = self._load_data(data_dir, word_tokens,
                                             pristine_input, pristine_output,
                                             batch_size, seq_length, seq_step)
        load_end = time.time()
        print_red('Data load time', load_end - load_start)

        print_green('Building model...')
        model_start = time.time()
        self._build_models(batch_size, embedding_size, rnn_size, num_layers)
        model_end = time.time()
        print_red('Model build time', model_end - model_start)

        print_green('Training...')
        train_start = time.time()
        validation_data = (x_val, y_val) if (x_val is not None) else None
        callbacks = [LiveSamplerCallback(self)] if live_sample else None
        self.train_model.fit(x, y,
                             validation_data=validation_data,
                             batch_size=batch_size,
                             shuffle=False,
                             epochs=num_epochs,
                             verbose=1,
                             callbacks=callbacks)
        self.update_sample_model_weights()
        train_end = time.time()
        print_red('Training time', train_end - train_start)

    def sample(self, seed=None, length=None, diversity=1.0):
        """Sample the model"""
        self.sample_model.reset_states()

        if length is None:
            length = 100 if self.vectorizer.word_tokens else 500

        if seed is None:
            seed = random.choice(self.seeds)
        print('Using seed: ', end='')
        print_cyan(seed)
        print('-' * 50)

        preds = None
        seed_vector = self.vectorizer.vectorize(seed)
        # Feed in the seed string
        print_cyan(seed, end=' ' if self.vectorizer.word_tokens else '')
        for char_index in np.nditer(seed_vector):
            preds = self.sample_model.predict(np.array([[char_index]]),
                                              verbose=0)

        sampled_indices = np.array([], dtype=np.int32)
        # Sample the model one token at a time
        for i in range(length):
            char_index = 0
            if preds is not None:
                char_index = sample_preds(preds[0][0], diversity)
            sampled_indices = np.append(sampled_indices, char_index)
            preds = self.sample_model.predict(np.array([[char_index]]),
                                              verbose=0)
        sample = self.vectorizer.unvectorize(sampled_indices)
        print(sample)
        return sample

    # Don't pickle the keras models, better to save those directly
    def __getstate__(self):
        state = self.__dict__.copy()
        del state['train_model']
        del state['sample_model']
        return state
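# sample_preds() is defined elsewhere in the repo; sample() above calls it
# with a softmax distribution and a diversity (temperature) value. One common
# temperature-sampling implementation, shown here only as an assumed sketch
# of what that helper might do:
def sample_preds_sketch(preds, diversity=1.0):
    preds = np.asarray(preds).astype('float64')
    preds = np.log(np.maximum(preds, 1e-10)) / diversity  # rescale by temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)                 # renormalise to a distribution
    return int(np.argmax(np.random.multinomial(1, preds, 1)))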
def main(args):
    print('Settings:')
    print(str(args)[10:-1])

    length = 136

    print('Loading data...')
    if args.no_recalc:
        # Makes use of both the raw and the preprocessed source codes.
        train_x, _, dev_x, _, _, test_x, _, _ = load_data('../data_dir')
        train_x2, _, dev_x2, _, _, test_x2, _, _ = load_data('../data_dir',
                                                             preprocess=True)

        print("Extracting stylometric features...")
        # Runs the stylometry vectorizer from vectorizer.py so that character
        # features can be grabbed simultaneously.
        vec = Vectorizer('lexical')
        train_x = vec.vectorize(train_x, train_x2)  # Vectorize all 3 subsets
        dev_x = vec.vectorize(dev_x, dev_x2)
        test_x = vec.vectorize(test_x, test_x2)
        del train_x2, dev_x2, test_x2

        print("Rescaling...")
        scaler = MinMaxScaler()  # Rescale values between 0 and 1.
        train_x = scaler.fit_transform(train_x)
        dev_x = scaler.transform(dev_x)
        test_x = scaler.transform(test_x)

        length = len(train_x[0])
        print(length)

        trainmm = np.memmap('vectors/train.mm', dtype='float32', mode='w+',
                            shape=(50000, length))
        trainmm[:] = train_x[:]
        devmm = np.memmap('vectors/dev.mm', dtype='float32', mode='w+',
                          shape=(25000, length))
        devmm[:] = dev_x[:]
        testmm = np.memmap('vectors/test.mm', dtype='float32', mode='w+',
                           shape=(25000, length))
        testmm[:] = test_x[:]
        # Save and flush all vectors.
        del trainmm, devmm, testmm, train_x, dev_x, test_x
        print("Finished building vectors.")

    # Load data from file.
    train_y, dev_y, _ = load_all_labels('../data_dir')
    dev = np.array(np.memmap('vectors/dev.mm', dtype='float32', mode='r',
                             shape=(25000, length)))
    test = np.array(np.memmap('vectors/test.mm', dtype='float32', mode='r',
                              shape=(25000, length)))
    train = np.array(np.memmap('vectors/train.mm', dtype='float32', mode='r',
                               shape=(50000, length)))

    # Model.
    callback_list = [
        EarlyStopping(monitor='val_acc', patience=10),
        ModelCheckpoint(filepath='style_model.h5', monitor='val_acc',
                        save_best_only=True),
        ReduceLROnPlateau(monitor='val_acc', factor=0.1, patience=5)
    ]

    model = Sequential()
    model.add(Dense(500, activation='relu', input_shape=(136,)))
    model.add(Dropout(0.3))
    model.add(Dense(500, activation='relu'))
    model.add(Dropout(0.3))
    model.add(Dense(1000, activation='softmax'))

    opt = RMSprop(learning_rate=0.001)
    model.compile(optimizer=opt, loss='sparse_categorical_crossentropy',
                  metrics=['acc'])
    model.summary()
    model.fit(train, train_y,
              epochs=1000,
              batch_size=250,
              validation_data=(dev, dev_y),
              shuffle=True,
              callbacks=callback_list)

    model = load_model('style_model.h5')
    print(model.evaluate(dev, dev_y))

    # Generate predictions.
    predict_vec = np.memmap('vectors/dev_style.mm', dtype='float32', mode='w+',
                            shape=(25000, 1000))
    predict_vec[:] = model.predict(dev)[:]
    del predict_vec
    predict_vec2 = np.memmap('vectors/test_style.mm', dtype='float32', mode='w+',
                             shape=(25000, 1000))
    predict_vec2[:] = model.predict(test)[:]
    del predict_vec2
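# The prediction memmaps written above are presumably consumed by a later
# ensembling or analysis step. A hypothetical sketch of reading them back,
# reusing the shapes hard-coded in main():
def load_style_predictions_sketch():
    dev_style = np.memmap('vectors/dev_style.mm', dtype='float32', mode='r',
                          shape=(25000, 1000))
    test_style = np.memmap('vectors/test_style.mm', dtype='float32', mode='r',
                           shape=(25000, 1000))
    return dev_style, test_style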
X_train, X_test, y_train, y_test = train_test_split(df.values[:, 0],
                                                    df.values[:, 1].astype('int'),
                                                    test_size=0.2,
                                                    random_state=0)

# Task 2
v = Vectorizer(docs=X_train)

# Task 3-4
result = Parallel(n_jobs=4)(
    delayed(gs.grid_search)(X=X_train, y=y_train, f_num=f_num, v=v)
    for f_num in range(1, 6))
result = sorted(result, key=itemgetter(0), reverse=True)
best_c = result[0][1]
best_penalty = result[0][2]
best_f_num = result[0][3]

# Task 5
X_train_vct = v.vectorize(docs=X_train, f_num=best_f_num)
X_test_vct = v.vectorize(docs=X_test, f_num=best_f_num)
best_lr = LogisticRegression(C=best_c, penalty=best_penalty, solver='liblinear')
best_lr.fit(X_train_vct, y_train)
print('The accuracy is %.3f percent' %
      (accuracy_score(y_test, best_lr.predict(X_test_vct)) * 100))
print(best_lr)
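# gs.grid_search() lives in another module; from how its results are unpacked
# above it is assumed to return a (score, C, penalty, f_num) tuple for one
# feature count. A hypothetical sketch of that contract, using cross-validated
# logistic regression over a small C/penalty grid:
from sklearn.model_selection import cross_val_score

def grid_search_sketch(X, y, f_num, v):
    vectors = v.vectorize(docs=X, f_num=f_num)
    best = (0.0, 1.0, 'l2', f_num)  # (cv score, C, penalty, f_num)
    for c in (0.01, 0.1, 1.0, 10.0):
        for penalty in ('l1', 'l2'):
            lr = LogisticRegression(C=c, penalty=penalty, solver='liblinear')
            score = cross_val_score(lr, vectors, y, cv=5).mean()
            if score > best[0]:
                best = (score, c, penalty, f_num)
    return best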