def main(C, K, a, b, binary):
    # load data
    train_iter, val_iter, test_iter, text_field, label_field = utils.load_SST()

    # initialize classifier
    alpha = a * np.ones(C)
    beta = b * np.ones(K)
    n_features = len(text_field.vocab)
    nb = NaiveBayesClassifier(alpha, beta, n_features)

    print("Training model...")
    for i, batch in enumerate(train_iter):
        X = utils.bag_of_words(batch, text_field).data.numpy()
        if binary:
            X = X > 0
        y = batch.label.data.numpy() - 1
        nb.fit(X, y)

    print("Testing model...")
    n, n_corr = 0, 0
    upload = []
    for i, batch in enumerate(val_iter):
        X = utils.bag_of_words(batch, text_field).data.numpy()
        if binary:
            X = X > 0
        y_pred = nb.predict(X)
        y = batch.label.data.numpy() - 1
        n += len(y)
        n_corr += sum(y_pred == y)
        upload += list(y_pred + 1)

    # write predictions to file
    print('Writing predictions to file...')
    with open("predictions.txt", "w") as f:
        f.write('Id,Cat\n')
        for i, u in enumerate(upload):
            f.write('{},{}\n'.format(i, u))

    return n_corr / n
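# The snippet above calls utils.bag_of_words(batch, text_field) and treats the
# result as a (batch_size, vocab_size) count matrix, but the helper itself is
# not shown. Below is a hedged sketch of what it might look like; the
# batch.text layout of (seq_len, batch_size) token indices is an assumption.
import torch

def bag_of_words(batch, text_field):
    vocab_size = len(text_field.vocab)
    tokens = batch.text  # assumed shape: (seq_len, batch_size) of token indices
    counts = torch.zeros(tokens.size(1), vocab_size)
    for j in range(tokens.size(1)):
        for idx in tokens[:, j]:
            counts[j, int(idx)] += 1  # accumulate word counts per example
    return counts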
def load_samples(sample_list, tag, stemmer, max_words):
    data_set = []
    for (filename, category) in sample_list:
        # extract article words
        words = nltk.tokenize.wordpunct_tokenize(data.Article(filename).text)
        all_words = nltk.FreqDist(words)
        tokens = all_words.keys()
        if len(tokens) > max_words:
            # limit to max most frequent words per article
            tokens = tokens[:max_words]
        data_set.append((utils.bag_of_words(tokens, words, stemmer, True), tag))
    random.shuffle(data_set)
    return data_set
def load_samples(sample_list, stemmer, max_words):
    data_set = []
    for (filename, category) in sample_list:
        # extract article words
        words = nltk.tokenize.wordpunct_tokenize(data.Article(filename).text)
        all_words = nltk.FreqDist(words)
        tokens = all_words.keys()
        if len(tokens) > max_words:
            # limit to max most frequent words per article
            tokens = tokens[:max_words]
        data_set.append((utils.bag_of_words(tokens, words, stemmer), category))
    random.shuffle(data_set)
    return data_set
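# Both load_samples variants above pass utils.bag_of_words(tokens, words, stemmer)
# straight into NLTK classifiers (see the run() pipeline further below), which
# consume feature dicts. This is only a hedged sketch of such a helper; the
# roles of the `words` argument and of the boolean flag seen in the first
# variant are assumptions and they are left unused here.
def bag_of_words(tokens, words, stemmer, binary=True):
    # map each lemmatized token to True; NLTK classifiers accept such dicts
    # (the caller below builds stemmer as nltk.stem.WordNetLemmatizer())
    return {stemmer.lemmatize(token.lower()): True for token in tokens}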
def run():
    # theta is assumed to be shared with run_epoch() at module level
    global theta
    theta = np.zeros([action_dim, state_dim])
    single_run_epoch_rewards_test = []
    pbar = tqdm(range(NUM_EPOCHS), ncols=80)
    for _ in pbar:
        single_run_epoch_rewards_test.append(run_epoch())
        pbar.set_description(
            "Avg reward: {:0.6f} | Ewma reward: {:0.6f}".format(
                np.mean(single_run_epoch_rewards_test),
                utils.ewma(single_run_epoch_rewards_test)))
    return single_run_epoch_rewards_test


if __name__ == '__main__':
    state_texts = utils.load_data('game.tsv')
    dictionary = utils.bag_of_words(state_texts)
    state_dim = len(dictionary)
    action_dim = NUM_ACTIONS * NUM_OBJECTS

    # set up the game
    framework.load_game_data()

    epoch_rewards_test = []  # shape NUM_RUNS * NUM_EPOCHS
    for _ in range(NUM_RUNS):
        epoch_rewards_test.append(run())

    epoch_rewards_test = np.array(epoch_rewards_test)
    x = np.arange(NUM_EPOCHS)
    fig, axis = plt.subplots()
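# utils.ewma() above reduces the reward history to a single smoothed value for
# the progress bar. A minimal sketch, assuming a standard exponentially
# weighted moving average; the smoothing factor alpha is an assumption.
def ewma(values, alpha=0.1):
    avg = values[0]
    for v in values[1:]:
        avg = alpha * v + (1 - alpha) * avg
    return avg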
model_state = data["model_state"]

model = NeuralNet(input_size, hidden_size, output_size).to(device)
model.load_state_dict(model_state)
model.eval()

bot_name = "Sam"
print("Let's chat! (type 'quit' to exit)")
while True:
    # sentence = "do you use credit cards?"
    sentence = input("You: ")
    if sentence == "quit":
        break

    sentence = tokenize(sentence)
    X = bag_of_words(sentence, all_words)
    X = X.reshape(1, X.shape[0])
    X = torch.from_numpy(X).to(device)

    output = model(X)
    _, predicted = torch.max(output, dim=1)
    tag = tags[predicted.item()]

    probs = torch.softmax(output, dim=1)
    prob = probs[0][predicted.item()]
    if prob.item() > 0.75:
        for intent in intents['intents']:
            if tag == intent["tag"]:
                print(f"{bot_name}: {random.choice(intent['responses'])}")
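# The chat loop above loads weights into NeuralNet(input_size, hidden_size,
# output_size) but does not show the class. A minimal sketch, assuming a small
# feed-forward classifier with two hidden ReLU layers that returns raw logits
# (the caller applies torch.softmax itself).
import torch.nn as nn

class NeuralNet(nn.Module):
    def __init__(self, input_size, hidden_size, num_classes):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(input_size, hidden_size), nn.ReLU(),
            nn.Linear(hidden_size, hidden_size), nn.ReLU(),
            nn.Linear(hidden_size, num_classes),
        )

    def forward(self, x):
        return self.net(x)  # logits; no softmax here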
def update_graph(interval):
    # query tweets from the database
    df = get_tweet_data()

    # get the number of tweets for each keyword
    cnt = bag_of_words(df['text'])

    # get top-N words
    top_N = cnt.most_common(num_tags_scatter)
    top_N_words = [keyword for keyword, cnt in top_N]

    # preprocess the text column
    df['text'] = df.text.apply(preprocess_nltk)

    sentiments = {keyword: [] for keyword in top_N_words}
    for row in df['text']:
        # print(row)
        for keyword in top_N_words:
            # print(keyword)
            if keyword.lower() in row.lower():
                # print(sid.polarity_scores(row)['compound'])
                sentiments[keyword].append(
                    sid.polarity_scores(row)['compound'])

    avg_sentiments = {}
    for keyword, score_list in sentiments.items():
        avg_sentiments[keyword] = [np.mean(score_list), np.std(score_list)]

    # get the current time for x-axis
    time = datetime.datetime.now().strftime('%D, %H:%M:%S')
    X_universal.append(time)

    to_pop = []
    for keyword, score_queue in sentiment_dict.items():
        if score_queue:
            while score_queue and (score_queue[0][1] <= X_universal[0]):
                score_queue.popleft()
        else:
            to_pop.append(keyword)

    for keyword in to_pop:
        sentiment_dict.pop(keyword)

    for keyword, score in avg_sentiments.items():
        if keyword not in sentiment_dict:
            sentiment_dict[keyword] = deque(maxlen=30)
            sentiment_dict[keyword].append([score, time])
        else:
            sentiment_dict[keyword].append([score, time])

    new_colors = chart_colors[:len(sentiment_dict)]

    # plot the scatter plot
    data = [
        go.Scatter(
            x=[time for score, time in score_queue],
            y=[score[0] for score, time in score_queue],
            error_y={
                "type": "data",
                "array": [score[1] / 30 for score, time in score_queue],
                "thickness": 1.5,
                "width": 1,
                "color": "#000",
            },
            name=keyword,
            mode='markers',
            opacity=0.7,
            marker=dict(color=color))
        for color, (keyword, score_queue) in list(zip(new_colors, sentiment_dict.items()))
    ]

    # specify the layout
    layout = go.Layout(
        xaxis={
            'automargin': False,
            'range': [min(X_universal), max(X_universal)],
            'title': 'Current Time (GMT)',
            'nticks': 2,
        },
        yaxis={
            'autorange': True,
            'title': 'Sentiment Score'
        },
        height=400,
        plot_bgcolor=app_color["graph_bg"],
        paper_bgcolor=app_color["graph_bg"],
        font={"color": app_color["graph_font"]},
        autosize=False,
        legend={
            'orientation': 'v',
            # 'xanchor': 'right',
            # 'yanchor': 'middle',
            # 'x': 0.5,
            # 'y': 1.025
        },
        margin=go.layout.Margin(l=75, r=25, b=70, t=25, pad=4),
    )

    return go.Figure(data=data, layout=layout)
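# In the dashboard callback above, bag_of_words(df['text']) must return
# something with a most_common() method, i.e. a word-frequency Counter over
# the tweet texts. A hedged sketch; the tokenization is an assumption.
from collections import Counter

def bag_of_words(text_series):
    counter = Counter()
    for text in text_series:
        counter.update(text.lower().split())  # naive whitespace tokenization
    return counter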
if cc != "": cc = int(cc) else: cc = 0 count = open('count.txt', 'r+') report = open("report.txt", "a") agenda = open("agenda.txt", "a") if (sentence == "quit" or sentence == "bye"): print("Printer-Bot: Au revoir :)") break date = datetime.datetime.now() sent = tokenize(sentence) X = bag_of_words(sent, all_words) X = X.reshape(1, X.shape[0]) X = torch.from_numpy(X) output = model(X) _, predicted = torch.max(output, dim=1) tag = tags[predicted.item()] probs = torch.softmax(output, dim=1) prob = probs[0][predicted.item()] if prob.item() >= 0.7: name_of_doc = "" time = nbr_pages = 0 for intent in intents['intents']: if tag == intent["tag"]: if tag == 'print': for word in sent:
# ignore some symbols
ignore_sym = ['?', '.', '!', ',', "'", '-']
all_words = [stem(wrd) for wrd in all_words if wrd not in ignore_sym]

# remove duplicates and sort
all_words = sorted(set(all_words))
tags = sorted(set(tags))
# print(len(x_y), "patterns")
# print(len(tags), "tags:", tags)
# print(len(all_words), "unique words:", all_words)

# Creating data-set
X_train = []
Y_train = []
for (ptrn_sent, tag) in x_y:
    bag = bag_of_words(ptrn_sent, all_words)
    X_train.append(bag)
    label = tags.index(tag)
    Y_train.append(label)

X_train = np.array(X_train)
Y_train = np.array(Y_train)

# Parameters
input_size = len(X_train[0])
output_size = len(tags)
batch_size = 8
hidden_size = 8
learning_rate = 0.001
epochs = 1000
lam_vals = [0, 0.001, 0.01, 0.1, 1]
num_epochs = 15

for lam in lam_vals:
    model = build_model(V, num_labels)
    optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
    size_val_data = 0.0
    val_num_correct = 0.0
    for epoch in range(num_epochs):
        size_training_data = 0.0
        train_num_correct = 0.0
        loss = 0.0
        for batch in train_iter:
            x = Variable(bag_of_words(batch, text_field))
            y = batch.label - 1  # batch.label is 1/2, while we want 0/1
            batch_loss = train(model, lam, x, y, optimizer)
            loss += batch_loss
            batch_num_correct = np.sum(
                np.argmax(torch.exp(model.forward(x)).data.cpu().numpy(), axis=1)
                == y.data.cpu().numpy())
            train_num_correct += batch_num_correct
            size_training_data += len(y)
        print('Epoch ' + str(epoch + 1))
        print('Lambda = ' + str(lam))
        print('Epoch train accuracy: ' + str(train_num_correct / size_training_data))
        print('Epoch train loss: ' + str(loss))
        print()
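# The regularization sweep above calls train(model, lam, x, y, optimizer) and
# later exponentiates model.forward(x), so the model presumably returns
# log-probabilities. A hedged sketch of one training step, assuming negative
# log-likelihood plus an L2 penalty weighted by lam; none of this is taken
# from the original code.
import torch

def train(model, lam, x, y, optimizer):
    optimizer.zero_grad()
    log_probs = model(x)
    nll = torch.nn.functional.nll_loss(log_probs, y)
    l2 = sum((p ** 2).sum() for p in model.parameters())
    loss = nll + lam * l2
    loss.backward()
    optimizer.step()
    return loss.item()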
ignore_words = ['?', '.', '!']
all_words = [stem(w) for w in all_words if w not in ignore_words]

# remove duplicates and sort
all_words = sorted(set(all_words))
tags = sorted(set(tags))

print(len(xy), "patterns")
print(len(tags), "tags:", tags)
print(len(all_words), "unique stemmed words:", all_words)

# create training data
X_train = []
y_train = []
for (pattern_sentence, tag) in xy:
    # X: bag of words for each pattern_sentence
    bag = bag_of_words(pattern_sentence, all_words)
    X_train.append(bag)
    # y: PyTorch CrossEntropyLoss needs only class labels, not one-hot
    label = tags.index(tag)
    y_train.append(label)

X_train = np.array(X_train)
y_train = np.array(y_train)

# Hyper-parameters
num_epochs = 1000
batch_size = 8
learning_rate = 0.001
input_size = len(X_train[0])
hidden_size = 8
output_size = len(tags)
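# The chatbot preprocessing snippets (this one and the similar ones in this
# collection) rely on tokenize(), stem() and bag_of_words() helpers that are
# not shown. A minimal sketch, assuming the usual NLTK + NumPy implementation;
# names and details are assumptions.
import numpy as np
import nltk
from nltk.stem.porter import PorterStemmer

_stemmer = PorterStemmer()

def tokenize(sentence):
    return nltk.word_tokenize(sentence)

def stem(word):
    return _stemmer.stem(word.lower())

def bag_of_words(tokenized_sentence, all_words):
    # 1.0 at every vocabulary position present in the sentence, else 0.0
    sentence_words = [stem(w) for w in tokenized_sentence]
    bag = np.zeros(len(all_words), dtype=np.float32)
    for idx, w in enumerate(all_words):
        if w in sentence_words:
            bag[idx] = 1.0
    return bag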
def run(path):
    global fp

    # load article text
    article = data.Article(path)
    utils.load_data(article.text)
    fp = file("results.txt", "w")

    # show article text
    print_to_screen_and_file("-" * 80)
    print_to_screen_and_file("Original article:\n")
    print_to_screen_and_file(article.text)
    print_to_screen_and_file("-" * 80)

    print_to_screen_and_file("Categories:\n")
    top5 = pickle.load(open(config.TOP5_CATEGORIES, "r"))  # list of: [catname, count, tag]
    print_to_screen_and_file("In article: " + str(article.cats))
    print_to_screen_and_file("Top5: " + str(top5))
    ground_truth = [tag for cat, count, tag in top5 if cat in article.cats]
    print_to_screen_and_file("Present from Top5: " + str(ground_truth))
    print_to_screen_and_file("-" * 80)

    # make the summary & show in console
    print_to_screen_and_file("I Summary:\n")
    instance = SimpleSummarizer()
    # shorten the original article by one third
    print_to_screen_and_file(instance.summarize(article.text, len(utils.sentences) / 3))
    print_to_screen_and_file("-" * 80)

    print_to_screen_and_file("II Summary:\n")
    print_to_screen_and_file(" ".join(ph_reduction.PhraseReductor().find(utils.tagged_sentences)))
    print_to_screen_and_file("-" * 80)

    # classification
    print_to_screen_and_file("Multiclass classification:\n")
    stemmer = nltk.stem.WordNetLemmatizer()
    words = nltk.tokenize.wordpunct_tokenize(article.text)
    feats = utils.bag_of_words(words, article.text, stemmer)

    classifier = pickle.load(file(config.BAYES_CLASSIFIER_FILE, 'r'))
    b_class = classifier.classify(feats)
    print_to_screen_and_file("BayesClassifier class: " + b_class +
                             ", is correct? " + str(b_class in ground_truth))

    classifier = pickle.load(file(config.MAXENT_CLASSIFIER_FILE, 'r'))
    m_class = classifier.classify(feats)
    print_to_screen_and_file("MaxEntClassifier class: " + m_class +
                             ", is correct? " + str(m_class in ground_truth))

    classifier = pickle.load(file(config.DTREE_CLASSIFIER_FILE, 'r'))
    d_class = classifier.classify(feats)
    print_to_screen_and_file("DecisionTreeClassifier class: " + d_class +
                             ", is correct? " + str(d_class in ground_truth))
    print_to_screen_and_file("-" * 80)

    print_to_screen_and_file("Binary classification:\n")
    title = ["BayesClassifier: ", "MaxEntClassifier: ", "DecisionTreeClassifier: "]
    classifiers = [config.BAYES_CLASSIFIER_FILE_PATTERN,
                   config.MAXENT_CLASSIFIER_FILE_PATTERN,
                   config.DTREE_CLASSIFIER_FILE_PATTERN]
    tags = ["A", "B", "C", "D", "E", "OTHER"]
    for index, typename in enumerate(classifiers):
        results = {}
        accuracy = 0
        for tag in tags:
            fname = typename % (tag)
            classifier = pickle.load(file(fname, 'r'))
            results[tag] = classifier.classify(feats)
            if results[tag] == "yes":
                if (tag in ground_truth):
                    accuracy += 1
            elif results[tag] == "no":
                if (tag not in ground_truth):
                    accuracy += 1
        print_to_screen_and_file(title[index] + str(results) + ", accuracy: " +
                                 str(accuracy * 100 / len(tags)) + "%")
    print_to_screen_and_file("-" * 80)

    # people actions
    print_to_screen_and_file("People and their actions:\n")
    work = action.Actions().find(utils.tagged_words, utils.tagged_sentences, utils.people)
    # print the updated info with people actions
    for i, (key, value) in enumerate(work.items()):
        print_to_screen_and_file("[%d] - %s = %s" % (i + 1, key, value))
    print_to_screen_and_file("-" * 80)

    # anaphora
    print_to_screen_and_file("Anaphoras:\n")
    refs = references.References().find(utils.people, utils.sentences, utils.tagged_sentences)
    for ref, fullname, index in refs:
        print_to_screen_and_file("Sentence[" + str(index + 1) + "]: " + ref + " - " + fullname)
    print_to_screen_and_file("-" * 80)

    # interactions
    print_to_screen_and_file("People interactions:\n")
    inter = interactions.Interactor().find(refs, utils.tagged_sentences)
    for index, item in enumerate(inter):
        who, prp, what = item['who'], item['prp'], item['what']
        s = "[" + str(index + 1) + "]:"
        for i in xrange(len(who)):
            if prp[i] and who[i]:
                s += " " + who[i] + "(" + prp[i] + "), "
            elif prp[i]:
                s += prp[i] + ", "
            elif who[i]:
                s += " " + who[i] + ", "
        s += " - " + ", ".join(what)
        print_to_screen_and_file(s)
    print_to_screen_and_file("-" * 80)

    print "Finished."
    fp.close()
tags.append(tag)
for pattern in intent['patterns']:
    processed = tokenize(pattern)
    all_words.extend(processed)
    xy.append((processed, tag))

ignored = [',', '.', '?', '!']
all_words = [stem(word) for word in all_words if word not in ignored]  # stem all the words
all_words = sorted(set(all_words))  # get rid of duplicates
tags = sorted(set(tags))  # get rid of duplicates

X_train = []
y_train = []
for (processed_sentence, tag) in xy:
    bag = bag_of_words(processed_sentence, all_words)
    X_train.append(bag)
    label = tags.index(tag)
    y_train.append(label)

# parameters
batch_size = 8
hidden_size = 8
output_size = len(tags)
input_size = len(X_train[0])
learning_rate = 0.001
num_epochs = 800

dataset = ChatDataset()
train_loader = DataLoader(dataset=dataset,
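# ChatDataset above wraps the (bag-of-words, label) pairs built in this script
# for use with DataLoader, but its definition is not shown. A minimal sketch,
# assuming a plain torch Dataset over X_train / y_train; the real class may
# differ.
from torch.utils.data import Dataset

class ChatDataset(Dataset):
    def __init__(self):
        self.n_samples = len(X_train)
        self.x_data = X_train
        self.y_data = y_train

    def __getitem__(self, index):
        return self.x_data[index], self.y_data[index]

    def __len__(self):
        return self.n_samples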