def main(C, K, a, b, binary):
    # load data
    train_iter, val_iter, test_iter, text_field, label_field = utils.load_SST()

    # initialize classifier
    alpha = a * np.ones(C)
    beta = b * np.ones(K)
    n_features = len(text_field.vocab)
    nb = NaiveBayesClassifier(alpha, beta, n_features)

    print("Training model...")
    for i, batch in enumerate(train_iter):
        X = utils.bag_of_words(batch, text_field).data.numpy()
        if binary:
            X = X > 0
        y = batch.label.data.numpy() - 1
        nb.fit(X, y)

    print("Testing model...")
    n, n_corr = 0, 0
    upload = []
    for i, batch in enumerate(val_iter):
        X = utils.bag_of_words(batch, text_field).data.numpy()
        if binary:
            X = X > 0
        y_pred = nb.predict(X)
        y = batch.label.data.numpy() - 1
        n += len(y)
        n_corr += sum(y_pred == y)
        upload += list(y_pred + 1)

    # write predictions to file
    print('Writing predictions to file...')
    with open("predictions.txt", "w") as f:
        f.write('Id,Cat\n')
        for i, u in enumerate(upload):
            f.write('{},{}\n'.format(i, u))

    return n_corr / n
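# The snippet above calls utils.bag_of_words(batch, text_field) and treats the
# result as a (batch_size, vocab_size) count matrix, but the helper itself is
# not shown. Below is a hedged sketch of what it might look like; the
# batch.text layout of (seq_len, batch_size) token indices is an assumption.
import torch

def bag_of_words(batch, text_field):
    vocab_size = len(text_field.vocab)
    tokens = batch.text  # assumed shape: (seq_len, batch_size) of token indices
    counts = torch.zeros(tokens.size(1), vocab_size)
    for j in range(tokens.size(1)):
        for idx in tokens[:, j]:
            counts[j, int(idx)] += 1  # accumulate word counts per example
    return counts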
def load_samples(sample_list, tag, stemmer, max_words):
    data_set = []
    for (filename, category) in sample_list:
        # extract article words
        words = nltk.tokenize.wordpunct_tokenize(data.Article(filename).text)
        all_words = nltk.FreqDist(words)
        tokens = all_words.keys()
        if len(tokens) > max_words:
            # limit to max most frequent words per article
            tokens = tokens[:max_words]
        data_set.append((utils.bag_of_words(tokens, words, stemmer, True), tag))
    random.shuffle(data_set)
    return data_set
def load_samples(sample_list, stemmer, max_words):
    data_set = []
    for (filename, category) in sample_list:
        # extract article words
        words = nltk.tokenize.wordpunct_tokenize(data.Article(filename).text)
        all_words = nltk.FreqDist(words)
        tokens = all_words.keys()
        if len(tokens) > max_words:
            # limit to max most frequent words per article
            tokens = tokens[:max_words]
        data_set.append((utils.bag_of_words(tokens, words, stemmer), category))
    random.shuffle(data_set)
    return data_set
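# Both load_samples variants above pass utils.bag_of_words(tokens, words, stemmer)
# straight into NLTK classifiers (see the run() pipeline further below), which
# consume feature dicts. This is only a hedged sketch of such a helper; the
# roles of the `words` argument and of the boolean flag seen in the first
# variant are assumptions and they are left unused here.
def bag_of_words(tokens, words, stemmer, binary=True):
    # map each lemmatized token to True; NLTK classifiers accept such dicts
    # (the caller below builds stemmer as nltk.stem.WordNetLemmatizer())
    return {stemmer.lemmatize(token.lower()): True for token in tokens}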
def run():
    # theta is assumed to be shared with run_epoch() at module level
    global theta
    theta = np.zeros([action_dim, state_dim])
    single_run_epoch_rewards_test = []
    pbar = tqdm(range(NUM_EPOCHS), ncols=80)
    for _ in pbar:
        single_run_epoch_rewards_test.append(run_epoch())
        pbar.set_description(
            "Avg reward: {:0.6f} | Ewma reward: {:0.6f}".format(
                np.mean(single_run_epoch_rewards_test),
                utils.ewma(single_run_epoch_rewards_test)))
    return single_run_epoch_rewards_test


if __name__ == '__main__':
    state_texts = utils.load_data('game.tsv')
    dictionary = utils.bag_of_words(state_texts)
    state_dim = len(dictionary)
    action_dim = NUM_ACTIONS * NUM_OBJECTS

    # set up the game
    framework.load_game_data()

    epoch_rewards_test = []  # shape NUM_RUNS * NUM_EPOCHS
    for _ in range(NUM_RUNS):
        epoch_rewards_test.append(run())

    epoch_rewards_test = np.array(epoch_rewards_test)
    x = np.arange(NUM_EPOCHS)
    fig, axis = plt.subplots()
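# utils.ewma() above reduces the reward history to a single smoothed value for
# the progress bar. A minimal sketch, assuming a standard exponentially
# weighted moving average; the smoothing factor alpha is an assumption.
def ewma(values, alpha=0.1):
    avg = values[0]
    for v in values[1:]:
        avg = alpha * v + (1 - alpha) * avg
    return avg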
model_state = data["model_state"]

model = NeuralNet(input_size, hidden_size, output_size).to(device)
model.load_state_dict(model_state)
model.eval()

bot_name = "Sam"
print("Let's chat! (type 'quit' to exit)")
while True:
    # sentence = "do you use credit cards?"
    sentence = input("You: ")
    if sentence == "quit":
        break

    sentence = tokenize(sentence)
    X = bag_of_words(sentence, all_words)
    X = X.reshape(1, X.shape[0])
    X = torch.from_numpy(X).to(device)

    output = model(X)
    _, predicted = torch.max(output, dim=1)
    tag = tags[predicted.item()]

    probs = torch.softmax(output, dim=1)
    prob = probs[0][predicted.item()]
    if prob.item() > 0.75:
        for intent in intents['intents']:
            if tag == intent["tag"]:
                print(f"{bot_name}: {random.choice(intent['responses'])}")
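# The chat loop above loads weights into NeuralNet(input_size, hidden_size,
# output_size) but does not show the class. A minimal sketch, assuming a small
# feed-forward classifier with two hidden ReLU layers that returns raw logits
# (the caller applies torch.softmax itself).
import torch.nn as nn

class NeuralNet(nn.Module):
    def __init__(self, input_size, hidden_size, num_classes):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(input_size, hidden_size), nn.ReLU(),
            nn.Linear(hidden_size, hidden_size), nn.ReLU(),
            nn.Linear(hidden_size, num_classes),
        )

    def forward(self, x):
        return self.net(x)  # logits; no softmax here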
def update_graph(interval):
    # query tweets from the database
    df = get_tweet_data()

    # get the number of tweets for each keyword
    cnt = bag_of_words(df['text'])

    # get top-N words
    top_N = cnt.most_common(num_tags_scatter)
    top_N_words = [keyword for keyword, cnt in top_N]

    # preprocess the text column
    df['text'] = df.text.apply(preprocess_nltk)

    sentiments = {keyword: [] for keyword in top_N_words}
    for row in df['text']:
        # print(row)
        for keyword in top_N_words:
            # print(keyword)
            if keyword.lower() in row.lower():
                # print(sid.polarity_scores(row)['compound'])
                sentiments[keyword].append(
                    sid.polarity_scores(row)['compound'])

    avg_sentiments = {}
    for keyword, score_list in sentiments.items():
        avg_sentiments[keyword] = [np.mean(score_list), np.std(score_list)]

    # get the current time for x-axis
    time = datetime.datetime.now().strftime('%D, %H:%M:%S')
    X_universal.append(time)

    to_pop = []
    for keyword, score_queue in sentiment_dict.items():
        if score_queue:
            while score_queue and (score_queue[0][1] <= X_universal[0]):
                score_queue.popleft()
        else:
            to_pop.append(keyword)

    for keyword in to_pop:
        sentiment_dict.pop(keyword)

    for keyword, score in avg_sentiments.items():
        if keyword not in sentiment_dict:
            sentiment_dict[keyword] = deque(maxlen=30)
            sentiment_dict[keyword].append([score, time])
        else:
            sentiment_dict[keyword].append([score, time])

    new_colors = chart_colors[:len(sentiment_dict)]

    # plot the scatter plot
    data = [
        go.Scatter(
            x=[time for score, time in score_queue],
            y=[score[0] for score, time in score_queue],
            error_y={
                "type": "data",
                "array": [score[1] / 30 for score, time in score_queue],
                "thickness": 1.5,
                "width": 1,
                "color": "#000",
            },
            name=keyword,
            mode='markers',
            opacity=0.7,
            marker=dict(color=color))
        for color, (keyword, score_queue) in list(zip(new_colors, sentiment_dict.items()))
    ]

    # specify the layout
    layout = go.Layout(
        xaxis={
            'automargin': False,
            'range': [min(X_universal), max(X_universal)],
            'title': 'Current Time (GMT)',
            'nticks': 2,
        },
        yaxis={
            'autorange': True,
            'title': 'Sentiment Score'
        },
        height=400,
        plot_bgcolor=app_color["graph_bg"],
        paper_bgcolor=app_color["graph_bg"],
        font={"color": app_color["graph_font"]},
        autosize=False,
        legend={
            'orientation': 'v',
            # 'xanchor': 'right',
            # 'yanchor': 'middle',
            # 'x': 0.5,
            # 'y': 1.025
        },
        margin=go.layout.Margin(l=75, r=25, b=70, t=25, pad=4),
    )

    return go.Figure(data=data, layout=layout)
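# In the dashboard callback above, bag_of_words(df['text']) must return
# something with a most_common() method, i.e. a word-frequency Counter over
# the tweet texts. A hedged sketch; the tokenization is an assumption.
from collections import Counter

def bag_of_words(text_series):
    counter = Counter()
    for text in text_series:
        counter.update(text.lower().split())  # naive whitespace tokenization
    return counter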
if cc != "": cc = int(cc) else: cc = 0 count = open('count.txt', 'r+') report = open("report.txt", "a") agenda = open("agenda.txt", "a") if (sentence == "quit" or sentence == "bye"): print("Printer-Bot: Au revoir :)") break date = datetime.datetime.now() sent = tokenize(sentence) X = bag_of_words(sent, all_words) X = X.reshape(1, X.shape[0]) X = torch.from_numpy(X) output = model(X) _, predicted = torch.max(output, dim=1) tag = tags[predicted.item()] probs = torch.softmax(output, dim=1) prob = probs[0][predicted.item()] if prob.item() >= 0.7: name_of_doc = "" time = nbr_pages = 0 for intent in intents['intents']: if tag == intent["tag"]: if tag == 'print': for word in sent:
# ignore some symbols
ignore_sym = ['?', '.', '!', ',', "'", '-']
all_words = [stem(wrd) for wrd in all_words if wrd not in ignore_sym]

# remove duplicates and sort
all_words = sorted(set(all_words))
tags = sorted(set(tags))
# print(len(x_y), "patterns")
# print(len(tags), "tags:", tags)
# print(len(all_words), "unique words:", all_words)

# Creating data-set
X_train = []
Y_train = []
for (ptrn_sent, tag) in x_y:
    bag = bag_of_words(ptrn_sent, all_words)
    X_train.append(bag)
    label = tags.index(tag)
    Y_train.append(label)

X_train = np.array(X_train)
Y_train = np.array(Y_train)

# Parameters
input_size = len(X_train[0])
output_size = len(tags)
batch_size = 8
hidden_size = 8
learning_rate = 0.001
epochs = 1000
lam_vals = [0, 0.001, 0.01, 0.1, 1]
num_epochs = 15

for lam in lam_vals:
    model = build_model(V, num_labels)
    optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
    size_val_data = 0.0
    val_num_correct = 0.0
    for epoch in range(num_epochs):
        size_training_data = 0.0
        train_num_correct = 0.0
        loss = 0.0
        for batch in train_iter:
            x = Variable(bag_of_words(batch, text_field))
            y = batch.label - 1  # batch.label is 1/2, while we want 0/1
            batch_loss = train(model, lam, x, y, optimizer)
            loss += batch_loss
            batch_num_correct = np.sum(
                np.argmax(torch.exp(model.forward(x)).data.cpu().numpy(), axis=1)
                == y.data.cpu().numpy())
            train_num_correct += batch_num_correct
            size_training_data += len(y)
        print('Epoch ' + str(epoch + 1))
        print('Lambda = ' + str(lam))
        print('Epoch train accuracy: ' + str(train_num_correct / size_training_data))
        print('Epoch train loss: ' + str(loss))
        print()
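# The regularization sweep above calls train(model, lam, x, y, optimizer) and
# later exponentiates model.forward(x), so the model presumably returns
# log-probabilities. A hedged sketch of one training step, assuming negative
# log-likelihood plus an L2 penalty weighted by lam; none of this is taken
# from the original code.
import torch

def train(model, lam, x, y, optimizer):
    optimizer.zero_grad()
    log_probs = model(x)
    nll = torch.nn.functional.nll_loss(log_probs, y)
    l2 = sum((p ** 2).sum() for p in model.parameters())
    loss = nll + lam * l2
    loss.backward()
    optimizer.step()
    return loss.item()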
ignore_words = ['?', '.', '!']
all_words = [stem(w) for w in all_words if w not in ignore_words]

# remove duplicates and sort
all_words = sorted(set(all_words))
tags = sorted(set(tags))

print(len(xy), "patterns")
print(len(tags), "tags:", tags)
print(len(all_words), "unique stemmed words:", all_words)

# create training data
X_train = []
y_train = []
for (pattern_sentence, tag) in xy:
    # X: bag of words for each pattern_sentence
    bag = bag_of_words(pattern_sentence, all_words)
    X_train.append(bag)
    # y: PyTorch CrossEntropyLoss needs only class labels, not one-hot
    label = tags.index(tag)
    y_train.append(label)

X_train = np.array(X_train)
y_train = np.array(y_train)

# Hyper-parameters
num_epochs = 1000
batch_size = 8
learning_rate = 0.001
input_size = len(X_train[0])
hidden_size = 8
output_size = len(tags)
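# The chatbot preprocessing snippets (this one and the similar ones in this
# collection) rely on tokenize(), stem() and bag_of_words() helpers that are
# not shown. A minimal sketch, assuming the usual NLTK + NumPy implementation;
# names and details are assumptions.
import numpy as np
import nltk
from nltk.stem.porter import PorterStemmer

_stemmer = PorterStemmer()

def tokenize(sentence):
    return nltk.word_tokenize(sentence)

def stem(word):
    return _stemmer.stem(word.lower())

def bag_of_words(tokenized_sentence, all_words):
    # 1.0 at every vocabulary position present in the sentence, else 0.0
    sentence_words = [stem(w) for w in tokenized_sentence]
    bag = np.zeros(len(all_words), dtype=np.float32)
    for idx, w in enumerate(all_words):
        if w in sentence_words:
            bag[idx] = 1.0
    return bag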
def run(path):
    global fp

    # load article text
    article = data.Article(path)
    utils.load_data(article.text)
    fp = file("results.txt", "w")

    # show article text
    print_to_screen_and_file("-" * 80)
    print_to_screen_and_file("Original article:\n")
    print_to_screen_and_file(article.text)
    print_to_screen_and_file("-" * 80)

    print_to_screen_and_file("Categories:\n")
    top5 = pickle.load(open(config.TOP5_CATEGORIES, "r"))  # list of: [catname, count, tag]
    print_to_screen_and_file("In article: " + str(article.cats))
    print_to_screen_and_file("Top5: " + str(top5))
    ground_truth = [tag for cat, count, tag in top5 if cat in article.cats]
    print_to_screen_and_file("Present from Top5: " + str(ground_truth))
    print_to_screen_and_file("-" * 80)

    # make the summary & show in console
    print_to_screen_and_file("I Summary:\n")
    instance = SimpleSummarizer()
    # shorten the original article by one third
    print_to_screen_and_file(instance.summarize(article.text, len(utils.sentences) / 3))
    print_to_screen_and_file("-" * 80)

    print_to_screen_and_file("II Summary:\n")
    print_to_screen_and_file(" ".join(ph_reduction.PhraseReductor().find(utils.tagged_sentences)))
    print_to_screen_and_file("-" * 80)

    # classification
    print_to_screen_and_file("Multiclass classification:\n")
    stemmer = nltk.stem.WordNetLemmatizer()
    words = nltk.tokenize.wordpunct_tokenize(article.text)
    feats = utils.bag_of_words(words, article.text, stemmer)

    classifier = pickle.load(file(config.BAYES_CLASSIFIER_FILE, 'r'))
    b_class = classifier.classify(feats)
    print_to_screen_and_file("BayesClassifier class: " + b_class +
                             ", is correct? " + str(b_class in ground_truth))

    classifier = pickle.load(file(config.MAXENT_CLASSIFIER_FILE, 'r'))
    m_class = classifier.classify(feats)
    print_to_screen_and_file("MaxEntClassifier class: " + m_class +
                             ", is correct? " + str(m_class in ground_truth))

    classifier = pickle.load(file(config.DTREE_CLASSIFIER_FILE, 'r'))
    d_class = classifier.classify(feats)
    print_to_screen_and_file("DecisionTreeClassifier class: " + d_class +
                             ", is correct? " + str(d_class in ground_truth))
    print_to_screen_and_file("-" * 80)

    print_to_screen_and_file("Binary classification:\n")
    title = ["BayesClassifier: ", "MaxEntClassifier: ", "DecisionTreeClassifier: "]
    classifiers = [config.BAYES_CLASSIFIER_FILE_PATTERN,
                   config.MAXENT_CLASSIFIER_FILE_PATTERN,
                   config.DTREE_CLASSIFIER_FILE_PATTERN]
    tags = ["A", "B", "C", "D", "E", "OTHER"]
    for index, typename in enumerate(classifiers):
        results = {}
        accuracy = 0
        for tag in tags:
            fname = typename % (tag)
            classifier = pickle.load(file(fname, 'r'))
            results[tag] = classifier.classify(feats)
            if results[tag] == "yes":
                if (tag in ground_truth):
                    accuracy += 1
            elif results[tag] == "no":
                if (tag not in ground_truth):
                    accuracy += 1
        print_to_screen_and_file(title[index] + str(results) + ", accuracy: " +
                                 str(accuracy * 100 / len(tags)) + "%")
    print_to_screen_and_file("-" * 80)

    # people actions
    print_to_screen_and_file("People and their actions:\n")
    work = action.Actions().find(utils.tagged_words, utils.tagged_sentences, utils.people)
    # print the updated info with people actions
    for i, (key, value) in enumerate(work.items()):
        print_to_screen_and_file("[%d] - %s = %s" % (i + 1, key, value))
    print_to_screen_and_file("-" * 80)

    # anaphora
    print_to_screen_and_file("Anaphoras:\n")
    refs = references.References().find(utils.people, utils.sentences, utils.tagged_sentences)
    for ref, fullname, index in refs:
        print_to_screen_and_file("Sentence[" + str(index + 1) + "]: " + ref + " - " + fullname)
    print_to_screen_and_file("-" * 80)

    # interactions
    print_to_screen_and_file("People interactions:\n")
    inter = interactions.Interactor().find(refs, utils.tagged_sentences)
    for index, item in enumerate(inter):
        who, prp, what = item['who'], item['prp'], item['what']
        s = "[" + str(index + 1) + "]:"
        for i in xrange(len(who)):
            if prp[i] and who[i]:
                s += " " + who[i] + "(" + prp[i] + "), "
            elif prp[i]:
                s += prp[i] + ", "
            elif who[i]:
                s += " " + who[i] + ", "
        s += " - " + ", ".join(what)
        print_to_screen_and_file(s)
    print_to_screen_and_file("-" * 80)

    print "Finished."
    fp.close()
tags.append(tag)
for pattern in intent['patterns']:
    processed = tokenize(pattern)
    all_words.extend(processed)
    xy.append((processed, tag))

ignored = [',', '.', '?', '!']
all_words = [stem(word) for word in all_words if word not in ignored]  # stem all the words
all_words = sorted(set(all_words))  # get rid of duplicates
tags = sorted(set(tags))  # get rid of duplicates

X_train = []
y_train = []
for (processed_sentence, tag) in xy:
    bag = bag_of_words(processed_sentence, all_words)
    X_train.append(bag)
    label = tags.index(tag)
    y_train.append(label)

# parameters
batch_size = 8
hidden_size = 8
output_size = len(tags)
input_size = len(X_train[0])
learning_rate = 0.001
num_epochs = 800

dataset = ChatDataset()
train_loader = DataLoader(dataset=dataset,
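# ChatDataset above wraps the (bag-of-words, label) pairs built in this script
# for use with DataLoader, but its definition is not shown. A minimal sketch,
# assuming a plain torch Dataset over X_train / y_train; the real class may
# differ.
from torch.utils.data import Dataset

class ChatDataset(Dataset):
    def __init__(self):
        self.n_samples = len(X_train)
        self.x_data = X_train
        self.y_data = y_train

    def __getitem__(self, index):
        return self.x_data[index], self.y_data[index]

    def __len__(self):
        return self.n_samples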