Example #1
def create_classifier():

    sample_matched = [
        "What's the difference between an ocean and a sea?",
        "What is the difference between weather and climate?",
        "What would you say about the difference between atoms and elements?",
        "How would you compare lacrosse to football?",
        "How would you compare NetSuite vs Intacct?",
        "How many km are between Iasi and Hawaii?",
        "Tell me the difference between Obama and Trump",
        "What is the difference between me and you?",
        "Elaborate on why Trump is better than Obama"
    ]

    sample_unmatched = nchat.xml_posts()[:800]

    features_matched = []
    features_unmatched = []

    for text in sample_matched:
        features = (feature_select(text), 'matched')
        features_matched.append(features)

    for item in sample_unmatched:
        text = item.text
        features = (feature_select(text), 'unmatched')
        features_unmatched.append(features)

    train_features = features_matched + features_unmatched
    return nltk.NaiveBayesClassifier.train(train_features)
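A quick usage sketch for the classifier above. feature_select is the project's own feature extractor (not shown in this excerpt) and nchat is assumed to alias nltk.corpus.nps_chat:

classifier = create_classifier()
# classify a new question with the same (project-specific) feature extractor
print(classifier.classify(feature_select("How would you compare Python to Java?")))
# expected label: 'matched' or 'unmatched'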
Example #2
def build_model(h):
    global model
    model = Model(h)
    global history
    history = h
    global vocab
    for w in word_tokenize('\n'.join(h)):
        if w.lower() in vocab:
            vocab[w.lower()] += 1
        else:
            vocab[w.lower()] = 1
    # print(vocab)
    global named_entities
    try:
        with open('named.pickle', 'rb') as f:
            named_entities = pickle.load(f)
    except FileNotFoundError:
        h_tokens = [word_tokenize(s) for s in h]
        tagged = tagger.tag_sents(h_tokens)
        named_entities = [tagged[0][0]]
        for n_e in tagged:
            for i in range(1, len(n_e)):
                if n_e[i][1] == n_e[i - 1][1]:
                    named_entities[-1] = (named_entities[-1][0] + ' ' +
                                          n_e[i][0], n_e[i][1])
                else:
                    named_entities.append(n_e[i])
        # print(named_entities)
        with open('named.pickle', 'wb') as f:
            pickle.dump(named_entities, f)
    generate_greeting_classifier_nps()
    # print('finding greetings')
    # greeting_classified = {s: classify_greeting(s) for s in h[:100]}
    # print('found greetings')
    global hellos, byes
    # hellos = {s: greeting_classified[s] for s in greeting_classified if greeting_classified[s] == 'Greet'}
    # byes = {s: greeting_classified[s] for s in greeting_classified if greeting_classified[s] == 'Bye'}
    hellos = {
        s.text: s.get('class')
        for s in nps_chat.xml_posts() if s.get('class') == 'Greet'
    }
    byes = {
        s.text: s.get('class')
        for s in nps_chat.xml_posts() if s.get('class') == 'Bye'
    }
    print('ready')
Example #3
 def _setQuestionWorld(self):
     self.dictionary = PyDictionary()
     posts = nchat.xml_posts()[:10000]
     featuresets = [(self.dialogue_act_features(post.text),
                     post.get('class')) for post in posts]
     size = int(len(featuresets) * 0.1)
     train_set, test_set = featuresets[size:], featuresets[:size]
     self.classifier = nltk.NaiveBayesClassifier.train(train_set)
     self.classifier.labels()
Example #4
def get_nltk_nps_corpus():
    posts = []

    for xml_file in nps_chat.xml_posts():
        post_one = []
        for post in xml_file:
            t = post[0]
            at = t.attrib
            post_one.append([at['word'], at['pos']])
        posts.append(post_one)

    return posts
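For comparison, NLTK's corpus reader can also return word/POS pairs for each post directly, without walking the XML elements by hand; a brief sketch using the standard API:

from nltk.corpus import nps_chat

# each post is returned as a list of (word, pos) tuples
tagged = nps_chat.tagged_posts()
print(tagged[0][:5])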
Example #5
def generate_greeting_classifier_nps():
    global greeting_classifier
    try:
        with open('greet_classifier.pickle', 'rb') as f:
            greeting_classifier = pickle.load(f)
    except FileNotFoundError:
        v = set([w.lower() for w in nps_chat.words()])
        posts = nps_chat.xml_posts()[:5000]
        h = [
            (sentence_features(s.text.lower(), v=v),
             s.get('class') if s.get('class') in ['Greet', 'Bye'] else 'Other')
            for s in posts
        ]
        generate_greeting_classifier(h)
        with open('greet_classifier.pickle', 'wb') as f:
            pickle.dump(greeting_classifier, f)
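sentence_features is defined elsewhere in that project; a minimal word-presence sketch consistent with how it is called above (lower-cased text plus a vocabulary set v) might look like this:

from nltk import word_tokenize

def sentence_features(text, v):
    # hypothetical sketch only: binary word-presence features over the vocabulary v
    tokens = set(word_tokenize(text))
    return {'contains(%s)' % w: (w in tokens) for w in v}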
Example #6
	def build_informal_set(self):
		minlines = 6
		maxlines = 25
		labeled_sets = []
		xml_posts = nps_chat.xml_posts()
		lines = 0
		goal = random.randint(minlines, maxlines)
		builder = ""
		for msg in xml_posts:
			if ".ACTION" not in msg.text:
				builder = builder+" "+msg.text.strip()
				lines += 1
			if lines > goal:
				labeled_sets.append((self.extract_features(builder), self.informal_label))
				goal = random.randint(minlines, maxlines)
				builder = ""
				lines = 0
		return labeled_sets
Example #7
 def build_informal_set(self):
     minlines = 6
     maxlines = 25
     labeled_sets = []
     xml_posts = nps_chat.xml_posts()
     lines = 0
     goal = random.randint(minlines, maxlines)
     builder = ""
     for msg in xml_posts:
         if ".ACTION" not in msg.text:
             builder = builder + " " + msg.text.strip()
             lines += 1
         if lines > goal:
             labeled_sets.append(
                 (self.extract_features(builder), self.informal_label))
             goal = random.randint(minlines, maxlines)
             builder = ""
             lines = 0
     return labeled_sets
Example #8
def calculate_confidence_index():
    cfd = nltk.ConditionalFreqDist((target, fileid[:10])
        for fileid in nps.fileids()
        for posts in nps.xml_posts(fileid)
        for target in ['ynQuestion']
        if (posts.get('class') == 'ynQuestion'))
    cfd.plot()
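    # cfd.plot() needs matplotlib; for a text-only view, the same
    # ConditionalFreqDist can be printed instead:
    # cfd.tabulate()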


    # if(flagCount != 0 && timeElapsed != 0)
    # {

    
    # }else{

    
    # }
    print("Printing confidence index as a function"
        "of flagCount and timeElapsed")
Example #9
def classify_text():
    posts = nps_chat.xml_posts()[:10000]
    featuresets = [(dialogue_act_features(post.text), post.get('class'))
                   for post in posts]
    size = int(len(featuresets) * 0.1)
    train_set, test_set = featuresets[size:], featuresets[:size]
    classifier = nltk.NaiveBayesClassifier.train(train_set)
    #print(nltk.classify.accuracy(classifier, test_set))
    print(
        classifier.classify(
            dialogue_act_features("how are you, my sweet cat?")))
    print(classifier.classify(dialogue_act_features("Shut up and get out")))
    print(
        classifier.classify(
            dialogue_act_features(
                "You are the most wonderful thing in my life")))
    print(classifier.classify(dialogue_act_features("I loved the movie")))
    print(
        classifier.classify(
            dialogue_act_features("How many suns does Jupitor have")))
    print(classifier.classify(dialogue_act_features("Do you love me?")))
Example #10
def load_data():
    global N, words, labels

    posts = corpus.xml_posts()[:10000]
    freqs = [ FreqDist(post.text) for post in posts ] 
    words = list(set(word 
                    for dist in freqs 
                    for word in dist.keys()
                    if word not in ENGLISH_STOP_WORDS and
                    word not in punctuation))

    labels = list(set([ post.get('class') for post in posts ]))

    data = []
    N = len(words)
    for post, dist in zip(posts, freqs):
        V = Vol(1, 1, N, 0.0)
        for i, word in enumerate(words):
            V.w[i] = dist.freq(word)
        data.append((V, labels.index(post.get('class'))))

    return data
Example #11
import sys

import numpy
from nltk.cluster import KMeansClusterer, GAAClusterer, euclidean_distance
import nltk.corpus
from nltk import decorators
import nltk.stem
from nltk.corpus import nps_chat
posts = nps_chat.xml_posts()

stemmer_func = nltk.stem.snowball.EnglishStemmer().stem
stopwords = set(nltk.corpus.stopwords.words('english'))


@decorators.memoize
def normalize_word(word):
    return stemmer_func(word.lower())


def get_words(titles):
    words = set()
    for title in titles:
        for word in title.split():
            words.add(normalize_word(word))
    return list(words)


@decorators.memoize
def vectorspaced(title):
    title_components = [normalize_word(word) for word in title.split()]
    return numpy.array(
        # assumed completion (the original is truncated at this point):
        # a binary word-presence vector over the word list built by get_words()
        [word in title_components and word not in stopwords
         for word in words],
        dtype=numpy.short)
Example #12
import nltk
import random
from nltk.corpus import nps_chat
from nltk.corpus import stopwords
import pickle

stop_words = set(stopwords.words('english'))
stop_words.remove('no')
stop_words.add('...')

xml_posts_0 = nps_chat.xml_posts()
posts_0 = nps_chat.posts()

categorized_posts = []
index = 0

# Categorize 'yAnswer' and 'nAnswer' posts as Yes/No
for el in xml_posts_0:
    if el.attrib.get('class') == 'yAnswer':
        categorized_posts.append((posts_0[index], 'Yes'))
    elif el.attrib.get('class') == 'nAnswer':
        categorized_posts.append((posts_0[index], 'No'))
    index += 1

all_words = []
for (post, category) in categorized_posts:
    # assumed continuation (the original is truncated here): collect
    # lower-cased tokens for later frequency-based feature selection
    all_words.extend(w.lower() for w in post)
Example #13
    def make_sentece(self, augment={}, threshold=0.4):
        sent = ['__BEGIN__', '__BEGIN__']
        while sent[-1] != '__END__':
            state = sent[-self.state_size:]
            counts = [a[1] for a in self.model if a[0] == state][0]
            if not counts:
                counts = [a[1] for a in self.model if a[0] == state][0]
            augments = {
                a: augment[a] if augment[a] >= threshold else 0
                for a in augment
            }
            weights = {
                a: counts[a] * (augments[a]) if a in augments else counts[a]
                for a in counts
            }
            total = sum(weights.values())
            if total == 0:
                weights = {a: 1 for a in weights}
                total = sum(weights.values())
            probs = [a / total for a in weights.values()]
            draw = numpy.random.choice(list(counts.keys()), 1, p=probs)[0]
            sent.append(draw)
        detokenizer = MosesDetokenizer()
        return detokenizer.detokenize(sent[2:-1])


if __name__ == '__main__':
    chain = AugmentedChain([a.text for a in nps_chat.xml_posts()][:5000])
    print(chain.model)
    print(chain.make_sentece())
Example #14
import os
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
import nltk
from nltk.corpus import nps_chat
import numpy as np
from nltk.data import load
from nltk.corpus import stopwords

posts = nltk.corpus.nps_chat.xml_posts()
label_names = np.array(
    sorted(nltk.FreqDist(p.attrib['class'] for p in posts).keys()))

chatroom = nps_chat.xml_posts()

features = []
stop_words = set(stopwords.words('english'))
stop_words.update(
    ['.', ',', '"', "'", '?', '!', ':', ';', '(', ')', '[', ']', '{', '}'])
for item in chatroom:
    text = nltk.word_tokenize(item.text)
    sent = (list(nltk.pos_tag(text)))
    if item.get('class') == 'whQuestion' or item.get('class') == 'ynQuestion':
        for x in sent:
            if x[0].lower() not in stop_words:  # x is a (word, pos) tuple
                features.append(x)
features = nltk.FreqDist(features)
print(features.most_common(15))
Example #15
import nltk
from nltk.tokenize import TweetTokenizer
from nltk.corpus import nps_chat as nps
import os


# twitterSamples = nltk.corpus.twitter_samples
# negTweets = twitter_samples.strings('negative_tweets.json')

teenChat = nps.xml_posts("11-08-teens_706posts.xml")
chatWords = nps.words("11-08-teens_706posts.xml")
chatBigrams = nltk.bigrams(chatWords)
cfd = nltk.ConditionalFreqDist(chatBigrams)
maxConfidence = 100
flagFile = open('flagList.txt')
flagList = flagFile.read()




def calculate_flags():
    flagNumber = 0
    tokens = nltk.word_tokenize(flagList)

    # TODO: using a list of flags to be determined,
    # iterate through posts to find instances of any flags
    cfd = nltk.ConditionalFreqDist((tokens, fileid[:10])
        for fileid in nps.fileids()
        for posts in nps.words(fileid)
        for target in [tokens]
        #you need a check if len(samples) < 1
Example #16
except FileNotFoundError:
    categorized_sentences = []

# load up categorized sentences if found
try:
    f = open('sentence_clusters.pickle', 'rb')
    sentence_clusters = pickle.load(f)
    f.close()
except FileNotFoundError:
    sentence_clusters = []


# preprocessing nps chat corpus for sentence classification
all_words = nltk.FreqDist(w.lower() for w in nps_chat.words())
word_features = [a[0] for a in all_words.most_common()[:2000]]
sentences = [(nltk.word_tokenize(a.text.lower()), a.attrib['class']) for a in nps_chat.xml_posts()]

# logical response types for each input sentence type
response_types = {
    'Accept':       ['Statement', 'Emotion', 'Emphasis'],
    'Bye':          ['Bye'],
    'Clarify':      ['Accept', 'Reject', 'Statement', 'Emphasis'],
    'Emotion':      ['Accept', 'Reject', 'Statement', 'Emotion', 'Emphasis'],
    'Continuer':    ['Accept', 'Reject', 'Statement', 'Emphasis'],
    'Emphasis':     ['Accept', 'Reject', 'Statement', 'Emotion', 'Emphasis'],
    'Greet':        ['Greet'],
    'Other':        ['Statement'],
    'Reject':       ['Statement', 'Emotion', 'Emphasis'],
    'Statement':    ['Accept', 'Reject', 'Statement', 'Emotion', 'Emphasis'],
    'System':       ['Statement'],
    'nAnswer':      ['Statement', 'Emotion', 'Emphasis'],
Example #17
@author: Kamalakanta
"""

import db_operations as dbo
from nltk.corpus import nps_chat as nc
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.linear_model import LogisticRegression, LinearRegression, SGDClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.model_selection import train_test_split as tst
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
import matplotlib.pyplot as plt
import pickle

posts = nc.xml_posts()

categories = [
    'accept', 'statement', 'yanswer', 'clarify', 'nanswer', 'reject', 'bye',
    'greet', 'whquestion', 'ynquestion', 'command'
]

categories_dict = {}
for i in range(len(categories)):
    if (categories[i] in {'accept', 'statement', 'yanswer', 'clarify'}):
        categories_dict[categories[i]] = 0
    elif (categories[i] in {'nanswer', 'reject'}):
        categories_dict[categories[i]] = 1
    elif (categories[i] == 'bye'):
        categories_dict[categories[i]] = 2
    elif (categories[i] == 'greet'):