if __name__ == '__main__':
    # Assumes module-level definitions from earlier in the script: `data` (a
    # DataFrame with a 'lemmatized text' column) and `hvect` (a HashingVectorizer).
    separator = '-' * 106
    print(separator)
    print("Time now: " + str(datetime.now()))
    data.info()  # .info() prints its summary itself and returns None
    print("Time now: " + str(datetime.now()))
    print(type(data['lemmatized text']))
    print(separator)
    print("Loading Model..., Time now: " + str(datetime.now()))

    # Hash the lemmatized text, then re-weight the term counts with TF-IDF.
    hashv = hvect.fit_transform(data['lemmatized text'].values.astype('U'))
    tfidf = TfidfTransformer(use_idf=True).fit(hashv)
    x_tf = tfidf.transform(hashv)
    # x_tf is sparse, and a HashingVectorizer has no feature names (the original
    # columns=hvect.get_stop_words() confused stop words with features).
    df_tfidf = pd.DataFrame.sparse.from_spmatrix(x_tf)
    print(df_tfidf)
    print("Model Loaded..., Time now: " + str(datetime.now()))
    print(separator)

    exit_commands = ("quit", "pause", "exit", "goodbye", "bye", "later", "stop")
    user = input("\nEnter Your Name: ")
    botname = "Watson"
    print("Hey " + user + "! I'm " + botname + ", a chatbot trained on random data!\n")
    while True:
        request = input(user + ': ')
        if request.lower() in exit_commands:
            print(botname + ': Bye, Have a nice day!')
            break  # the original never left the loop after saying goodbye
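# ---------------------------------------------------------------------------
# The chat loop above only recognizes exit commands and never produces a
# reply. The sketch below shows one way the fitted TF-IDF matrix could pick
# one; it is NOT part of the original script, and it assumes a hypothetical
# `responses` list aligned row-for-row with `data`.
# ---------------------------------------------------------------------------
from sklearn.metrics.pairwise import cosine_similarity

def best_reply(request, responses):
    # Vectorize the request exactly like the training text was vectorized.
    q = tfidf.transform(hvect.transform([request]))
    # Cosine similarity of the request against every TF-IDF row, then return
    # the response whose source text is most similar.
    sims = cosine_similarity(q, x_tf).ravel()
    return responses[sims.argmax()]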
from sklearn.linear_model import SGDClassifier  # the old sklearn.linear_model.stochastic_gradient path was removed
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.preprocessing import LabelBinarizer
from scipy.sparse import csr_matrix
import numpy as np
import nltk
import re
import itertools
import string
import pickle
import csv
import time

# Define parser/tokenizer for the feature extractor
porter = nltk.PorterStemmer()  # stemmer used by the parser
punc = string.punctuation + '\n' + '0123456789'  # all punctuation + newline + digits
htmp = HashingVectorizer(stop_words='english')
sw = htmp.get_stop_words()  # prebuild a stop-word list for speed in the parser
regex = re.compile('[%s]' % re.escape(punc))

def myparserTwo(s):
    # Strip punctuation and digits, lowercase, split on spaces, drop stop
    # words and tokens shorter than three characters, then stem the rest.
    return [porter.stem(a) for a in regex.sub('', s).lower().split(' ')
            if a not in sw and len(a) > 2]

# Define a label indicator matrix creator (binarizer)
def labelizer(y, classes):
    imap = dict((v, k) for k, v in enumerate(classes))
    row = []
    col = []
    for i, lt in enumerate(y):
        for l in lt:
            # (completion of the truncated original: collect COO coordinates)
            row.append(i)
            col.append(imap[l])
    # One row per sample, one column per class, 1 where the label is present.
    ones = np.ones(len(row), dtype=np.int8)
    return csr_matrix((ones, (row, col)), shape=(len(y), len(classes)))
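# ---------------------------------------------------------------------------
# Quick usage sketch for the two helpers above. The sentence and label sets
# are invented for illustration only.
# ---------------------------------------------------------------------------
print(myparserTwo("The 3 quick brown foxes are jumping!"))  # e.g. ['quick', 'brown', 'fox', 'jump']

y = [['sports', 'news'], ['tech']]                # multi-label targets
Y = labelizer(y, classes=['news', 'sports', 'tech'])
print(Y.toarray())
# [[1 1 0]
#  [0 0 1]]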
###############################################################################
# Main
# ----
#
# Create the vectorizer and limit the number of features to a reasonable
# maximum

N_FEATURES = 2 ** 18
vectorizer = HashingVectorizer(decode_error='ignore', n_features=N_FEATURES,
                               alternate_sign=False)  # non_negative=True was renamed to alternate_sign=False in scikit-learn 0.19 and removed in 0.21

tokenizer = vectorizer.build_tokenizer()
preprocessor = vectorizer.build_preprocessor()
stop_words = vectorizer.get_stop_words()


def tokenize(text):
    # _word_ngrams is a private scikit-learn helper: it builds the configured
    # word n-grams from the token list while filtering out the stop words.
    return vectorizer._word_ngrams(
        tokenizer(preprocessor(vectorizer.decode(text))), stop_words)


# Iterator over parsed Reuters SGML files.
data_stream = stream_reuters_documents()

# We learn a binary classification between the "acq" class and all the others.
# "acq" was chosen as it is more or less evenly distributed in the Reuters
# files. For other datasets, one should take care of creating a test set with
# a realistic portion of positive instances.
all_classes = np.array([0, 1])
positive_class = 'acq'
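# ---------------------------------------------------------------------------
# A minimal out-of-core training sketch under the setup above; this is an
# added illustration, not part of the original. It assumes each streamed
# document is a dict with 'title', 'body' and 'topics' keys (as in the
# scikit-learn Reuters example); get_minibatch is a local helper, not a
# library function.
# ---------------------------------------------------------------------------
from sklearn.linear_model import SGDClassifier
import itertools


def get_minibatch(doc_iter, size):
    """Extract a minibatch of (text, binary label) pairs from the stream."""
    batch = [('{title}\n\n{body}'.format(**doc),
              int(positive_class in doc['topics']))
             for doc in itertools.islice(doc_iter, size)]
    if not batch:
        return [], np.array([], dtype=int)
    texts, labels = zip(*batch)
    return list(texts), np.asarray(labels, dtype=int)


clf = SGDClassifier(max_iter=5)
while True:
    texts, y = get_minibatch(data_stream, size=1000)
    if not texts:
        break
    X = vectorizer.transform(texts)  # hashing is stateless: no fit required
    clf.partial_fit(X, y, classes=all_classes)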