class IssueClassification(object):
    """SVM classifier that scores a complaint text against issue categories.

    Constructing an instance immediately calls :meth:`trainClassifier`, which
    loads the corpus under ``<cwd>/training_data_issue`` and then either
    reuses the model pickled in ``model_issue.pickle`` or fits and pickles a
    new one.
    """

    # Corpus directory, joined onto the working directory and given to load_files.
    TRAINING_DIR = "training_data_issue"
    # Pickle file (relative to the working directory) caching the fitted SVC.
    MODEL_PATH = "model_issue.pickle"

    def __init__(self):
        """Remember the working directory and load or train the model."""
        self.root_dir = os.getcwd()
        self.trainClassifier()

    def fetch_data(self, cache, data_home=None, subset='train',
                   categories=None, shuffle=True, random_state=42):
        """Return ``cache[subset]``, optionally shuffled in place.

        Args:
            cache: dict mapping ``'train'`` / ``'test'`` to a dataset object
                exposing ``data``, ``target`` and ``filenames``.
            data_home: accepted for signature compatibility; unused.
            subset: ``'train'`` or ``'test'``.
            categories: accepted for signature compatibility; unused.
            shuffle: when true, permute ``data``, ``target`` and
                ``filenames`` of the dataset consistently (the dataset
                object itself is mutated).
            random_state: seed handed to ``check_random_state`` so the
                permutation is reproducible.

        Returns:
            The dataset object stored under ``cache[subset]``.

        Raises:
            ValueError: if ``subset`` is neither ``'train'`` nor ``'test'``.
            KeyError: if ``subset`` is valid but absent from ``cache``.
        """
        if subset not in ('train', 'test'):
            # The previous message advertised an 'all' subset that was never
            # actually accepted.
            raise ValueError(
                "subset can only be 'train' or 'test', got '%s'" % subset)
        data = cache[subset]

        if shuffle:
            rng = check_random_state(random_state)
            indices = np.arange(data.target.shape[0])
            rng.shuffle(indices)
            data.filenames = data.filenames[indices]
            data.target = data.target[indices]
            # Use an object array to shuffle: avoids memory copy
            data_lst = np.array(data.data, dtype=object)
            data.data = data_lst[indices].tolist()
        return data

    def token_ques(self, text):
        """Yield lower-cased word tokens of ``text``, one line at a time.

        Each line is lower-cased, stripped of surrounding spaces and split
        with ``word_tokenize``.

        NOTE(review): the original docstring claimed stop words were removed,
        but the code never filtered them; it only built the stop-word list
        and ran an unused POS tagging pass. That dead work is gone and the
        emitted tokens are unchanged, so previously pickled models stay
        consistent with the features produced here.

        Args:
            text: the text to tokenize.

        Yields:
            Lower-cased tokens in document order.
        """
        for line in text.split('\n'):
            cleaned = line.lower().strip(" ")
            for word in word_tokenize(cleaned):
                yield word.lower()

    def _load_model(self):
        """Return the classifier unpickled from MODEL_PATH, or None on failure."""
        try:
            # NOTE: unpickling executes arbitrary code; MODEL_PATH must only
            # ever contain a file written by this class.
            with open(self.MODEL_PATH, "rb") as model_file:
                return pickle.load(model_file)
        except Exception:
            import traceback
            print(traceback.format_exc())
            return None

    def _fit_and_save(self):
        """Fit ``self.clf`` on the hashed training corpus and pickle it."""
        documents = [raw.decode('utf-8', 'ignore')
                     for raw in self.data_train.data]
        raw_X = (self.token_ques(doc) for doc in documents)
        X_train = self.hasher.fit_transform(raw_X)
        self.clf.fit(X_train, self.data_train.target)
        with open(self.MODEL_PATH, 'wb') as model_file:
            pickle.dump(self.clf, model_file)

    def trainClassifier(self):
        """Load the training corpus, then load or fit the SVM model.

        Sets ``self.hasher``, ``self.clf`` and ``self.data_train``. Any
        failure is reported by printing the traceback rather than raised, so
        the instance may be left partially initialised.
        """
        try:
            started = time()
            self.hasher = FeatureHasher(input_type='string')
            self.clf = SVC(probability=True, C=5., gamma=0.001)
            data_folder = os.path.join(self.root_dir, self.TRAINING_DIR)
            train_dataset = load_files(data_folder)
            cache = dict(train=train_dataset)
            self.data_train = self.fetch_data(cache, subset='train')

            saved_model = self._load_model()
            if saved_model is not None:
                self.clf = saved_model
            else:
                print("Generating pickles")
                self._fit_and_save()
                print("Training ended")
            print("Classifier trained ...")
            # Formatted explicitly: passing two arguments to print() shows a
            # tuple repr under Python 2.
            print("time taken=> %s" % (time() - started))
        except Exception:
            import traceback
            print(traceback.format_exc())

    def testClassifier(self, record):
        """Score one complaint against every known category.

        Args:
            record: JSON string of an object with a ``'complain'`` key
                holding the text to classify.

        Returns:
            dict mapping category name to probability in percent. On any
            failure (invalid JSON, missing key, untrained model) the
            traceback is printed and the scores collected so far, usually
            an empty dict, are returned.
        """
        # Bound before the try so the final return cannot hit an
        # UnboundLocalError when parsing fails.
        result = {}
        try:
            query = json.loads(record)['complain']
            test_data = [query]
            raw_X = (self.token_ques(text) for text in test_data)
            # FeatureHasher is stateless, so transform() is all that is needed.
            X_test = self.hasher.transform(raw_X)
            self.categories = self.data_train.target_names
            for probabilities in self.clf.predict_proba(X_test):
                ranked = sorted(enumerate(probabilities),
                                key=lambda pair: pair[1], reverse=True)
                for category_index, probability in ranked:
                    result[self.categories[category_index]] = probability * 100
        except Exception:
            import traceback
            print(traceback.format_exc())
        return result

    def testSingleRecord(self):
        """Interactively classify descriptions typed on stdin.

        Loops until stdin is exhausted (EOF), printing the category scores
        for each description entered.
        """
        try:
            read_line = raw_input  # Python 2
        except NameError:
            read_line = input
        while True:
            print("\n Enter description")
            try:
                description = read_line()
            except EOFError:
                return
            # The scores used to be computed and then discarded, so the
            # prompt never showed any answer.
            print(self.testClassifier(json.dumps({'complain': description})))
class ArticleClassification(object):
    """SVM classifier that scores an article text against trained categories.

    Constructing an instance immediately calls :meth:`trainClassifier`, which
    loads the corpus under ``<cwd>/training_data`` and then either reuses the
    model pickled in ``model.pickle`` or fits and pickles a new one.
    """

    # Corpus directory, joined onto the working directory and given to load_files.
    TRAINING_DIR = "training_data"
    # Pickle file (relative to the working directory) caching the fitted SVC.
    MODEL_PATH = "model.pickle"
    # Tab-separated dump read by readAndCall().
    BATCH_FILE = "June30_data.txt"

    def __init__(self):
        """Remember the working directory and load or train the model."""
        self.root_dir = os.getcwd()
        self.trainClassifier()

    def fetch_data(self, cache, data_home=None, subset='train',
                   categories=None, shuffle=True, random_state=42):
        """Return ``cache[subset]``, optionally shuffled in place.

        Args:
            cache: dict mapping ``'train'`` / ``'test'`` to a dataset object
                exposing ``data``, ``target`` and ``filenames``.
            data_home: accepted for signature compatibility; unused.
            subset: ``'train'`` or ``'test'``.
            categories: accepted for signature compatibility; unused.
            shuffle: when true, permute ``data``, ``target`` and
                ``filenames`` of the dataset consistently (the dataset
                object itself is mutated).
            random_state: seed handed to ``check_random_state`` so the
                permutation is reproducible.

        Returns:
            The dataset object stored under ``cache[subset]``.

        Raises:
            ValueError: if ``subset`` is neither ``'train'`` nor ``'test'``.
            KeyError: if ``subset`` is valid but absent from ``cache``.
        """
        if subset not in ('train', 'test'):
            # The previous message advertised an 'all' subset that was never
            # actually accepted.
            raise ValueError(
                "subset can only be 'train' or 'test', got '%s'" % subset)
        data = cache[subset]

        if shuffle:
            rng = check_random_state(random_state)
            indices = np.arange(data.target.shape[0])
            rng.shuffle(indices)
            data.filenames = data.filenames[indices]
            data.target = data.target[indices]
            # Use an object array to shuffle: avoids memory copy
            data_lst = np.array(data.data, dtype=object)
            data.data = data_lst[indices].tolist()
        return data

    def token_ques(self, text):
        """Yield lower-cased word tokens of ``text``, one line at a time.

        Each line is lower-cased, stripped of surrounding spaces and split
        with ``word_tokenize``.

        NOTE(review): the original docstring claimed stop words were removed,
        but the code never filtered them; it only built the stop-word list
        and ran an unused POS tagging pass. That dead work is gone and the
        emitted tokens are unchanged, so previously pickled models stay
        consistent with the features produced here.

        Args:
            text: the text to tokenize.

        Yields:
            Lower-cased tokens in document order.
        """
        for line in text.split('\n'):
            cleaned = line.lower().strip(" ")
            for word in word_tokenize(cleaned):
                yield word.lower()

    def _load_model(self):
        """Return the classifier unpickled from MODEL_PATH, or None on failure."""
        try:
            # NOTE: unpickling executes arbitrary code; MODEL_PATH must only
            # ever contain a file written by this class.
            with open(self.MODEL_PATH, "rb") as model_file:
                return pickle.load(model_file)
        except Exception:
            import traceback
            print(traceback.format_exc())
            return None

    def _fit_and_save(self):
        """Fit ``self.clf`` on the hashed training corpus and pickle it."""
        documents = [raw.decode('utf-8', 'ignore')
                     for raw in self.data_train.data]
        raw_X = (self.token_ques(doc) for doc in documents)
        X_train = self.hasher.fit_transform(raw_X)
        self.clf.fit(X_train, self.data_train.target)
        with open(self.MODEL_PATH, 'wb') as model_file:
            pickle.dump(self.clf, model_file)

    def trainClassifier(self):
        """Load the training corpus, then load or fit the SVM model.

        Sets ``self.hasher``, ``self.clf`` and ``self.data_train``. Any
        failure is reported by printing the traceback rather than raised, so
        the instance may be left partially initialised.
        """
        try:
            started = time()
            # NOTE(review): `non_negative` is only accepted by older
            # scikit-learn releases - confirm against the pinned version.
            self.hasher = FeatureHasher(input_type='string', non_negative=True)
            self.clf = SVC(probability=True, C=5., gamma=0.001)
            data_folder = os.path.join(self.root_dir, self.TRAINING_DIR)
            train_dataset = load_files(data_folder)
            # Formatted explicitly: passing two arguments to print() shows a
            # tuple repr under Python 2.
            print("Time taken to load the data=> %s" % (time() - started))
            cache = dict(train=train_dataset)
            self.data_train = self.fetch_data(cache, subset='train')

            print("Loading pickle")
            saved_model = self._load_model()
            if saved_model is not None:
                self.clf = saved_model
                print("trained and ready ...")
            else:
                print("Generating pickles")
                self._fit_and_save()
                print("Training ended")
            print("Classifier trained ...")
            print("time taken=> %s" % (time() - started))
        except Exception:
            import traceback
            print(traceback.format_exc())

    def testClassifier(self, record):
        """Score one text against every known category and print the scores.

        Args:
            record: JSON string of an object with a ``'complain'`` key
                holding the text to classify.

        Returns:
            dict mapping category name to probability in percent (the same
            dict that is printed). On any failure the traceback is printed
            and the scores collected so far, usually an empty dict, are
            returned. Earlier versions returned nothing.
        """
        # Bound before the try so the print/return below cannot hit an
        # UnboundLocalError when parsing fails.
        result = {}
        try:
            query = json.loads(record)['complain']
            test_data = [query]
            raw_X = (self.token_ques(text) for text in test_data)
            # FeatureHasher is stateless, so transform() is all that is needed.
            X_test = self.hasher.transform(raw_X)
            self.categories = self.data_train.target_names
            for probabilities in self.clf.predict_proba(X_test):
                ranked = sorted(enumerate(probabilities),
                                key=lambda pair: pair[1], reverse=True)
                for category_index, probability in ranked:
                    result[self.categories[category_index]] = probability * 100
        except Exception:
            import traceback
            print(traceback.format_exc())
        print(result)
        print("process ends here")
        return result

    def readAndCall(self):
        """Classify every record of BATCH_FILE and print summary counts.

        Each usable line has the form ``<title>\\t(<description>``. The
        description is classified and the record is counted as suspected
        when the ``'suspected'`` score exceeds the ``'unsuspected'`` score.
        A suspected record whose title has a fuzzy ratio above 80 with an
        earlier suspected title is also counted as a duplicate.

        NOTE(review): this method previously could not run. It sent
        ``title``/``description`` keys although testClassifier reads
        ``'complain'``, and it JSON-decoded a return value that was None.
        It now sends the description as ``'complain'`` and consumes the
        returned score dict directly. Confirm this matches the intended
        workflow and that the training categories are really named
        ``suspected`` / ``unsuspected``.
        """
        with open(self.BATCH_FILE) as data_file:
            input_data = data_file.read().split("\n")

        total = 0
        sus_count = 0
        sus_count_dup = 0
        unsus_count = 0
        processed_title = []
        print("Started reading mongo data")
        for i, line in enumerate(input_data):
            mydata = line.split("\t(")
            if len(mydata) < 2:
                # Blank (e.g. trailing newline) or malformed line: nothing
                # to classify.
                continue
            print(i)
            title, description = mydata[0], mydata[1]
            scores = self.testClassifier(json.dumps({'complain': description}))
            total += 1
            # A failed classification yields no scores and is therefore
            # counted as unsuspected.
            if scores.get('suspected', 0) > scores.get('unsuspected', 0):
                print("suspected : %s" % sus_count)
                sus_count += 1
                for seen_title in processed_title:
                    if fuzz.ratio(title, seen_title) > 80:
                        sus_count_dup += 1
                        print("duplicate :  %s" % title)
                        break
                processed_title.append(title)
            else:
                unsus_count += 1

        print("Processed title :  %s" % (processed_title,))
        print("Result for June 30")
        print("Total : %s" % total)
        print("Suspected :  %s" % sus_count)
        print("duplicates :  %s" % sus_count_dup)
        print("Unsuspected :  %s" % unsus_count)

    def testSingleRecord(self):
        """Interactively classify descriptions typed on stdin.

        Loops until stdin is exhausted (EOF); testClassifier prints the
        scores for each description entered.
        """
        try:
            read_line = raw_input  # Python 2
        except NameError:
            read_line = input
        while True:
            print("\n Enter description")
            try:
                description = read_line()
            except EOFError:
                return
            self.testClassifier(json.dumps({'complain': description}))