def train_sentiment_capture(stopwords, save=False):
    """
    STEP#1 :: Cluster topics with unsupervised classification
        X1 text ---> [@cluster] ----> (y1 group, X1 text)
    STEP#2 :: Combine topic, tags, and group to make the feature vector
        X2 <-- [tags, y1, X1]
        Y2 <-- Sentiment score
    STEP#3 :: Train the classifier
        (Y2, X2) -----> [@classification] ----> @model
    """
    print(colored('==============================', 'cyan'))
    print(colored(' SENTIMENT TRAINING', 'cyan'))
    print()
    print(colored(' DIM : {0}'.format(args['dim']), 'cyan'))
    print(colored(' K : {0}'.format(args['kcluster']), 'cyan'))
    print(colored(' TAG : {0}'.format(args['tagdim']), 'cyan'))
    print(colored('==============================', 'cyan'))

    # STEP#1
    # ------------------------------------
    # Vectorise the input topics (text only)
    mqx1 = rabbit.create('localhost', 'pantip-x1')
    topicHasher = texthasher.safe_load(TEXT_VECTORIZER_PATH,
                                       n_components=args['dim'],
                                       stop_words=stopwords,
                                       decomposition='SVD')
    hashMe = texthasher.hash(topicHasher, learn=True)

    print(colored('#STEP-1 started ...', 'cyan'))
    print('hasher : {0}'.format(topicHasher))

    iterX = DP.pipe(rabbit.iter(mqx1, take_x1),
                    dests=None,
                    transform=hashMe,
                    title='Vectorisation')
    rabbit.end(mqx1)
    vecX = [x for x in iterX]

    # Cluster the vectorised records with an unsupervised classifier
    contentClf = textcluster.safe_load(CONTENT_CLUSTER_PATH,
                                       n_labels=args['kcluster'])
    clusterMe = textcluster.classify(contentClf, learn=True)

    # The classifier doesn't accept a generator,
    # so we need to roll the matrix out of the MQ first.
    clusters = DP.pipe([x for x in vecX],
                       dests=None,
                       transform=clusterMe,
                       title='Clustering')

    print(colored('#STEP-1 finished ...', 'cyan'))

    # STEP#2
    # ---------------------------------------------
    # Vectorise tags:
    # convert tags into a numeric vector
    tagHasher = taghasher.safe_load(TAG_HASHER_PATH,
                                    n_feature=args['tagdim'])
    mqx2 = rabbit.create('localhost', 'pantip-x2')
    hashtagMe = taghasher.hash(tagHasher, learn=True)
    vectags = DP.pipe([tag for tag in rabbit.iter(mqx2, take_tags)],
                      dests=None,
                      transform=hashtagMe,
                      title='Tag Vectorising')
    rabbit.end(mqx2)

    # STEP#3
    # ----------------------------------------
    # Join the components together
    # to assemble a training vector
    mqy = rabbit.create('localhost', 'pantip-x3')
    Y = [y for y in rabbit.iter(mqy, take_sentiment_score)]
    XS = zip(list(vectags),
             [[i] for i in clusters],  # Make the scalar a single-element vector
             list(vecX))
    X = [list(a) + list(b) + list(c) for a, b, c in XS]
    rabbit.end(mqy)

    # Train!
    print(colored('Training process started...', 'cyan'))
    clf = cluster.safe_load(CLF_PATH)
    trainMe = cluster.analyze(clf, labels=Y)
    Y_ = trainMe(X)
    print(colored('[DONE]', 'yellow'))

    # Self-validation
    num_correct = len([1 for y, y0 in zip(Y_, Y) if y == y0])
    predict_rate = 100 * float(num_correct) / float(len(Y))

    print(colored('====== TRAINING LABELS =====', 'magenta'))
    print(Y)
    print(colored('========= PREDICTED ========', 'magenta'))
    print(list(Y_))
    print(colored('=========== RESULTS ========', 'magenta'))
    print(' overall accuracy: {0:.2f} %'.format(predict_rate))

    # Report accuracy for each of the labels
    labels = list(set(Y_))
    lbl_predict_rate = []
    for lbl in labels:
        samples = [(y, y0) for y, y0 in zip(Y_, Y) if y0 == lbl]
        num_correct = len([1 for y, y0 in samples if y == y0])
        num_all = len(samples)
        accuracy = 100 * float(num_correct) / float(num_all)
        print(' accuracy class #{0} : {1:.2f} % (out of {2} cases)'.format(
            lbl, accuracy, num_all))
        lbl_predict_rate.append('{0:.2f}'.format(accuracy).center(7))

    # Record the training accuracy to the CSV report
    with open(CSV_REPORT_PATH, 'a') as csv:
        csv.write('{0},{1},{2},{3},{4}\n'.format(
            str(args['dim']).center(4),                # 0
            str(args['kcluster']).center(3),           # 1
            str(args['tagdim']).center(5),             # 2
            '{0:.2f}'.format(predict_rate).center(7),  # 3
            ','.join(lbl_predict_rate)                 # 4
        ))

    # Save the trained models
    if save:
        print(colored('Saving models...', 'cyan'))
        texthasher.save(topicHasher, TEXT_VECTORIZER_PATH)
        textcluster.save(contentClf, CONTENT_CLUSTER_PATH)
        taghasher.save(tagHasher, TAG_HASHER_PATH)
        cluster.save(clf, CLF_PATH)
        print(colored('[DONE]', 'green'))
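# ---------------------------------------------------------------------------
# NOTE: `texthasher` is a project-internal module whose source is not shown
# in this section. The sketch below is one plausible implementation of the
# `safe_load` / `hash` interface used above (TF-IDF weighting followed by an
# SVD decomposition, persisted with pickle). It is an assumption for
# illustration, not the original module: only the 'SVD' decomposition case
# is covered, and whether the returned transform receives single records or
# the whole stream depends on `DP.pipe`, which is also not shown.
# ---------------------------------------------------------------------------
import os
import pickle
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

def safe_load(path, n_components, stop_words=None, decomposition='SVD'):
    # Reuse a previously pickled hasher if one exists on disk ...
    if os.path.isfile(path):
        with open(path, 'rb') as f:
            return pickle.load(f)
    # ... otherwise build a fresh TF-IDF + SVD pipeline
    return Pipeline([
        ('tfidf', TfidfVectorizer(stop_words=stop_words)),
        ('svd', TruncatedSVD(n_components=n_components))])

def hash(hasher, learn=False):
    # Return a transform closure, mirroring how `hashMe` is used above
    def transform(texts):
        return hasher.fit_transform(texts) if learn else hasher.transform(texts)
    return transform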
"""
Source MQ requeue task
@starcolon projects
"""

from pypipe import pipe as Pipe
from pypipe.operations import rabbit
import json

if __name__ == '__main__':
    qsrc = rabbit.create('localhost', 'pantip-x0')
    qdst = [rabbit.create('localhost', q)
            for q in ['pantip-x1', 'pantip-x2', 'pantip-x3', 'pantip-x00']]

    # Requeue!
    print('Requeuing ...')
    for m in rabbit.iter(qsrc):
        rabbit.feed(qdst)(m)

    # Bye all queues!
    rabbit.end_multiple(qdst)
    rabbit.end(qsrc)

    # Transfer from the temporary MQ#00 back to MQ#0
    q00 = rabbit.create('localhost', 'pantip-x00')
    q0 = rabbit.create('localhost', 'pantip-x0')
    for m in rabbit.iter(q00):
        rabbit.feed([q0])(m)

    # Bye all queues!
    rabbit.end_multiple([q0, q00])

    print('[DONE] All input queues are recycled.')
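# ---------------------------------------------------------------------------
# NOTE: `pypipe.operations.rabbit` is project-internal and not shown here.
# Below is a minimal sketch of the `create` / `iter` / `feed` / `end`
# helpers used above, written against the real `pika` client for RabbitMQ.
# The tuple shape of the queue handle and the drain-until-empty semantics
# of `iter` are assumptions for illustration, not the original code.
# ---------------------------------------------------------------------------
import pika

def create(host, qname):
    # Open a blocking connection and declare the named queue
    conn = pika.BlockingConnection(pika.ConnectionParameters(host))
    ch = conn.channel()
    ch.queue_declare(queue=qname)
    return (conn, ch, qname)

def feed(qs):
    # Return a closure that publishes one message to every destination queue
    def _feed(msg):
        for (_, ch, qname) in qs:
            ch.basic_publish(exchange='', routing_key=qname, body=msg)
    return _feed

def iter(mq, transform=None):
    # Drain the queue, yielding (optionally transformed) message bodies
    (_, ch, qname) = mq
    while True:
        method, _props, body = ch.basic_get(queue=qname, auto_ack=True)
        if method is None:
            break  # queue is empty
        yield transform(body) if transform else body

def end(mq):
    (conn, _, _) = mq
    conn.close()

def end_multiple(mqs):
    for mq in mqs:
        end(mq)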
    time.sleep(1)

    # These are the MQs we'll push preprocessed records to
    qs = ['pantip-x1', 'pantip-x2', 'pantip-x3', 'pantip-x0']
    mqs = [rabbit.create('localhost', q) for q in qs]

    # Prepare the processing pipeline (order matters)
    pipe = Pipe.new('preprocess', [])
    Pipe.push(pipe, preprocess.take)
    Pipe.push(pipe, rabbit.feed(mqs))
    Pipe.push(pipe, wordbag.feed(bag))
    Pipe.then(pipe, lambda out: print(colored('[DONE!]', 'cyan')))

    # Iterate through the records and process each one
    couch.each_do(db, process_with(pipe), limit=40000)

    # Disconnect from the MQs
    [rabbit.end(mq) for mq in mqs]

    # Wait for the background services,
    # then kill 'em
    terminate_background_services(workers)

    # Report the collected word bag
    print(colored('[Word bag]', 'green'))
    words = sorted(bag.items(), key=lambda b: -b[1])[:50]
    pprint(words)

    # Print the most recurring words to a file
    with open(WORD_BAG_DIR, 'w+') as txt:
        txt.writelines([w[0] + "\n" for w in words])
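# ---------------------------------------------------------------------------
# NOTE: `wordbag` is project-internal and not shown here. A minimal sketch of
# a `feed(bag)` pipeline step consistent with its use above: it counts the
# tokens of each record into the shared `bag` dict and passes the record
# through unchanged. Both the record being plain text and the whitespace
# tokenisation are assumptions for illustration.
# ---------------------------------------------------------------------------
def feed(bag):
    def _feed(record):
        for word in record.split():
            bag[word] = bag.get(word, 0) + 1
        return record  # hand the record on to the next pipeline step
    return _feed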
def train_sentiment_capture(stopwords, save=False):
    print(colored('==============================', 'cyan'))
    print(colored(' SENTIMENT TRAINING', 'cyan'))
    print()
    print(colored(' DECOMPOSITION : {0} => {1} components'.format(
        args['decom'], args['n']), 'cyan'))
    print(colored(' DIMENSION OF FEATURE : {0}'.format(args['feat']), 'cyan'))
    print(colored(' MAX LENGTH OF TAG VECTOR : {0}'.format(args['tagdim']),
                  'cyan'))
    print(colored('==============================', 'cyan'))

    # STEP#1 : [text] => [numeric vectors]
    # ------------------------------------
    # Vectorise the input topics (text only)
    mqx1 = rabbit.create('localhost', 'pantip-x1')
    topicHasher = texthasher.safe_load(TEXT_VECTORIZER_PATH,
                                       stop_words=stopwords,
                                       decomposition=args['decom'],
                                       n_components=args['n'])
    hashMe = texthasher.hash(topicHasher, learn=True)

    print(colored('#STEP-1 started ...', 'cyan'))
    print('hasher : {0}'.format(topicHasher))

    iterX = DP.pipe(rabbit.iter(mqx1, take_x1),
                    dests=None,
                    transform=hashMe,
                    title='Vectorisation')
    rabbit.end(mqx1)
    vecX = [x for x in iterX]

    print(colored('#STEP-1 finished ...', 'cyan'))

    # STEP#2 : [tags] => [numeric vectors]
    # ---------------------------------------------
    # Vectorise tags:
    # convert tags into a numeric vector
    tagHasher = taghasher.safe_load(TAG_HASHER_PATH,
                                    n_feature=args['tagdim'])
    mqx2 = rabbit.create('localhost', 'pantip-x2')
    hashtagMe = taghasher.hash(tagHasher, learn=True)
    vectags = DP.pipe([tag for tag in rabbit.iter(mqx2, take_tags)],
                      dests=None,
                      transform=hashtagMe,
                      title='Tag Vectorising')
    rabbit.end(mqx2)

    # STEP#3 : [X] = [vectorised text] : [vectorised tags]
    # ----------------------------------------
    # Join the components together
    # to assemble a training vector
    mqy = rabbit.create('localhost', 'pantip-x3')
    Y = [y for y in rabbit.iter(mqy, take_sentiment_score)]
    XS = zip(list(vectags), list(vecX))
    X = [list(a) + list(b) for a, b in XS]
    rabbit.end(mqy)

    # Train!
    print(colored('Training process started...', 'cyan'))
    clf = cluster.safe_load(CLF_PATH, args['cluster'], args['feat'])
    trainMe = cluster.analyze(clf, labels=Y)
    (Yact, Ypred) = trainMe(X, test_ratio=0.33)
    print(colored('[DONE]', 'yellow'))

    # Cross-validation
    num_correct_all = 0

    # Report accuracy for each of the labels
    labels = list(set(Yact))
    lbl_predict_rate = []
    for lbl in labels:
        samples = [(y, y0) for y, y0 in zip(Ypred, Yact) if y0 == lbl]
        num_correct = len([1 for y, y0 in samples if y == y0])
        num_all = len(samples)
        accuracy = 100 * float(num_correct) / float(num_all)
        num_correct_all += num_correct
        print(' accuracy class #{0} : {1:.2f} % (out of {2} cases)'.format(
            lbl, accuracy, num_all))
        lbl_predict_rate.append('{0:.2f}'.format(accuracy).center(7))

    # Report the overall performance
    predict_rate = 100 * float(num_correct_all) / float(len(Yact))
    print(colored('=========== CV PERFORMANCE ========', 'magenta'))
    print(' overall accuracy: {0:.2f} %'.format(predict_rate))

    # Record the training accuracy to the CSV report
    with open(CSV_REPORT_PATH, 'a') as csv:
        csv.write('{0},{1},{2},{3},{4},{5},{6}\n'.format(
            str(args['cluster']).center(11),           # 0
            str(args['decom']).center(7),              # 1
            str(args['n']).center(5),                  # 2
            str(args['feat']).center(5),               # 3
            str(args['tagdim']).center(5),             # 4
            '{0:.2f}'.format(predict_rate).center(7),  # 5
            ','.join(lbl_predict_rate)                 # 6
        ))

    # Save the trained models
    if save:
        print(colored('Saving models...', 'cyan'))
        taghasher.save(tagHasher, TAG_HASHER_PATH)
        cluster.save(clf, CLF_PATH)
        texthasher.save(topicHasher, TEXT_VECTORIZER_PATH)
        print(colored('[DONE]', 'green'))
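# ---------------------------------------------------------------------------
# NOTE: `cluster.analyze` is project-internal and not shown here. The sketch
# below shows one way the `trainMe(X, test_ratio=0.33)` call above could
# work: hold out a test split, fit the classifier on the remainder, and
# return the held-out labels together with the predictions so the caller
# can score per-class accuracy. The use of scikit-learn's train_test_split
# is an assumption, not the original code.
# ---------------------------------------------------------------------------
from sklearn.model_selection import train_test_split

def analyze(clf, labels):
    def train(X, test_ratio=0.33):
        X_train, X_test, y_train, y_test = train_test_split(
            X, labels, test_size=test_ratio)
        clf.fit(X_train, y_train)              # learn from the training split
        return (y_test, clf.predict(X_test))   # (actual, predicted) pairs
    return train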