def splitfilebycategory(filename, combined_results):
    '''
    Split a tweet file into one list per classifier category.

    Inputs:
        filename: file to split (one JSON tweet per line, cp1252-encoded)
        combined_results: output from performClassification in Jobs_Classifier
            {
                'apple': [0,0,1,0...],
                'twitter': [0,0,0,...],
            }
    Returns:
        {
            'apple': [json_tweet, json_tweet],
            ...
            NO_CATEGORY: [json_tweet, ...]
        }
    '''
    buckets = {NO_CATEGORY: []}
    # One bucket per trained classifier.
    category_names = []
    for svm in SVMStatesClassifier.objects.all():
        category_names.append(svm.classifier_name)
        buckets[svm.classifier_name] = []

    with codecs.open(filename, encoding='cp1252') as f:
        for idx, line in enumerate(f):
            # A line may land in several category buckets (multi-label).
            for category in category_names:
                if combined_results[category][idx] == POSITIVE:
                    buckets[category].append(line)
            # NOTE(review): reconstructed placement — every line also goes
            # into NO_CATEGORY, matching getKeyInfoForSA's convention of
            # NO_CATEGORY holding the full data set; confirm against callers.
            buckets[NO_CATEGORY].append(line)

    for cat, val in buckets.iteritems():
        debugPrint(cat, len(val))
    return buckets
def selectFeatureAndNegSamples(keyinfo, feat_type, feat_size, neg_sample_by_feat_size, debug_category=''):
    '''
    High level function to select features of type and select negative
    samples using the selected features.

    Params:
        keyinfo: output from getKeyInfoForClassifier
        feat_type: name of feature, from CS4242_Assg2.constants
        feat_size: number of features to select
        neg_sample_by_feat_size: number of negative samples to select
        debug_category (optional): category name for writing debug files
    Output:
        tuple
        [0]: [] of selected features
        [1]: [] of selected negative samples
    '''
    pos_tweets = keyinfo[POSITIVE][PROCESSED_TWEETS]
    neg_tweets = keyinfo[NEGATIVE][PROCESSED_TWEETS]

    # Defaults when this feature type is not in use.
    chosen_features = []
    chosen_neg_tweets = []

    feat_type_in_use = (feat_type in keyinfo[UNIQUE_FEATURES]
                        and feat_type in keyinfo[POSITIVE][FEATURES]
                        and feat_type in keyinfo[NEGATIVE][FEATURES])
    if not feat_type_in_use:
        debugPrint("%s not in use" % feat_type)
        return (chosen_features, chosen_neg_tweets)

    # Rank features by chi-square statistic and keep the top feat_size.
    chi2_feat = selectFeatureByChi2(keyinfo[UNIQUE_FEATURES][feat_type],
                                    keyinfo[POSITIVE][FEATURES][feat_type],
                                    keyinfo[NEGATIVE][FEATURES][feat_type],
                                    len(pos_tweets), len(neg_tweets),
                                    feat_size)
    writeDebugListToFile(
        "%s_%s_chi2_sel_feat.txt" % (debug_category, feat_type), chi2_feat)

    # chi2_feat entries are (feature, score) pairs; keep the feature only.
    chosen_features = [pair[0] for pair in chi2_feat]
    writeDebugListToFile(
        "%s_%s_chi2_sel_feat_only.txt" % (debug_category, feat_type),
        chosen_features)
    debugPrint("%s feature count (intial): %s" %
               (feat_type, len(keyinfo[POSITIVE][FEATURES][feat_type])))
    debugPrint("%s feature count (selected): %s" %
               (feat_type, len(chosen_features)))

    # Pick negative samples that actually contain the selected features.
    chosen_neg_tweets = selectTweetIfFeatureExists(neg_tweets,
                                                   neg_sample_by_feat_size,
                                                   chosen_features, feat_type)
    debugPrint("%s selected neg tweet count: %s" %
               (feat_type, len(chosen_neg_tweets)))
    return (chosen_features, chosen_neg_tweets)
def performSA(test_data_list, category=None):
    '''
    Performs Sentimental Analysis on a given list of json tweets.

    Params:
        test_data_list: list of raw JSON tweet lines (cp1252-encoded)
        category: category in which input data belongs to. None if
            uncategorized (classify with every trained SVM).
    Returns:
        {
            'twitter': ([{TWEET_ID: id, TWEET_FULL: 'lalala',
                          FEATURE_CREATED_AT: 'Sun Oct 16 22:28:08 +0000 2011',
                          TWEET_USER_ID: 14883342}, ...],
                        [0,1,2,...]),
            'microsoft': (...)
        }
    '''
    tweet_features_list = []
    tweet_id_list = []
    svmstates = SVMStatesSentimental.objects.all()
    debugPrint(">> Extracting features")
    for line in test_data_list:
        json_data = json.loads(line, encoding='cp1252')
        # {'tweet': This was a triumph, 'features': {FEATURE_TEXT: __, 'geolocation': __}}
        featureline = extractSentiFeaturesFromTweet(json_data)
        tweet_features_list.append(featureline)
        tweet_id_list.append({
            TWEET_ID: json_data['id_str'],
            TWEET_FULL: json_data['text'],
            FEATURE_CREATED_AT: json_data['created_at'],
            TWEET_USER_ID: json_data['user']['id']
        })

    # Build a second feature list where a reply inherits (merges in) the
    # feature counts of the tweet it replies to. Deep copy so the plain
    # list stays untouched for SVMs that do not use the replies feature.
    tweet_features_list_replyconcat = copy.deepcopy(tweet_features_list)
    for featureline in tweet_features_list_replyconcat:
        if featureline[TWEET_FEATURES][FEATURE_SA_REPLY_TO_ID] != "":
            for fline2 in tweet_features_list_replyconcat:
                if fline2[TWEET_FEATURES][
                        FEATURE_SA_TWEETID_STR] == featureline[
                            TWEET_FEATURES][FEATURE_SA_REPLY_TO_ID]:
                    # ids match: featureline is a reply to fline2 -- merge
                    # fline2's countable features into featureline.
                    for key, value in fline2[TWEET_FEATURES].iteritems():
                        if key != FEATURE_SA_TWEETID_STR and key != FEATURE_SA_REPLY_TO_ID and key != FEATURE_SA_CAPS_PERCENTAGE:
                            for key2, value2 in value[FEATURE_VALUE].iteritems():
                                if key2 not in featureline[TWEET_FEATURES][
                                        key][FEATURE_VALUE]:
                                    featureline[TWEET_FEATURES][key][
                                        FEATURE_VALUE][key2] = 0
                                featureline[TWEET_FEATURES][key][
                                    FEATURE_VALUE][key2] += value2

    debugPrint(">> Classifying with SVM")
    combined_results = {}
    for svm in svmstates:
        if category is None or category == svm.classifier_name:
            featurematrix_classifier = svm.featurematrix
            features_enabled = svm.features_enabled
            debugPrint("Classifying for %s" %
                       featurematrix_classifier.category)
            if FEATURE_SA_REPLIES in features_enabled:
                svm_matrix = getSVMMatrixForSA(
                    featurematrix_classifier, features_enabled,
                    tweet_features_list_replyconcat)
            else:
                svm_matrix = getSVMMatrixForSA(featurematrix_classifier,
                                               features_enabled,
                                               tweet_features_list)
            debugPrint("Perform SVM Classification for %s" %
                       svm.classifier_name)
            reslist = performSVMClassificationForSA(svm, svm_matrix)

            # Temporal info: a user's recent sentiment is assumed to persist,
            # so a tweet close in time to the user's previous one inherits
            # that previous sentiment.
            if FEATURE_SA_TEMPORAL in features_enabled:
                ti_dict = {}
                for idx, res in enumerate(reslist):
                    user = tweet_id_list[idx][TWEET_USER_ID]
                    created_at = tweet_id_list[idx][FEATURE_CREATED_AT]
                    tweet_time = dateutil.parser.parse(created_at)
                    if user in ti_dict:
                        # BUGFIX: the constant is in minutes, so use
                        # minutes= (was hours=TEMPORAL_INFO_TIMEFRAME_MINS).
                        window = datetime.timedelta(
                            minutes=TEMPORAL_INFO_TIMEFRAME_MINS)
                        start = ti_dict[user]['last_tweet_time'] - window
                        end = ti_dict[user]['last_tweet_time'] + window
                        # BUGFIX: compare the CURRENT tweet's time against
                        # the window (the old code compared last_tweet_time
                        # against a window centered on itself, which was
                        # always true).
                        if start <= tweet_time <= end:
                            reslist[idx] = ti_dict[user]['sentiment']
                        ti_dict[user]['last_tweet_time'] = tweet_time
                    else:
                        # First tweet seen for this user: remember it.
                        ti_dict[user] = {
                            'sentiment': res,
                            'last_tweet_time': tweet_time
                        }
            combined_results[svm.classifier_name] = (tweet_id_list, reslist)
    # incomplete
    return combined_results
def performTrainingForSA(data_filename, label_filename, features_used=FEATURES_SA_DEFAULT, job_id=None):
    '''
    Train one sentiment-analysis SVM per category.

    Params:
        data_filename: training data file (one JSON tweet per line)
        label_filename: ground-truth label file for the training data
        features_used: feature types to extract, defaults FEATURES_SA_DEFAULT
        job_id: optional JobStatusSA id for progress reporting
    Side effects:
        persists one SVM per category via createSVMForSA; writes debug files;
        appends tracebacks to svmstates/errlog.txt on failure.
    '''
    # extract & preprocess features
    try:
        debugPrint("feature extraction and preprocessing...")
        if job_id != None:
            # Force Django to open a fresh DB connection in this worker.
            connection.close()
            jobstatus = JobStatusSA.objects.get(id=job_id)

        # BUGFIX: use the caller-supplied filenames; the old code ignored
        # both parameters and always read the hard-coded training paths
        # (PATH_GROUNDTRUTH_TRAINING / PATH_TRAINING_DATA).
        gen = parseLabelFile(label_filename)
        categories_list = gen['categories']
        groundtruth_list = gen['groundtruth_list']
        all_keyinfo = getKeyInfoForSA(data_filename, categories_list,
                                      groundtruth_list, features_used)

        for category, keyinfo in all_keyinfo.iteritems():
            debugPrint("training category: %s" % category)
            if job_id != None:
                updateJobStatus(jobstatus, "Training Category: %s" % category)
            pos_tweets = keyinfo[CLASS_SVM_POSITIVE][PROCESSED_TWEETS]
            neg_tweets = keyinfo[CLASS_SVM_NEGATIVE][PROCESSED_TWEETS]
            neu_tweets = keyinfo[CLASS_SVM_NEUTRAL][PROCESSED_TWEETS]

            # feature selection
            debugPrint(">> feature selection")
            # create feature matrix for each tweet
            debugPrint(">> get feature matrix")
            training_tweets = {
                CLASS_SVM_POSITIVE: pos_tweets,
                CLASS_SVM_NEGATIVE: neg_tweets,
                CLASS_SVM_NEUTRAL: neu_tweets
            }
            selected_feat_tweets = selectFeaturesForTraining(
                keyinfo, features_used)
            selected_feat = selected_feat_tweets[0]
            writeDebugListToFile("%s_sa_selected_feat.txt" % category,
                                 selected_feat)
            writeDebugListToFile("%s_sa_pos_tweets.txt" % category,
                                 pos_tweets)
            writeDebugListToFile("%s_sa_neg_tweets.txt" % category,
                                 neg_tweets)
            writeDebugListToFile("%s_sa_neu_tweets.txt" % category,
                                 neu_tweets)
            feature_matrix = getFeatureMatrixForSA(category, training_tweets,
                                                   selected_feat,
                                                   features_used)
            debugPrint("feature count: %s" %
                       len(feature_matrix.feature_to_id_map))
            writeDebugCountDictToFile("%s_sa_feature_to_id_map.txt" % category,
                                      feature_matrix.feature_to_id_map)
            writeDebugListToFile(
                "%s_sa_tweet_feature_matrix_list.txt" % category,
                feature_matrix.tweet_feature_matrix_list)

            # create svm matrix
            debugPrint(">> create svm matrix")
            if job_id != None:
                updateJobStatus(
                    jobstatus,
                    "Creating SVM Matrix for category %s" % (category))
            svm_matrix = getSVMMatrixForSA(feature_matrix, features_used)
            createSVMForSA(category, feature_matrix, svm_matrix,
                           features_used)
            writeDebugListToFile("%s_sa_svm_matrix_X.txt" % category,
                                 svm_matrix[SVM_X])
            writeDebugListToFile("%s_sa_svm_matrix_Y.txt" % category,
                                 svm_matrix[SVM_Y])
            debugPrint("training completed for category: %s" % category)
            if job_id != None:
                updateJobStatus(
                    jobstatus,
                    "Training completed for category: %s" % (category))
        if job_id != None:
            updateJobStatus(jobstatus, "Completed!")
    except Exception:
        # Narrowed from a bare except (which also swallowed SystemExit /
        # KeyboardInterrupt); log the traceback and close the file handle.
        with open("%s/svmstates/errlog.txt" % (BASE_DIR), "a") as errlog:
            traceback.print_exc(file=errlog)
def selectFeaturesForTraining(keyinfo, feat_used):
    '''
    Collect the features to train the sentiment SVM with.

    Params:
        keyinfo: one category's entry from getKeyInfoForSA output
        feat_used: list of feature type names to consider
    Returns:
        tuple
        [0]: [] of selected features (currently ALL unique features of each
             usable type -- chi-square selection is disabled)
        [1]: dict of per-class training tweets; tweet sub-sampling is
             disabled, so every class maps to an empty list
    '''
    all_features = keyinfo[UNIQUE_FEATURES]
    selected_feat = []
    # Tweet selection is disabled (see history: selectTweetIfFeatureExistsSA
    # was previously used here), so the per-class lists stay empty.
    training_tweets = {
        CLASS_SVM_POSITIVE: [],
        CLASS_SVM_NEUTRAL: [],
        CLASS_SVM_NEGATIVE: []
    }

    # CLEANUP: removed dead computations (sample_size_set, tweet_set,
    # feature_set, size/max_size) that only fed the disabled selection code;
    # feature_set could also raise KeyError for a feature type absent from a
    # class's FEATURES even though the guard below would have skipped it.
    for feat_type in feat_used:
        # TODO: whitelist -- these two are flags, not extractable features.
        if feat_type == FEATURE_SA_REPLIES or feat_type == FEATURE_SA_TEMPORAL:
            continue
        if feat_type in all_features:
            # Feature selection disabled: keep every unique feature of
            # this type.
            sel_feat = all_features[feat_type]
            selected_feat += sel_feat
            debugPrint("%s sa feature count: %s" %
                       (feat_type, len(sel_feat)))
    return selected_feat, training_tweets
def getKeyInfoForSA(data_filename, categories_list, groundtruth_list, features_used):
    '''
    Extract per-category, per-sentiment-class key info for SA training.

    Params:
        data_filename: input data file in json format
        categories_list: [cat1, cat2, cat3] -- NOTE: mutated in place,
            NO_CATEGORY is appended
        groundtruth_list: [{CATEGORY: category, POLARITY: polarity,
                            TWEET_ID: tweetid}] aligned line-by-line with
            the data file
        features_used: feature type names to extract
    Returns:
        {
            'category': {
                CLASS_SVM_POSITIVE: {
                    PROCESSED_TWEETS: [{
                        TWEET_FULL: This was a triumph,
                        TWEET_FEATURES: {
                            FEATURE_TEXT: {FEATURE_COUNT: number,
                                           FEATURE_VALUE: {feature: count}},
                        }
                    }],
                    FEATURES: {FEATURE_TEXT: {feature: df}, ...}
                },
                CLASS_SVM_NEGATIVE: {... same shape ...},
                CLASS_SVM_NEUTRAL: {... same shape ...},
                UNIQUE_FEATURES: {FEATURE_TEXT: {feature: df}, ...}
            }
        }
        NO_CATEGORY holds the union over all categories.
    '''
    sentiment_classes = (CLASS_SVM_POSITIVE, CLASS_SVM_NEGATIVE,
                        CLASS_SVM_NEUTRAL)
    polarity_to_class = {
        POLARITY_POSITIVE: CLASS_SVM_POSITIVE,
        POLARITY_NEGATIVE: CLASS_SVM_NEGATIVE,
        POLARITY_NEUTRAL: CLASS_SVM_NEUTRAL,
    }

    # Initialize one empty bucket per category x sentiment class.
    # NOTE: appending NO_CATEGORY mutates the caller's list on purpose.
    categories_list.append(NO_CATEGORY)
    returnmap = {}
    for category in categories_list:
        returnmap[category] = {}
        for svm_class in sentiment_classes:
            returnmap[category][svm_class] = {PROCESSED_TWEETS: []}

    with codecs.open(data_filename, encoding='cp1252') as k:
        debugPrint(">> extracting features from tweet")
        for idx, line in enumerate(k):
            # Extract all features from the tweet; ground truth is matched
            # positionally (line idx <-> groundtruth_list[idx]).
            json_data = json.loads(line, encoding='cp1252')
            tweet_keyinfo = extractSentiFeaturesFromTweet(
                json_data, features_used)
            gt_item = groundtruth_list[idx]
            svm_class = polarity_to_class.get(gt_item[POLARITY])
            if svm_class is not None:
                # File into both the global bucket and the tweet's category.
                returnmap[NO_CATEGORY][svm_class][
                    PROCESSED_TWEETS].append(tweet_keyinfo)
                returnmap[gt_item[CATEGORY]][svm_class][
                    PROCESSED_TWEETS].append(tweet_keyinfo)

    # Collate unique features per category; the same dict is threaded
    # through all three class calls so it accumulates the category-wide
    # document frequencies.
    debugPrint(">> collating unique features...")
    for category in categories_list:
        debugPrint(">> collating for %s" % category)
        unique_features_dict = {}
        for feature in features_used:
            # TODO: whitelist -- these two are flags, not extractable
            # feature types.
            if feature == FEATURE_SA_REPLIES or feature == FEATURE_SA_TEMPORAL:
                continue
            unique_features_dict[feature] = {}
        for svm_class in sentiment_classes:
            returnmap[category][svm_class][
                FEATURES] = getUniqueFeaturesForClass(
                    returnmap[category][svm_class][PROCESSED_TWEETS],
                    unique_features_dict, features_used)
        # resolve global unique features
        returnmap[category][UNIQUE_FEATURES] = unique_features_dict
    return returnmap
def performTraining(data_filename, label_filename, features_used=FEATURES_DEFAULT, job_id=None):
    '''
    Train one category-classification SVM per category.

    Params:
        data_filename: training data file (one JSON tweet per line)
        label_filename: ground-truth label file for the training data
        features_used: feature types to extract, defaults FEATURES_DEFAULT
        job_id: optional JobStatus id for progress reporting
    Side effects:
        persists one SVM per category via createSVM; writes debug files;
        appends tracebacks to svmstates/errlog.txt on failure.
    '''
    try:
        # extract & preprocess features
        debugPrint("feature extraction and preprocessing...")
        if job_id != None:
            # Force Django to open a fresh DB connection in this worker.
            connection.close()
            jobstatus = JobStatus.objects.get(id=job_id)
            updateJobStatus(jobstatus, "Acquiring Key info")
        gen = parseLabelFile(label_filename)
        categories_list = gen['categories']
        groundtruth_list = gen['groundtruth_list']
        all_keyinfo = getKeyInfoForClassifier(data_filename, categories_list,
                                              groundtruth_list,
                                              features_used)
        for category, keyinfo in all_keyinfo.iteritems():
            debugPrint("training category: %s" % category)
            if job_id != None:
                updateJobStatus(jobstatus,
                                "Training category: %s" % (category))
            pos_tweets = keyinfo[POSITIVE][PROCESSED_TWEETS]
            pos_sample_size = len(pos_tweets)
            neg_tweets = keyinfo[NEGATIVE][PROCESSED_TWEETS]
            neg_sample_size = len(neg_tweets)

            # feature selection
            debugPrint(">> feature selection")
            if job_id != None:
                updateJobStatus(
                    jobstatus,
                    "Feature selection on category: %s" % (category))
            select_results = selectTrainingFeaturesAndNegSamples(
                keyinfo, features_used, pos_sample_size, category)
            selected_feat = select_results[0]
            selected_neg_tweets = select_results[1]
            writeDebugListToFile("%s_selected_neg_tweets.txt" % category,
                                 selected_neg_tweets)
            writeDebugListToFile("%s_pos_tweets.txt" % category, pos_tweets)

            # create feature matrix for each tweet
            debugPrint(">> get feature matrix")
            training_tweets = {
                POSITIVE: pos_tweets,
                NEGATIVE: selected_neg_tweets
            }
            feature_matrix = getFeatureMatrix(category, training_tweets,
                                              selected_feat, features_used)
            writeDebugCountDictToFile("%s_feature_to_id_map.txt" % category,
                                      feature_matrix.feature_to_id_map)
            writeDebugListToFile("%s_tweet_feature_ids_list.txt" % category,
                                 feature_matrix.tweet_feature_ids_list)
            debugPrint('feature count: %s' %
                       len(feature_matrix.feature_to_id_map))
            debugPrint("positive tweets count: %s" % pos_sample_size)
            debugPrint("negative tweets count: %s" %
                       len(selected_neg_tweets))

            # create svm matrix
            debugPrint(">> create svm matrix")
            if job_id != None:
                updateJobStatus(
                    jobstatus,
                    "Creating SVM Matrix for category %s" % (category))
            svm_matrix = getSVMMatrixForClassification(feature_matrix)
            writeDebugListToFile("%s_svm_matrix_X.txt" % category,
                                 svm_matrix[SVM_X])
            writeDebugListToFile("%s_svm_matrix_Y.txt" % category,
                                 svm_matrix[SVM_Y])
            createSVM(category, feature_matrix, svm_matrix)
            debugPrint("training completed for category: %s" % category)
            if job_id != None:
                updateJobStatus(
                    jobstatus,
                    "Training completed for category: %s" % (category))
        if job_id != None:
            updateJobStatus(jobstatus, "Completed!")
    except Exception:
        # Narrowed from a bare except (which also swallowed SystemExit /
        # KeyboardInterrupt); log the traceback and close the file handle
        # (the old code leaked the open() handle).
        with open("%s/svmstates/errlog.txt" % (BASE_DIR), "a") as errlog:
            traceback.print_exc(file=errlog)