def Analyze_SubTopic_Scores(metadata, bidx, cl, outfile, **CVargs):
    '''
    Cross-validate the per-category subtopic features and write z-scores.

    For each distinct value in column 1 of the metadata rows (the header
    values 'category' and 'party' are skipped) the file
    'Twitter/Data/Twitter_<cat>_Topic_Scores.csv' is loaded, passed through
    balance() and Classifiers.CrossValidate, and the scores returned for
    each id are averaged and standardized within that category. The
    results of all categories are merged and handed to Write_Scores.

    Tuning notes kept from the original author: grid_search reveals little
    variation. Linear 100 or linear 10 seem best, but not a huge effect.
    Number of cases doesn't matter for MNB because most are small sample
    sizes (largest is 6,000).

    Args:
        metadata: mapping id -> row; row[0] is used as the class label and
            row[1] as the category name.
        bidx: forwarded unchanged to balance().
        cl: classifier object forwarded to Classifiers.CrossValidate.
        outfile: destination forwarded to Write_Scores.
        **CVargs: keyword arguments for Classifiers.CrossValidate. The
            'Student' and 'indUnk' categories ignore these and use fixed
            settings instead.

    Returns:
        None.
    '''
    print('SUBTOPIC ANALYSIS')
    path = 'Twitter/Data/'
    PREDS = {}
    for cat in set([line[1] for line in metadata.values()]):
        if cat == 'category' or cat == 'party':
            continue
        if cat == 'Student' or cat == 'indUnk':
            args = {'n_iter': 20, 'test_size': .9, 'random_state': 0}
        else:
            args = CVargs.copy()
        print(' '.join(['RUNNINING ', cat, ' SUBTOPIC SCORES']))
        f = 'Twitter_' + cat + '_Topic_Scores.csv'
        data = ImportCSVFeatureData(path + f, -1)
        # Column 0 of every row is the id; the remaining columns are features.
        vec = np.array([[float(l) for l in line[1:]] for line in data])
        labels = np.array([metadata[line[0]][0] for line in data])
        IDX = np.array([line[0] for line in data])
        vec, labels, IDX = balance(vec, labels, IDX, bidx)
        Preds = Classifiers.CrossValidate(vec, labels, IDX, cl, **args)

        print('standardizing scores')
        preds = {}
        for k, score in Preds.items():
            has_pos_inf = np.inf in score
            if has_pos_inf or (-np.inf) in score:
                # Drop the infinite entries and stand in for them with one
                # extra copy of the most extreme finite score on that side.
                finite = list(
                    np.array(score)[np.logical_not(np.isinf(score))])
                if not finite:
                    # Every entry was infinite: nothing to average.
                    continue
                finite.append(max(finite) if has_pos_inf else min(finite))
                preds[k] = np.mean(finite)
            else:
                preds[k] = np.mean(score)
        values = list(preds.values())
        m = np.mean(values)
        sd = np.std(values)
        # NOTE(review): sd is 0 when every mean coincides; the division
        # then yields nan/inf instead of raising.
        for k in preds:
            preds[k] = (preds[k] - m) / sd
        PREDS.update(preds)
    Write_Scores(PREDS, ['id', 'subtopic_score'], outfile)
def Analyze_KBest_Scores(metadata, bidx, cl, outfile, **CVargs):
    '''
    Cross-validate the 'KBest' feature array and write standardized scores.

    The feature matrix and ids come from importArray('KBest'); labels are
    looked up in metadata. After balance() and Classifiers.CrossValidate,
    the scores returned for each id are averaged, z-scored across all ids,
    and handed to Write_Scores.

    The original body ran balance() twice: once on (vec, labels, IDX), and
    then again on the already-balanced vec together with labels/IDX rebuilt
    from the full id list. That second call paired feature rows with labels
    of a different selection, so a single balance pass is used here, as in
    Analyze_Raw.

    Args:
        metadata: mapping id -> row; row[0] is used as the class label.
        bidx: forwarded unchanged to balance().
        cl: classifier object forwarded to Classifiers.CrossValidate.
        outfile: destination forwarded to Write_Scores.
        **CVargs: keyword arguments for Classifiers.CrossValidate.

    Returns:
        None.
    '''
    filename = 'KBest'
    # The third returned value (named 'words' originally) is not needed here.
    vec, ids, _ = importArray(filename)
    labels = np.array([metadata[idx][0] for idx in ids])
    IDX = np.array(ids)
    print('drawing samples')
    vec, labels, IDX = balance(vec, labels, IDX, bidx)
    Preds = Classifiers.CrossValidate(vec, labels, IDX, cl, **CVargs)

    print('standardizing scores')
    preds = {}
    for k, score in Preds.items():
        has_pos_inf = np.inf in score
        if has_pos_inf or (-np.inf) in score:
            # Drop the infinite entries and stand in for them with one
            # extra copy of the most extreme finite score on that side.
            finite = list(np.array(score)[np.logical_not(np.isinf(score))])
            if not finite:
                # Every entry was infinite: nothing to average.
                continue
            finite.append(max(finite) if has_pos_inf else min(finite))
            preds[k] = np.mean(finite)
        else:
            preds[k] = np.mean(score)
    values = list(preds.values())
    m = np.mean(values)
    sd = np.std(values)
    # NOTE(review): sd is 0 when every mean coincides; the division then
    # yields nan/inf instead of raising.
    for k in preds:
        preds[k] = (preds[k] - m) / sd
    Write_Scores(preds, ['id', 'kbest_score'], outfile)
def Analyze_Nonword_Scores(metadata, bidx, cl, outfile, **CVargs):
    '''
    Cross-validate the nonword features and write standardized scores.

    Loads 'Twitter/Data/Twitter_Nonword_Scores.csv', balances the sample,
    runs Classifiers.CrossValidate, averages the scores returned for each
    id, z-scores those means across all ids and hands them to Write_Scores.

    Notes kept from the original author: check rows 1535 and 15349 for inf
    data. Should no longer have to recode 8 and 12 (herndanV and LnM);
    both are always negative and the sign flip now happens in
    Make_Twitter_Data.

    Args:
        metadata: mapping id -> row; row[0] is used as the class label.
        bidx: forwarded unchanged to balance().
        cl: classifier object forwarded to Classifiers.CrossValidate.
        outfile: destination forwarded to Write_Scores.
        **CVargs: keyword arguments for Classifiers.CrossValidate.

    Returns:
        None.
    '''
    print('NONWORD ANALYSIS')
    filename = 'Twitter/Data/Twitter_Nonword_Scores.csv'
    data = ImportCSVFeatureData(filename, -1)
    print('drawing samples')
    # Column 0 of every row is the id; the remaining columns are features.
    vec = np.array([[float(l) for l in line[1:]] for line in data])
    labels = np.array([metadata[line[0]][0] for line in data])
    IDX = np.array([line[0] for line in data])
    vec, labels, IDX = balance(vec, labels, IDX, bidx)
    Preds = Classifiers.CrossValidate(vec, labels, IDX, cl, **CVargs)

    print('standardizing scores')
    preds = {}
    for k, score in Preds.items():
        has_pos_inf = np.inf in score
        if has_pos_inf or (-np.inf) in score:
            # Drop the infinite entries and stand in for them with one
            # extra copy of the most extreme finite score on that side.
            finite = list(np.array(score)[np.logical_not(np.isinf(score))])
            if not finite:
                # Every entry was infinite: nothing to average.
                continue
            finite.append(max(finite) if has_pos_inf else min(finite))
            preds[k] = np.mean(finite)
        else:
            preds[k] = np.mean(score)
    values = list(preds.values())
    m = np.mean(values)
    sd = np.std(values)
    # NOTE(review): sd is 0 when every mean coincides; the division then
    # yields nan/inf instead of raising.
    for k in preds:
        preds[k] = (preds[k] - m) / sd
    Write_Scores(preds, ['id', 'nonword_score'], outfile)
def Analyze_Raw_Topic_Scores(metadata, bidx, cl, outfile, **CVargs):
    '''
    Cross-validate the raw topic features and write standardized scores.

    Loads 'Twitter/Data/Raw_Topic_Scores.csv', balances the sample, runs
    Classifiers.CrossValidate, averages the scores returned for each id,
    z-scores those means across all ids and hands them to Write_Scores.

    Tuning notes kept from the original author: grid_search shows C>=1 is
    ideal. Accuracy remains at 71% at sample sizes from 500 through 10000.

    Args:
        metadata: mapping id -> row; row[0] is used as the class label.
        bidx: forwarded unchanged to balance().
        cl: classifier object forwarded to Classifiers.CrossValidate.
        outfile: destination forwarded to Write_Scores.
        **CVargs: keyword arguments for Classifiers.CrossValidate.

    Returns:
        None.
    '''
    print('RAW TOPIC ANALYSIS')
    filename = 'Twitter/Data/Raw_Topic_Scores.csv'
    data = ImportCSVFeatureData(filename, -1)
    print('drawing samples')
    # Column 0 of every row is the id; the remaining columns are features.
    vec = np.array([[float(l) for l in line[1:]] for line in data])
    labels = np.array([metadata[line[0]][0] for line in data])
    IDX = np.array([line[0] for line in data])
    vec, labels, IDX = balance(vec, labels, IDX, bidx)
    Preds = Classifiers.CrossValidate(vec, labels, IDX, cl, **CVargs)

    print('standardizing scores')
    preds = {}
    for k, score in Preds.items():
        has_pos_inf = np.inf in score
        if has_pos_inf or (-np.inf) in score:
            # Drop the infinite entries and stand in for them with one
            # extra copy of the most extreme finite score on that side.
            finite = list(np.array(score)[np.logical_not(np.isinf(score))])
            if not finite:
                # Every entry was infinite: nothing to average.
                continue
            finite.append(max(finite) if has_pos_inf else min(finite))
            preds[k] = np.mean(finite)
        else:
            preds[k] = np.mean(score)
    values = list(preds.values())
    m = np.mean(values)
    sd = np.std(values)
    # NOTE(review): sd is 0 when every mean coincides; the division then
    # yields nan/inf instead of raising.
    for k in preds:
        preds[k] = (preds[k] - m) / sd
    Write_Scores(preds, ['id', 'rawTopic_score'], outfile)
def Analyze_Raw(metadata, bidx, cl, outfile, **CVargs):
    '''
    Cross-validate the 'Raw' feature array and write standardized scores.

    The feature matrix and ids come from importArray('Raw'); labels are
    looked up in metadata. After balance() and Classifiers.CrossValidate,
    the scores returned for each id are averaged, z-scored across all ids,
    and handed to Write_Scores.

    Tuning notes kept from the original author: mnb maxes out at 69/70%
    accurate at 3,000 (or 600 training) texts and does not increase in
    accuracy after that. svm: grid search showed the ideal is a linear
    kernel with C=1, 10, or 100; it also maxes out at 74% accurate for
    3,000 (goes to 76 at 10,000).

    Args:
        metadata: mapping id -> row; row[0] is used as the class label.
        bidx: forwarded unchanged to balance().
        cl: classifier object forwarded to Classifiers.CrossValidate.
        outfile: destination forwarded to Write_Scores.
        **CVargs: keyword arguments for Classifiers.CrossValidate.

    Returns:
        None.
    '''
    print('running Raw analysis')
    filename = 'Raw'
    # The third returned value (named 'words' originally) is not needed here.
    vec, ids, _ = importArray(filename)
    print('drawing samples')
    labels = np.array([metadata[idx][0] for idx in ids])
    IDX = np.array(ids)
    vec, labels, IDX = balance(vec, labels, IDX, bidx)
    Preds = Classifiers.CrossValidate(vec, labels, IDX, cl, **CVargs)

    print('standardizing scores')
    preds = {}
    for k, score in Preds.items():
        has_pos_inf = np.inf in score
        if has_pos_inf or (-np.inf) in score:
            # Drop the infinite entries and stand in for them with one
            # extra copy of the most extreme finite score on that side.
            finite = list(np.array(score)[np.logical_not(np.isinf(score))])
            if not finite:
                # Every entry was infinite: nothing to average.
                continue
            finite.append(max(finite) if has_pos_inf else min(finite))
            preds[k] = np.mean(finite)
        else:
            preds[k] = np.mean(score)
    values = list(preds.values())
    m = np.mean(values)
    sd = np.std(values)
    # NOTE(review): sd is 0 when every mean coincides; the division then
    # yields nan/inf instead of raising.
    for k in preds:
        preds[k] = (preds[k] - m) / sd
    Write_Scores(preds, ['id', 'raw_score'], outfile)
def Analyze_Individual(metadata, bidx, cl, outfile, **CVargs):
    '''
    Cross-validate the individual-score features and write z-scores.

    Loads 'Twitter/Data/Twitter_Individual_Scores.txt' with
    ImportFeatureData, drops every row whose column 1 equals 1.0 (per the
    original comment: cases where sex is never mentioned), balances the
    sample, runs Classifiers.CrossValidate, averages the scores returned
    for each id, z-scores those means and hands them to Write_Scores.

    Tuning notes kept from the original author: grid search shows C>=1 is
    optimal. Accuracy is unrelated to sample size (remains 84-89%
    throughout).

    Args:
        metadata: mapping id -> row; row[0] is used as the class label.
        bidx: forwarded unchanged to balance().
        cl: classifier object forwarded to Classifiers.CrossValidate.
        outfile: destination forwarded to Write_Scores.
        **CVargs: keyword arguments for Classifiers.CrossValidate.

    Returns:
        None.
    '''
    print('INDIVIDUAL ANALYSIS')
    filename = 'Twitter/Data/Twitter_Individual_Scores.txt'
    data = ImportFeatureData(filename, -1)
    # Apply the exclusion once instead of repeating it for each array.
    kept = [line for line in data if line[1] != 1.0]
    # Column 0 is the id, column 1 the exclusion flag, the rest are features.
    vec = np.array([line[2:] for line in kept])
    labels = np.array([metadata[line[0]][0] for line in kept])
    IDX = np.array([line[0] for line in kept])
    vec, labels, IDX = balance(vec, labels, IDX, bidx)
    Preds = Classifiers.CrossValidate(vec, labels, IDX, cl, **CVargs)

    print('standardizing scores')
    preds = {}
    for k, score in Preds.items():
        has_pos_inf = np.inf in score
        if has_pos_inf or (-np.inf) in score:
            # Drop the infinite entries and stand in for them with one
            # extra copy of the most extreme finite score on that side.
            finite = list(np.array(score)[np.logical_not(np.isinf(score))])
            if not finite:
                # Every entry was infinite: nothing to average.
                continue
            finite.append(max(finite) if has_pos_inf else min(finite))
            preds[k] = np.mean(finite)
        else:
            preds[k] = np.mean(score)
    values = list(preds.values())
    m = np.mean(values)
    sd = np.std(values)
    # NOTE(review): sd is 0 when every mean coincides; the division then
    # yields nan/inf instead of raising.
    for k in preds:
        preds[k] = (preds[k] - m) / sd
    Write_Scores(preds, ['id', 'indiv_score'], outfile)
def Analyze_LIWC(metadata, bidx, cl, outfile, **CVargs):
    '''
    Cross-validate the LIWC features and write standardized scores.

    Loads 'Twitter/Data/Twitter_LIWC_Scores.csv', balances the sample,
    runs Classifiers.CrossValidate, averages the scores returned for each
    id, z-scores those means across all ids and hands them to Write_Scores.

    Args:
        metadata: mapping id -> row; row[0] is used as the class label.
        bidx: forwarded unchanged to balance().
        cl: classifier object forwarded to Classifiers.CrossValidate.
        outfile: destination forwarded to Write_Scores.
        **CVargs: keyword arguments for Classifiers.CrossValidate.

    Returns:
        None.
    '''
    filename = 'Twitter/Data/Twitter_LIWC_Scores.csv'
    data = ImportCSVFeatureData(filename, -1)
    print('drawing samples')
    # Column 0 of every row is the id; the remaining columns are features.
    vec = np.array([[float(l) for l in line[1:]] for line in data])
    labels = np.array([metadata[line[0]][0] for line in data])
    IDX = np.array([line[0] for line in data])
    vec, labels, IDX = balance(vec, labels, IDX, bidx)
    Preds = Classifiers.CrossValidate(vec, labels, IDX, cl, **CVargs)

    preds = {}
    for k, score in Preds.items():
        has_pos_inf = np.inf in score
        if has_pos_inf or (-np.inf) in score:
            # Drop the infinite entries and stand in for them with one
            # extra copy of the most extreme finite score on that side.
            finite = list(np.array(score)[np.logical_not(np.isinf(score))])
            if not finite:
                # Every entry was infinite: nothing to average.
                continue
            finite.append(max(finite) if has_pos_inf else min(finite))
            preds[k] = np.mean(finite)
        else:
            preds[k] = np.mean(score)
    values = list(preds.values())
    m = np.mean(values)
    sd = np.std(values)
    # NOTE(review): sd is 0 when every mean coincides; the division then
    # yields nan/inf instead of raising.
    for k in preds:
        preds[k] = (preds[k] - m) / sd
    Write_Scores(preds, ['id', 'liwc_score'], outfile)
def hybridTrial(metadata):
    '''
    Cross-validate raw topic features combined with all subtopic features.

    The raw topic matrix is loaded from 'Twitter/Data/Raw_Topic_Scores.csv'.
    Every per-category subtopic file is then aligned to the same id order
    (ids missing from a file are padded with zeros), the columns are
    appended to the raw topic matrix, and the combined matrix is
    cross-validated with mnb() and with a 10-estimator AdaBoost classifier.

    The original body built the combined matrix but passed the raw-topic
    matrix alone to both cross-validation calls, so the combination was
    never actually evaluated. Both calls now receive the combined matrix.

    Results recorded by the original author (these may predate the fix
    above and should be re-measured):
        Raw * Raw Topics = ? * .72 = .71 (No change in nb score)
        Subtopics * raw topics = .65 * .72 = .69

    Args:
        metadata: mapping id -> row; row[0] is used as the class label and
            row[2] as the category name.

    Returns:
        None. Results are whatever Classifiers.CrossValidate reports.
    '''
    print('import raw topic scores')
    filename = 'Twitter/Data/Raw_Topic_Scores.csv'
    data = ImportCSVFeatureData(filename, -1)
    print('drawing samples')
    # Column 0 of every row is the id; the remaining columns are features.
    vec = np.array([[float(l) for l in line[1:]] for line in data])
    labels = np.array([metadata[line[0]][0] for line in data])
    IDX = np.array([line[0] for line in data])

    # The raw-topics-only baseline run was already disabled in the original;
    # only its progress message remains.
    print('CV for RAW TOPICS')

    print('importing subtopic scores')
    path = 'Twitter/Data/'
    Data = []
    # NOTE(review): the sibling Analyze_* functions read the category from
    # column 1 of the metadata rows, this one from column 2 - confirm which
    # is intended.
    for cat in set([line[2] for line in metadata.values()]):
        if cat == 'category' or cat == 'party':
            continue
        print(' '.join(['RUNNINING ', cat, ' SUBTOPIC SCORES']))
        f = 'Twitter_' + cat + '_Topic_Scores.csv'
        Data.append(ImportCSVFeatureData(path + f, -1))

    print('resorting cases to align with labels')
    rvec = [[] for _ in IDX]
    for data in Data:
        width = len(data[0]) - 1 if len(data) else 0
        # Index the rows by id once instead of rescanning the file per id.
        # Every row sharing an id is kept, as the original appended them all.
        rows_by_id = {}
        for line in data:
            rows_by_id.setdefault(line[0], []).append(line[1:])
        for i, idx in enumerate(IDX):
            matches = rows_by_id.get(idx)
            if matches:
                for feats in matches:
                    rvec[i] += [float(v) for v in feats]
            else:
                # This id has no row in this category: pad with zeros so
                # every row keeps the same number of columns.
                rvec[i] += [0.0] * width
    combined = np.append(vec, np.array(rvec), axis=1)

    print('crossvalidate testing COMBINATION')
    CVargs = {'n_iter': 3, 'test_size': .9, 'random_state': 0}
    Classifiers.CrossValidate(combined, labels, IDX, mnb(), **CVargs)
    booster = ensemble.AdaBoostClassifier(n_estimators=10)
    Classifiers.CrossValidate(combined, labels, IDX, booster, **CVargs)
def Analyze_Behavior_Scores(metadata, bidx, cl, outfile, **CVargs):
    '''
    Cross-validate the per-category behavior features and write z-scores.

    For each distinct value in column 1 of the metadata rows (the header
    values 'category' and 'party' are skipped) the file
    'Twitter/Data/Twitter_Behavior_<cat>_Scores.csv' is loaded, passed
    through balance() and Classifiers.CrossValidate, and the scores
    returned for each id are averaged and standardized within that
    category. The results of all categories are merged and handed to
    Write_Scores.

    Tuning notes kept from the original author: grid_search across all
    seems to agree that C==10,000 for linear or rbf is optimal. Sample
    size doesn't matter because the number of texts in an area maxes out
    at 6,000.

    Args:
        metadata: mapping id -> row; row[0] is used as the class label and
            row[1] as the category name.
        bidx: forwarded unchanged to balance().
        cl: classifier object forwarded to Classifiers.CrossValidate.
        outfile: destination forwarded to Write_Scores.
        **CVargs: keyword arguments for Classifiers.CrossValidate.

    Returns:
        None.
    '''
    print('BEHAVIOR ANALYSIS')
    path = 'Twitter/Data/'
    PREDS = {}
    for cat in set([line[1] for line in metadata.values()]):
        if cat == 'category' or cat == 'party':
            continue
        print(' '.join(['RUNNINING ', cat, ' BEHAVIOR SCORES']))
        f = 'Twitter_Behavior_' + cat + '_Scores.csv'
        data = ImportCSVFeatureData(path + f, -1)
        # Column 0 of every row is the id; the remaining columns are features.
        vec = np.array([[float(l) for l in line[1:]] for line in data])
        labels = np.array([metadata[line[0]][0] for line in data])
        IDX = np.array([line[0] for line in data])
        vec, labels, IDX = balance(vec, labels, IDX, bidx)
        Preds = Classifiers.CrossValidate(vec, labels, IDX, cl, **CVargs)

        preds = {}
        for k, score in Preds.items():
            has_pos_inf = np.inf in score
            if has_pos_inf or (-np.inf) in score:
                # Drop the infinite entries and stand in for them with one
                # extra copy of the most extreme finite score on that side.
                finite = list(
                    np.array(score)[np.logical_not(np.isinf(score))])
                if not finite:
                    # Every entry was infinite: nothing to average.
                    continue
                finite.append(max(finite) if has_pos_inf else min(finite))
                preds[k] = np.mean(finite)
            else:
                preds[k] = np.mean(score)
        values = list(preds.values())
        m = np.mean(values)
        sd = np.std(values)
        # NOTE(review): sd is 0 when every mean coincides; the division
        # then yields nan/inf instead of raising.
        for k in preds:
            preds[k] = (preds[k] - m) / sd
        PREDS.update(preds)
    Write_Scores(PREDS, ['id', 'behavior_score'], outfile)
def Analyze_Structure_Scores(metadata, bidx, cl, outfile, **CVargs):
    '''
    Cross-validate the per-category structure features and write z-scores.

    For each distinct value in column 1 of the metadata rows (the header
    values 'category' and 'party' are skipped) the file
    'Twitter/Data/Twitter_Structure_<cat>_Scores.csv' is loaded, passed
    through balance() and Classifiers.CrossValidate, and the scores
    returned for each id are averaged and standardized within that
    category. Each id collects one standardized score per category it
    appears in; the mean of those is handed to Write_Scores (the original
    author describes this as a bagging model to score masculine or
    feminine).

    Tuning notes kept from the original author: SVM should be linear with
    C=10; accuracy maxes out around 3,000 at ~62% but doesn't grow much
    from 500. MNB: sample size appears to be unrelated to accuracy and
    hovers at 58-62% throughout.

    Args:
        metadata: mapping id -> row; row[0] is used as the class label and
            row[1] as the category name. Every key gets an output entry.
        bidx: forwarded unchanged to balance().
        cl: classifier object forwarded to Classifiers.CrossValidate.
        outfile: destination forwarded to Write_Scores.
        **CVargs: keyword arguments for Classifiers.CrossValidate.

    Returns:
        None.
    '''
    print('STRUCTURE ANALYSIS')
    path = 'Twitter/Data/'
    # One (initially empty) list of standardized scores per metadata key.
    PREDS = dict((key, []) for key in metadata)
    for cat in set([line[1] for line in metadata.values()]):
        if cat == 'category' or cat == 'party':
            continue
        print(' '.join(['RUNNINING ', cat, ' STRUCTURE SCORES']))
        f = 'Twitter_Structure_' + cat + '_Scores.csv'
        data = ImportCSVFeatureData(path + f, -1)
        # Column 0 of every row is the id; the remaining columns are features.
        vec = np.array([[float(l) for l in line[1:]] for line in data])
        labels = np.array([metadata[line[0]][0] for line in data])
        IDX = np.array([line[0] for line in data])
        vec, labels, IDX = balance(vec, labels, IDX, bidx)
        Preds = Classifiers.CrossValidate(vec, labels, IDX, cl, **CVargs)

        preds = {}
        for k, score in Preds.items():
            has_pos_inf = np.inf in score
            if has_pos_inf or (-np.inf) in score:
                # Drop the infinite entries and stand in for them with one
                # extra copy of the most extreme finite score on that side.
                finite = list(
                    np.array(score)[np.logical_not(np.isinf(score))])
                if not finite:
                    # Every entry was infinite: nothing to average.
                    continue
                finite.append(max(finite) if has_pos_inf else min(finite))
                preds[k] = np.mean(finite)
            else:
                preds[k] = np.mean(score)
        values = list(preds.values())
        m = np.mean(values)
        sd = np.std(values)
        # NOTE(review): sd is 0 when every mean coincides; the division
        # then yields nan/inf instead of raising.
        for k, score in preds.items():
            PREDS[k].append((score - m) / sd)

    # Collapse each id's per-category scores into a single mean. Ids that
    # were never scored still hold an empty list, whose mean is nan.
    for k in PREDS:
        PREDS[k] = np.mean(PREDS[k])
    Write_Scores(PREDS, ['id', 'struct_score'], outfile)