# Build a bag-of-words feature table for the tweets in `dfreduced`, join it
# back onto `dfreduced`, and export the result to the database and to a
# Weka-readable file.

# Densify the training term matrix.
vect_train = vect_train.todense()
vectoriser.vocabulary_  # interactive inspection only

# Lower-case the tweet text and strip the '#' and '@' markers in one pass.
text_test = dfreduced['twitter.text'].apply(
    lambda tweet: tweet.lower().replace('#', '').replace('@', '')
)

# transform() only: the tweets are projected onto the vocabulary the
# vectoriser already holds.
vect_test = vectoriser.transform(list(text_test))
vect_test = vect_test.todense()
vect_test  # interactive inspection only

# Make into df
# NOTE(review): assumes `vectoriser` is a scikit-learn text vectoriser, whose
# `vocabulary_` is a dict mapping term -> column index. Iterating that dict
# does not follow column order, so using it directly as `columns` attaches
# the wrong word to each feature column. Order the terms by their index.
colnames = vectoriser.vocabulary_
ordered_terms = sorted(colnames, key=colnames.get)
# Row i of vect_test belongs to row i of dfreduced, so reuse dfreduced's index;
# a fresh 0..n-1 index would mis-pair or drop rows in the index merge below
# whenever dfreduced does not itself carry a default index.
df_features_reduced = pd.DataFrame(
    vect_test, columns=ordered_terms, index=dfreduced.index
)
df_features_reduced.head()

# Merge back to original df
dfreduced_added = pd.merge(
    dfreduced,
    df_features_reduced,
    how='inner',
    left_index=True,
    right_index=True,
)
dfreduced_added.info()
len(dfreduced_added.columns)

# Write to db - has a problem: SQLite has to be recompiled with a higher
# column limit before a table this wide can be created.
# `axis` is passed by keyword; the positional form was removed in pandas 2.0.
dfreduced_added = dfreduced_added.drop('twitter.text', axis=1)
pu.toDB(con, dfreduced_added, 'FeaturesReduced3000T10000FwithWords')
dfreduced_added.columns
pu.to_weka(dfreduced_added, outfile='features_reduced.csv')
# Vectorise the prepared tweet text, join the word features back onto
# `dfreduced`, and export the result to the database and to a Weka-readable
# file.

# transform() only: the tweets are projected onto the vocabulary the
# vectoriser already holds.
vect_test = vectoriser.transform(list(text_test))
vect_test = vect_test.todense()
vect_test  # interactive inspection only

# Make into df
# NOTE(review): assumes `vectoriser` is a scikit-learn text vectoriser, whose
# `vocabulary_` is a dict mapping term -> column index. Iterating that dict
# does not follow column order, so using it directly as `columns` attaches
# the wrong word to each feature column. Order the terms by their index.
colnames = vectoriser.vocabulary_
ordered_terms = sorted(colnames, key=colnames.get)
# NOTE(review): assumes text_test was taken row-for-row from dfreduced (the
# features are merged back onto it by index below) - confirm. Reusing
# dfreduced's index keeps each feature row paired with its source row; a
# fresh 0..n-1 index would mis-pair or drop rows in the merge whenever
# dfreduced does not itself carry a default index.
df_features_reduced = pd.DataFrame(
    vect_test, columns=ordered_terms, index=dfreduced.index
)
df_features_reduced.head()

# Merge back to original df
dfreduced_added = pd.merge(
    dfreduced,
    df_features_reduced,
    how='inner',
    left_index=True,
    right_index=True,
)
dfreduced_added.info()
len(dfreduced_added.columns)

# Write to db - has a problem: SQLite has to be recompiled with a higher
# column limit before a table this wide can be created.
# `axis` is passed by keyword; the positional form was removed in pandas 2.0.
dfreduced_added = dfreduced_added.drop('twitter.text', axis=1)
pu.toDB(con, dfreduced_added, 'features_training3000T10000FwithWords')
dfreduced_added.columns
pu.to_weka(dfreduced_added, outfile='features_reduced.csv')

# Evaluate and look at predicted tf
# Output just match_rowid + text + t|f
# Finish cleaning the tweet text, vectorise it, join the word features back
# onto `dfreduced`, and export the result to the database and to a
# Weka-readable file.

# Strip the '#' and '@' markers in a single pass over the text.
text_test = text_test.apply(
    lambda tweet: tweet.replace('#', '').replace('@', '')
)

# transform() only: the tweets are projected onto the vocabulary the
# vectoriser already holds.
vect_test = vectoriser.transform(list(text_test))
vect_test = vect_test.todense()
vect_test  # interactive inspection only

# Make into df
# NOTE(review): assumes `vectoriser` is a scikit-learn text vectoriser, whose
# `vocabulary_` is a dict mapping term -> column index. Iterating that dict
# does not follow column order, so using it directly as `columns` attaches
# the wrong word to each feature column. Order the terms by their index.
colnames = vectoriser.vocabulary_
ordered_terms = sorted(colnames, key=colnames.get)
# NOTE(review): assumes text_test was taken row-for-row from dfreduced (the
# features are merged back onto it by index below) - confirm. Reusing
# dfreduced's index keeps each feature row paired with its source row; a
# fresh 0..n-1 index would mis-pair or drop rows in the merge whenever
# dfreduced does not itself carry a default index.
df_features_reduced = pd.DataFrame(
    vect_test, columns=ordered_terms, index=dfreduced.index
)
df_features_reduced.head()

# Merge back to original df
dfreduced_added = pd.merge(
    dfreduced,
    df_features_reduced,
    how='inner',
    left_index=True,
    right_index=True,
)
dfreduced_added.info()
len(dfreduced_added.columns)

# Write to db - has a problem: SQLite has to be recompiled with a higher
# column limit before a table this wide can be created.
# `axis` is passed by keyword; the positional form was removed in pandas 2.0.
dfreduced_added = dfreduced_added.drop('twitter.text', axis=1)
pu.toDB(con, dfreduced_added, 'FeaturesReduced3000T10000FwithWords')
dfreduced_added.columns
pu.to_weka(dfreduced_added, outfile='features_reduced.csv')