Example #1
import os
import sqlite3

import PSQLUtils as pu  # project-local DB helpers; alias assumed from the later examples
from sklearn.feature_extraction.text import CountVectorizer

os.getcwd()
os.chdir('/home/phcostello/Documents/workspace/iHubCrowdSourcing')

#if __name__ == '__main__':

path = "/home/phcostello/Documents/Data/iHub/S3_RawData/"
dbfile = "CrowdSourcingData.sqlite"
con = sqlite3.connect(path + dbfile, detect_types=sqlite3.PARSE_DECLTYPES)
#pu.showTables(con)

startDate = "2011-03-03"
endDate = "2015-03-05"
#First make vocab from reduced features with which we'll train
dfreduced = pu.readDB(con, 'FeaturesReduced2500T10000F', startDate, endDate)
#maintwitter = pu.readDB(con,'MasterData',startDate,endDate, fields=['match_rowid','twitter.text'])
#dfreduced_withtwitter = pd.merge(dfreduced,maintwitter,on='match_rowid')
#pu.toDB(con, dfreduced_withtwitter, 'FeaturesHT')

fin = open('Results/unique.txt')
text_train = fin.readlines()
fin.close()

# Fit the vocabulary on the training text and build the document-term matrix
vectoriser = CountVectorizer(min_df=1, stop_words='english')
vect_train = vectoriser.fit_transform(text_train)
# Change from sparse matrix to dense matrix
vect_train = vect_train.todense()
vectoriser.vocabulary_  # the fitted term -> column-index mapping
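Once the vocabulary is fitted, the same vectoriser can encode unseen text against it; words outside the training vocabulary are silently dropped. A minimal self-contained sketch (the texts are illustrative, not from the crowdsourcing data):

from sklearn.feature_extraction.text import CountVectorizer

train_texts = ["water shortage in Nairobi", "road closed after protest"]
vectoriser = CountVectorizer(min_df=1, stop_words='english')
vect_train = vectoriser.fit_transform(train_texts)

# Encode new documents with the *fitted* vocabulary (no refit).
vect_new = vectoriser.transform(["protest near the water point"])
print(sorted(vectoriser.vocabulary_))  # terms backing the matrix columns
print(vect_new.toarray())              # counts aligned to those columns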
Example #3
import pandas as pd
import pandas.io.sql as psql
import PSQLUtils
reload(PSQLUtils)  # Python 2 builtin; on Python 3 use importlib.reload
import sqlite3

import pickle



pd.set_option('display.max_colwidth', 400)  # set_printoptions is deprecated; set_option takes the option name
path = "/home/phcostello/Documents/Data/iHub/S3_RawData/"
dbfile = "CrowdSourcingData.sqlite"
con = sqlite3.connect(path + dbfile)
PSQLUtils.showTables(con, display=True)
df = PSQLUtils.readDB(con,'FeaturesReduced3000T10000FwithWords', 0,0)


# Split the data into features and target
len(df.columns)                  # column count before popping the target
target = df.pop('Newsworthy')    # target label
len(df.columns)                  # one column fewer now
features = df
features.pop('rowid')            # drop row identifiers that carry no signal
features.pop('match_rowid')



# Find char (string) features by inspecting the types in the first row
typeFirstCol = [type(it) for it in features.values[0]]
# Select just the unicode columns
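The example breaks off here; Example #4 below picks up the same step. A minimal sketch of the selection it gestures at, assuming Python 2 (where text read from SQLite comes back as unicode) and an illustrative stand-in frame:

import pandas as pd

# Illustrative stand-in for `features` above: one numeric and one text column.
features = pd.DataFrame({'word_count': [3, 7], 'tweet_text': [u'fire', u'flood']})
typeFirstCol = [type(it) for it in features.values[0]]
unicode_cols = [i for i, t in enumerate(typeFirstCol) if t is unicode]
char_features = features.iloc[:, unicode_cols]
print(char_features.columns.tolist())  # ['tweet_text']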
Example #4
from sklearn import datasets, svm, metrics

import pandas as pd
import pandas.io.sql as psql
import PSQLUtils
reload(PSQLUtils)  # Python 2 builtin; on Python 3 use importlib.reload
import sqlite3

import pickle

pd.set_option('display.max_colwidth', 400)  # set_printoptions is deprecated; set_option takes the option name
path = "/home/phcostello/Documents/Data/iHub/S3_RawData/"
dbfile = "CrowdSourcingData.sqlite"
con = sqlite3.connect(path + dbfile)
PSQLUtils.showTables(con, display=True)
df = PSQLUtils.readDB(con, 'FeaturesReduced3000T10000FwithWords', 0, 0)

# Split the data into features and target
len(df.columns)                  # column count before popping the target
target = df.pop('Newsworthy')    # target label
len(df.columns)                  # one column fewer now
features = df
features.pop('rowid')            # drop row identifiers that carry no signal
features.pop('match_rowid')

# Find char (string) features by inspecting the types in the first row
typeFirstCol = [type(it) for it in features.values[0]]
# Select just the unicode columns
indofunicode = typeFirstCol.index(unicode)  # note: index of the *first* unicode column only
indofunicode
## Convert these to nominal
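The example ends before the conversion step. One standard way to turn a string column into nominal integer codes is pandas' factorize; a minimal sketch with an illustrative column:

import pandas as pd

# Illustrative unicode column standing in for one of the string features above.
col = pd.Series([u'news', u'spam', u'news', u'other'])
codes, labels = pd.factorize(col)  # codes: 0, 1, 0, 2; labels: the distinct categories
print(codes)
print(labels)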