Example #1
def createBinaryLabeledPoint(doc_class,dictionary):	#the original def line is truncated in this listing; the function name is assumed
	words=doc_class[0].strip().split(' ')
	#create a binary vector for the document with all the words that appear (0:does not appear,1:appears)
	#we can set in a dictionary only the indexes of the words that appear
	#and we can use that to build a SparseVector
	vector_dict={}
	for w in words:
		vector_dict[dictionary[w]]=1
	return LabeledPoint(doc_class[1], SparseVector(len(dictionary),vector_dict))
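#Added illustration (not part of the original example): for a 5-word dictionary,
#a document containing the words at indexes 1 and 3 becomes the SparseVector below,
#which stores only the non-zero entries of the binary bag-of-words vector.
from pyspark.mllib.linalg import SparseVector
example_vector=SparseVector(5,{1:1,3:1})	#equivalent to the dense vector [0,1,0,1,0]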
def Predict(name_text,dictionary,model):
	words=name_text[1].strip().split(' ')
	vector_dict={}
	for w in words:
		if(w in dictionary):
			vector_dict[dictionary[w]]=1
	return (name_text[0], model.predict(SparseVector(len(dictionary),vector_dict)))
data,Y=lf.loadLabeled("./data/train")
print len(data)
dataRDD=sc.parallelize(data,numSlices=16)
#map data to a binary matrix
#1. get the dictionary of the data
#The dictionary of each document is a list of UNIQUE(set) words 
lists=dataRDD.map(lambda x:list(set(x.strip().split(' ')))).collect()
all=[]
#combine all per-document dictionaries into one list (extend is the fastest option in Python)
for l in lists:
	all.extend(l)
dict=set(all)
print len(dict)
#it is faster to know the position of the word if we put it as values in a dictionary
dictionary={}
for i,word in enumerate(dict):
	dictionary[word]=i	#the loop body is truncated in this listing; each word is mapped to its index
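#A hedged sketch of how this example typically continues (the training step is not
#shown above, so the choice of classifier is an assumption): broadcast the word->index
#dictionary, build one LabeledPoint per (document,class) pair with the helper defined
#at the top, and train an MLlib model on the resulting RDD.
from pyspark.mllib.classification import NaiveBayes
dict_broad=sc.broadcast(dictionary)	#ship the dictionary to the workers once
labelRDD=sc.parallelize(Y,numSlices=16)
labeledRDD=dataRDD.zip(labelRDD).map(lambda doc_class:createBinaryLabeledPoint(doc_class,dict_broad.value))
model=NaiveBayes.train(labeledRDD)	#any MLlib classifier that accepts LabeledPoints would do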
Example #2
from sklearn.metrics import accuracy_score
from sklearn.cross_validation import train_test_split

#Other
import time
import scipy.sparse as sp

import matplotlib.pyplot as plt
import sys
import loadFiles as lf



sliding_window = 2
#%% Load the data and compute alternative features
data, labels = lf.loadLabeled("./train")


#%% Pre-process

#Remove html tags
train = ct.removehtml(data)

#Create the dictionary
data=ct.stemTokenize(train)  
idfs = {}
num_documents = len(data)
print "number of documents %s"%num_documents


sc = SparkContext(appName="Simple App")  #initialize the spark context
#since we are not running in the interactive shell, we need to add some of our own
#modules to the spark context so that they are available to the workers
sc.addFile("helpers.py") 
sc.addFile("exctract_terms.py")
#now if we import these files they will also be available to the workers
from helpers import *
import extract_terms as et
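#Added note: for .py dependencies there is also SparkContext.addPyFile, which ships
#the file like addFile and additionally puts it on the workers' import path, e.g.:
#	sc.addPyFile("helpers.py")
#	sc.addPyFile("extract_terms.py")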



# load data: data is a list with the text of one document per cell; Y is the corresponding class value
# (1: positive, 0: negative)
print "loading local data"
data,Y=lf.loadLabeled(trainF) 

print "preprocessing"
pp.proc(data) #clean up the data: remove numbers, html tags and punctuation, except for "?!." ("?" and "!" are replaced by ".")
m = TfidfVectorizer(analyzer=et.terms) # m is a vectorizer that produces a compressed (sparse) tf-idf matrix; the terms are extracted with our own custom function
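#For reference (added; the real et.terms is not shown in this listing): the analyzer
#passed to TfidfVectorizer is simply a callable mapping one raw document to its list
#of terms. A minimal whitespace-based analyzer would look like this hypothetical one:
def example_terms(doc):
    return doc.strip().split(' ')  #illustration only; et.terms does the real term extraction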

'''
we need an array to distribute to the workers ...
the array should be the same size as the number of workers
we need one element per worker only
'''
ex=np.zeros(8) 
rp=randint(0,7)
ex[rp]=1 #one random worker will be selected so we set one random element to non-zero

md=sc.broadcast(m) #broadcast the vectorizer so that it will be available to all workers
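#A hedged sketch of how the flag array and the broadcast vectorizer are typically
#combined (the rest of this example is not shown, so the exact usage is an assumption):
#each worker receives one element of ex, and only the randomly selected worker
#(non-zero flag) fits the broadcast vectorizer on the (also broadcast) corpus.
dat=sc.broadcast(data)  #assumed: the preprocessed documents are broadcast as well
def fit_on_selected_worker(flag):
    if flag==0:
        return None
    return md.value.fit_transform(dat.value)  #the heavy work happens on a single worker
results=sc.parallelize(ex,numSlices=8).map(fit_on_selected_worker).collect()
tfidf_matrix=[r for r in results if r is not None][0]  #the sparse tf-idf matrix computed remotely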
Example #4
        vector_dict[dictionary[w]] = 1
    return LabeledPoint(doc_class[1], SparseVector(len(dictionary),
                                                   vector_dict))


def Predict(name_text, dictionary, model):
    words = name_text[1].strip().split(' ')
    vector_dict = {}
    for w in words:
        if (w in dictionary):
            vector_dict[dictionary[w]] = 1
    return (name_text[0],
            model.predict(SparseVector(len(dictionary), vector_dict)))


data, Y = lf.loadLabeled("./data/train")
#data,Y=lfp.loadLabeled("./data/train",1000)
print len(data)
dataRDD = sc.parallelize(data, numSlices=16)

#map data to a binary matrix
#1. get the dictionary of the data
#The dictionary of each document is a list of UNIQUE(set) words
lists = dataRDD.map(lambda x: list(set(x.strip().split(' ')))).collect()
all = []
#combine all dictionaries together (fastest solution for Python)
for l in lists:
    all.extend(l)
dict = set(all)
print len(dict)
#it is faster to know the position of the word if we put it as values in a dictionary
Example #5
    words = [w for w in words if not any(i in w for i in ['/', '<', '>'])]
    return words


def FinalPredict(name_text, dictionary, model):
    #words=name_text[1].strip().split(' ')
    words = doc2words(name_text[1])
    vector_dict = {}
    for w in words:
        if (w in dictionary):
            vector_dict[dictionary[w]] = 1
    return (name_text[0],
            model.predict(SparseVector(len(dictionary), vector_dict)))


data, Y = lf.loadLabeled("/Users/sofia/Desktop/big-data-project/data/train")
print "data loaded, length {}".format(len(data))
dataRDD = sc.parallelize(data, numSlices=16)
lists = dataRDD.map(doc2words)
#lists=dataRDD.map(doc2words).collect()

# create dict
all = []
for l in lists.collect():
    all.extend(l)
dict = set(all)

# TF-IDF
hashingTF = HashingTF(numFeatures=len(dict))
tf = hashingTF.transform(lists)
tf.cache()
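#A minimal continuation sketch (added): the original example stops at tf.cache(), so
#the IDF step below is an assumption; MLlib's IDF estimator is the usual companion of
#HashingTF for turning term-frequency vectors into tf-idf vectors.
from pyspark.mllib.feature import IDF
idf = IDF().fit(tf)         #learn the inverse document frequencies over the corpus
tfidf = idf.transform(tf)   #RDD of tf-idf SparseVectors, one per document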
import loadFiles as lf

### warning here
import extract_terms as et
import preProcess as pp
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

#### TODO change to the cluster folders small
data,Y=lf.loadLabeled("D://Data//DSSP//Data Camp 2//xcode//trainsmall")

#/home/xavier.callens/DataCamp
#data,Y=lf.loadLabeled("/home/xavier.callens/DataCamp/train")

print data
#analyzer and preprocessor don't work together... use one or the other
pp.proc(data) # we apply the preprocessing by ourselves
m = TfidfVectorizer(analyzer=et.terms)

tt = m.fit_transform(data)
print tt
rows,cols=tt.shape
print "number of features :" +str(len(m.get_feature_names())) #this is the same as the number of columns
#the function get_feature_names() returns a list of all the terms found 

print "non compressed matrix expected size:" + str(rows*cols*8/(1024*1024*1024))+"GB"