Example #1
def createBinaryLabeledPoint(doc_class,dictionary):	#the original def line is truncated in this listing; the function name is assumed
	words=doc_class[0].strip().split(' ')
	#create a binary vector for the document with all the words that appear (0:does not appear,1:appears)
	#we can set in a dictionary only the indexes of the words that appear
	#and we can use that to build a SparseVector
	vector_dict={}
	for w in words:
		vector_dict[dictionary[w]]=1
	return LabeledPoint(doc_class[1], SparseVector(len(dictionary),vector_dict))
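#Added illustration (not part of the original example): for a 5-word dictionary,
#a document containing the words at indexes 1 and 3 becomes the SparseVector below,
#which stores only the non-zero entries of the binary bag-of-words vector.
from pyspark.mllib.linalg import SparseVector
example_vector=SparseVector(5,{1:1,3:1})	#equivalent to the dense vector [0,1,0,1,0]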
def Predict(name_text,dictionary,model):
	words=name_text[1].strip().split(' ')
	vector_dict={}
	for w in words:
		if(w in dictionary):
			vector_dict[dictionary[w]]=1
	return (name_text[0], model.predict(SparseVector(len(dictionary),vector_dict)))
data,Y=lf.loadLabeled("./data/train")
print len(data)
dataRDD=sc.parallelize(data,numSlices=16)
#map data to a binary matrix
#1. get the dictionary of the data
#The dictionary of each document is a list of UNIQUE(set) words 
lists=dataRDD.map(lambda x:list(set(x.strip().split(' ')))).collect()
all=[]
#combine all per-document dictionaries into one list (extend is the fastest option in Python)
for l in lists:
	all.extend(l)
dict=set(all)
print len(dict)
#it is faster to know the position of the word if we put it as values in a dictionary
dictionary={}
for i,word in enumerate(dict):
	dictionary[word]=i	#the loop body is truncated in this listing; each word is mapped to its index
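#A hedged sketch of how this example typically continues (the training step is not
#shown above, so the choice of classifier is an assumption): broadcast the word->index
#dictionary, build one LabeledPoint per (document,class) pair with the helper defined
#at the top, and train an MLlib model on the resulting RDD.
from pyspark.mllib.classification import NaiveBayes
dict_broad=sc.broadcast(dictionary)	#ship the dictionary to the workers once
labelRDD=sc.parallelize(Y,numSlices=16)
labeledRDD=dataRDD.zip(labelRDD).map(lambda doc_class:createBinaryLabeledPoint(doc_class,dict_broad.value))
model=NaiveBayes.train(labeledRDD)	#any MLlib classifier that accepts LabeledPoints would do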
Example #2
from sklearn.metrics import accuracy_score
from sklearn.cross_validation import train_test_split

#Other
import time
import scipy.sparse as sp

import matplotlib.pyplot as plt
import sys
import loadFiles as lf



sliding_window = 2
#%% Load the data and compute alternative features
data, labels = lf.loadLabeled("./train")


#%% Pre-process

#Remove html tags
train = ct.removehtml(data)

#Create the dictionary
data=ct.stemTokenize(train)  
idfs = {}
num_documents = len(data)
print "number of documents %s"%num_documents


sc = SparkContext(appName="Simple App")  #initialize the spark context
#since we are not running in the interactive shell, we need to add some of our own
#modules to the spark context so that they are available to the workers
sc.addFile("helpers.py") 
sc.addFile("exctract_terms.py")
#now if we import these files they will also be available to the workers
from helpers import *
import extract_terms as et
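#Added note: for .py dependencies there is also SparkContext.addPyFile, which ships
#the file like addFile and additionally puts it on the workers' import path, e.g.:
#	sc.addPyFile("helpers.py")
#	sc.addPyFile("extract_terms.py")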



# load data: data is a list with the text of one document per cell; Y is the corresponding class value
# (1: positive, 0: negative)
print "loading local data"
data,Y=lf.loadLabeled(trainF) 

print "preprocessing"
pp.proc(data) #clean up the data: remove numbers, html tags and punctuation, except for "?!." ("?" and "!" are replaced by ".")
m = TfidfVectorizer(analyzer=et.terms) # m is a vectorizer that produces a compressed (sparse) tf-idf matrix; the terms are extracted with our own custom function
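#For reference (added; the real et.terms is not shown in this listing): the analyzer
#passed to TfidfVectorizer is simply a callable mapping one raw document to its list
#of terms. A minimal whitespace-based analyzer would look like this hypothetical one:
def example_terms(doc):
    return doc.strip().split(' ')  #illustration only; et.terms does the real term extraction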

'''
we need an array to distribute to the workers ...
the array should be the same size as the number of workers
we need one element per worker only
'''
ex=np.zeros(8) 
rp=randint(0,7)
ex[rp]=1 #one random worker will be selected so we set one random element to non-zero

md=sc.broadcast(m) #broadcast the vectorizer so that it will be available to all workers
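#A hedged sketch of how the flag array and the broadcast vectorizer are typically
#combined (the rest of this example is not shown, so the exact usage is an assumption):
#each worker receives one element of ex, and only the randomly selected worker
#(non-zero flag) fits the broadcast vectorizer on the (also broadcast) corpus.
dat=sc.broadcast(data)  #assumed: the preprocessed documents are broadcast as well
def fit_on_selected_worker(flag):
    if flag==0:
        return None
    return md.value.fit_transform(dat.value)  #the heavy work happens on a single worker
results=sc.parallelize(ex,numSlices=8).map(fit_on_selected_worker).collect()
tfidf_matrix=[r for r in results if r is not None][0]  #the sparse tf-idf matrix computed remotely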
Example #4
        vector_dict[dictionary[w]] = 1
    return LabeledPoint(doc_class[1], SparseVector(len(dictionary),
                                                   vector_dict))


def Predict(name_text, dictionary, model):
    words = name_text[1].strip().split(' ')
    vector_dict = {}
    for w in words:
        if (w in dictionary):
            vector_dict[dictionary[w]] = 1
    return (name_text[0],
            model.predict(SparseVector(len(dictionary), vector_dict)))


data, Y = lf.loadLabeled("./data/train")
#data,Y=lfp.loadLabeled("./data/train",1000)
print len(data)
dataRDD = sc.parallelize(data, numSlices=16)

#map data to a binary matrix
#1. get the dictionary of the data
#The dictionary of each document is a list of UNIQUE(set) words
lists = dataRDD.map(lambda x: list(set(x.strip().split(' ')))).collect()
all = []
#combine all dictionaries together (fastest solution for Python)
for l in lists:
    all.extend(l)
dict = set(all)
print len(dict)
#it is faster to know the position of the word if we put it as values in a dictionary
Example #5
    words = [w for w in words if not any(i in w for i in ['/', '<', '>'])]
    return words


def FinalPredict(name_text, dictionary, model):
    #words=name_text[1].strip().split(' ')
    words = doc2words(name_text[1])
    vector_dict = {}
    for w in words:
        if (w in dictionary):
            vector_dict[dictionary[w]] = 1
    return (name_text[0],
            model.predict(SparseVector(len(dictionary), vector_dict)))


data, Y = lf.loadLabeled("/Users/sofia/Desktop/big-data-project/data/train")
print "data loaded, length {}".format(len(data))
dataRDD = sc.parallelize(data, numSlices=16)
lists = dataRDD.map(doc2words)
#lists=dataRDD.map(doc2words).collect()

# create dict
all = []
for l in lists.collect():
    all.extend(l)
dict = set(all)

# TF-IDF
hashingTF = HashingTF(numFeatures=len(dict))
tf = hashingTF.transform(lists)
tf.cache()
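#A minimal continuation sketch (added): the original example stops at tf.cache(), so
#the IDF step below is an assumption; MLlib's IDF estimator is the usual companion of
#HashingTF for turning term-frequency vectors into tf-idf vectors.
from pyspark.mllib.feature import IDF
idf = IDF().fit(tf)         #learn the inverse document frequencies over the corpus
tfidf = idf.transform(tf)   #RDD of tf-idf SparseVectors, one per document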
import loadFiles as lf

### warning here
import extract_terms as et
import preProcess as pp
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

#### TODO change to the cluster folders small
data,Y=lf.loadLabeled("D://Data//DSSP//Data Camp 2//xcode//trainsmall")

#/home/xavier.callens/DataCamp
#data,Y=lf.loadLabeled("/home/xavier.callens/DataCamp/train")

print data
#analyzer and preprocessor don't work together... use one or the other
pp.proc(data) # we apply the preprocessing by ourselves
m = TfidfVectorizer(analyzer=et.terms)

tt = m.fit_transform(data)
print tt
rows,cols=tt.shape
print "number of features :" +str(len(m.get_feature_names())) #this is the same as the number of columns
#the function get_feature_names() returns a list of all the terms found 

print "non compressed matrix expected size:" + str(rows*cols*8/(1024*1024*1024))+"GB"