class Searcher():

    def __init__(self):
        self.conf = SparkConf().setMaster("local").setAppName("Searcher")
        self.sc = SparkContext(conf=self.conf)

    def load_data(self, data_file):
        raw_data = self.sc.textFile(data_file)
        fields = raw_data.map(lambda x: x.split("\t"))
        self.documents = fields.map(lambda x: x[3].split(" "))
        self.document_names = fields.map(lambda x: x[1])

    def hashing(self, size):
        self.hashing_TF = HashingTF(size)  # 100K hash buckets just to save some memory
        tf = self.hashing_TF.transform(self.documents)
        tf.cache()
        idf = IDF(minDocFreq=2).fit(tf)
        self.tfidf = idf.transform(tf)

    def search(self, search_text):
        search_text_TF = self.hashing_TF.transform([search_text])
        search_text_hash_value = int(search_text_TF.indices[0])
        search_text_relevance = self.tfidf.map(lambda x: x[search_text_hash_value])
        return search_text_relevance.zip(self.document_names)
def main():
    """ Driver program for a spam filter using Spark and MLLib """

    # Create the Spark Context for parallel processing
    sc = SparkContext(appName="Spam Filter")

    # Load the spam and ham data files into RDDs
    spam = sc.textFile(
        "E:\\Personal\\Imp Docs\\Spark Projects\\Spam-Ham\\20050311_spam_2.tar\\20050311_spam_2\\spam.txt")
    ham = sc.textFile(
        "E:\\Personal\\Imp Docs\\Spark Projects\\Spam-Ham\\20030228_easy_ham.tar\\20030228_easy_ham\\ham.txt")

    # Create a HashingTF instance to map email text to vectors of 10,000 features.
    tf = HashingTF(numFeatures=10000)

    # Each email is split into words, and each word is mapped to one feature.
    spamFeatures = spam.map(lambda email: tf.transform(email.split(" ")))
    hamFeatures = ham.map(lambda email: tf.transform(email.split(" ")))

    # Create LabeledPoint datasets for positive (spam) and negative (ham) data points.
    positiveExamples = spamFeatures.map(lambda features: LabeledPoint(1, features))
    negativeExamples = hamFeatures.map(lambda features: LabeledPoint(0, features))

    # Combine positive and negative datasets into one
    data = positiveExamples.union(negativeExamples)

    # Split the data into 70% for training and 30% test data sets
    (trainingData, testData) = data.randomSplit([0.7, 0.3])

    # Cache the training data to optimize the Logistic Regression
    trainingData.cache()

    # Train the model with Logistic Regression using the SGD algorithm.
    model = LogisticRegressionWithSGD.train(trainingData)

    # Create tuples of actual and predicted values
    labels_and_predictions = testData.map(
        lambda email: (email.label, model.predict(email.features)))

    # Calculate the error rate as number wrong / total number
    error_rate = labels_and_predictions.filter(
        lambda (val, pred): val != pred).count() / float(testData.count())

    # End the Spark Context
    sc.stop()

    # Print out the error rate
    print("*********** SPAM FILTER RESULTS **********")
    print("\n")
    print("Error Rate: " + str(error_rate))
    print("\n")

    # Serialize the model for persistence
    pickle.dump(model, open("spamFilter.pkl", "wb"))
def init_tranining_set(sc):
    """
    Merge the positive/negative sentiment word lists.

    param: sc  the SparkContext
    """
    words = sc.textFile('traning_words.csv')

    # Build the term-frequency matrix
    hashingTF = HashingTF()
    tf = hashingTF.transform(words)

    # Compute the TF-IDF matrix
    idfModel = IDF().fit(tf)
    tfidf = idfModel.transform(tf)
    tf.cache()

    with open('NBmodel.pkl', 'r') as f:
        NBmodel = pickle.load(f)

    session = get_session(settings.DB_URL)
    for r in session.execute('select * from traning_collection').fetchall():
        yourDocument = r[3]
        print r[3]
        yourwords = "/".join(jieba.cut_for_search(yourDocument)).split("/")
        yourtf = hashingTF.transform(yourwords)
        yourtfidf = idfModel.transform(yourtf)
        print('NaiveBayes Model Predict:', NBmodel.predict(yourtfidf))
def analyse_data(self, data):
    """
    Run the appropriate analysis on the incoming data

    param data: file, unicode, or str
    """
    words = self.sc.textFile(self.training_words_dir)

    # Build the term-frequency matrix
    hashingTF = HashingTF()
    tf = hashingTF.transform(words)

    # Compute the TF-IDF matrix
    idfModel = IDF().fit(tf)
    tfidf = idfModel.transform(tf)
    tf.cache()

    with open(self.NBmodel, 'r') as f:
        NBmodel = pickle.load(f)

    # Tokenize first, then analyse
    yourwords = set("/".join(jieba.cut_for_search(data)).split("/"))
    print 'Tokenization result: {}'.format(yourwords)
    yourtf = hashingTF.transform(yourwords)
    yourtfidf = idfModel.transform(yourtf)
    return NBmodel.predict(yourtfidf), data
def column_search(words, row_filter):
    if row_filter == 'n' or row_filter == 'N':
        min_row = 0
    else:
        min_row = row_filter

    rawData = table_cols.join(master_index, master_index["Table_Name"] == table_cols["Name"]).rdd
    data = rawData.map(lambda x: (x['Doc_ID'], x['Columns'])).map(parse)
    titles = data.map(lambda x: x[0])
    documents = data.map(lambda x: x[1])

    hashingTF = HashingTF()
    tf = hashingTF.transform(documents)
    tf.cache()
    idf = IDF().fit(tf)
    normalizer = Normalizer()
    tfidf = normalizer.transform(idf.transform(tf))
    tfidfData = titles.zip(tfidf).toDF(["label", "features"])

    query = parse((0, words))[1]
    queryTF = hashingTF.transform(query)
    queryTFIDF = normalizer.transform(idf.transform(queryTF))

    queryRelevance = tfidfData.rdd.map(lambda x: (x[0], float(x[1].dot(queryTFIDF)))).sortBy(lambda x: -x[1]).filter(lambda x: x[1] > 0)
    queryRelevance = queryRelevance.toDF(["Doc_ID", "scores"])
    queryRelevance = queryRelevance.join(table_desc, queryRelevance.Doc_ID == table_desc.Doc_ID).select(table_desc.Doc_ID, queryRelevance.scores, table_desc.Columns)
    queryRelevance = queryRelevance.join(master_index, master_index.Doc_ID == queryRelevance.Doc_ID).select(master_index.Table_Name, master_index.Table_Length, queryRelevance.Columns, queryRelevance.scores)
    queryRelevance = queryRelevance.rdd.filter(lambda x: int(x['Table_Length']) >= int(min_row))

    if (queryRelevance.isEmpty()):
        print("Sorry, nothing matched in column search, please try a different keyword")
    else:
        print("Here is your column search result")
        queryRelevance.toDF().show()
def task2():
    # Print title with Machine Learning Classification
    print("-------------------------------------------")
    startTitle = time.time()

    regex1 = re.compile(".*(title:).*")
    find1 = [m.group(0) for l in data for m in [regex1.search(l)] if m]
    title = [i.split('title: ', 1)[1] for i in find1]

    Programming = sc.textFile(fileProgramming)
    Other = sc.textFile(fileOther)

    # Create a HashingTF instance to map title text to vectors of 100,000 features.
    tf = HashingTF(numFeatures=100000)

    # Each title is split into words, and each word is mapped to one feature.
    programmingFeatures = Programming.map(lambda title: tf.transform(title.split(" ")))
    otherFeatures = Other.map(lambda title: tf.transform(title.split(" ")))

    # Create LabeledPoint datasets for positive (programming) and negative (other) examples.
    positiveExamples = programmingFeatures.map(lambda features: LabeledPoint(1, features))
    negativeExamples = otherFeatures.map(lambda features: LabeledPoint(0, features))
    trainingData = positiveExamples.union(negativeExamples)
    trainingData.cache()

    # Run Logistic Regression using the SGD algorithm.
    model = LogisticRegressionWithSGD.train(trainingData)

    listResult = []
    for row in title:
        test = tf.transform(row.split(" "))
        result = "null"
        if model.predict(test) == 1:
            result = "Programmings"
        else:
            result = "Non-Programming"
        joinResult = row + " = " + result
        listResult.append(joinResult)

    for i in listResult:
        if 'Non-Programming' in i:
            print(i)
    for i in listResult:
        if 'Programmings' in i:
            print(i)

    endTitle = time.time()
    elapsedTitle = endTitle - startTitle
    print(elapsedTitle)
    print("-------------------------------------------")
def main():
    """ Driver program for a spam filter using Spark and MLLib """

    # Consolidate the individual email files into a single spam file
    # and a single ham file
    makeDataFileFromEmails("data/spam_2/", "data/spam.txt")
    makeDataFileFromEmails("data/easy_ham_2/", "data/ham.txt")

    # Create the Spark Context for parallel processing
    sc = SparkContext(appName="Spam Filter")

    # Load the spam and ham data files into RDDs
    spam = sc.textFile("data/spam.txt")
    ham = sc.textFile("data/ham.txt")

    # Create a HashingTF instance to map email text to vectors of 10,000 features.
    tf = HashingTF(numFeatures=10000)

    # Each email is split into words, and each word is mapped to one feature.
    spamFeatures = spam.map(lambda email: tf.transform(email.split(" ")))
    hamFeatures = ham.map(lambda email: tf.transform(email.split(" ")))

    # Create LabeledPoint datasets for positive (spam) and negative (ham) data points.
    positiveExamples = spamFeatures.map(lambda features: LabeledPoint(1, features))
    negativeExamples = hamFeatures.map(lambda features: LabeledPoint(0, features))

    # Combine positive and negative datasets into one
    data = positiveExamples.union(negativeExamples)

    # Split the data into 70% for training and 30% test data sets
    (trainingData, testData) = data.randomSplit([0.7, 0.3])

    # Cache the training data to optimize the Logistic Regression
    trainingData.cache()

    # Train the model with Logistic Regression using the SGD algorithm.
    model = LogisticRegressionWithSGD.train(trainingData)

    # Create tuples of actual and predicted values
    labels_and_predictions = testData.map(
        lambda email: (email.label, model.predict(email.features)))

    # Calculate the error rate as number wrong / total number
    error_rate = labels_and_predictions.filter(
        lambda (val, pred): val != pred).count() / float(testData.count())

    print("*********** SPAM FILTER RESULTS **********")
    print("\n")
    print("Error Rate: " + str(error_rate))
    print("\n")

    # Serialize the model for persistence
    pickle.dump(model, open("spamFilter.pkl", "wb"))

    sc.stop()
def main():
    sc = SparkContext(appName="BayesClassifer")
    htf = HashingTF(50000)
    data = sc.textFile('/home/varshav/work/PycharmProjects/Sentiment/cleaned_bayes_labels.csv')
    data_cleaned = data.map(lambda line: line.split(","))

    # Create an RDD of LabeledPoints using category labels as labels and tokenized, hashed text as feature vectors
    data_hashed = data_cleaned.map(lambda (label, text): LabeledPoint(label, htf.transform(text)))
    data_hashed.persist()
    # data = sc.textFile('/home/admin/work/spark-1.4.1-bin-hadoop2.4/data/mllib/sample_naive_bayes_data.txt').map(parseLine)
    # print data

    # Split data approximately into training (70%) and test (30%)
    training, test = data_hashed.randomSplit([0.70, 0.30], seed=0)

    sameModel = NaiveBayesModel.load(sc, "/home/varshav/work/PycharmProjects/StockAnalysis/myModel")
    print "----------"
    print sameModel.predict(htf.transform("posts jump in net profit"))

    predictionAndLabel = test.map(lambda p: (sameModel.predict(p.features), p.label))
    predictionAndLabel1 = training.map(lambda p: (sameModel.predict(p.features), p.label))
    prediction = 1.0 * predictionAndLabel.filter(lambda (x, v): x == v).count() / test.count()
    prediction1 = 1.0 * predictionAndLabel1.filter(lambda (x, v): x == v).count() / training.count()

    buy_buy = 1.0 * predictionAndLabel.filter(lambda (x, v): x == 1 and v == 1).count()

    # Instantiate metrics object
    metrics = MulticlassMetrics(predictionAndLabel)

    # Overall statistics
    precision = metrics.precision()
    precision = normalize(precision)
    recall = metrics.recall()
    recall = normalize(recall)
    f1Score = metrics.fMeasure()
    f1Score = normalize(f1Score)
    print("Summary Stats")
    print("Precision = %s" % precision)
    print("Recall = %s" % recall)
    print("F1 Score = %s" % f1Score)

    '''
    # Statistics by class
    labels = data_hashed.map(lambda lp: lp.label).distinct().collect()
    for label in sorted(labels):
        print("Class %s precision = %s" % (label, metrics.precision(label)))
        print("Class %s recall = %s" % (label, metrics.recall(label)))
        print("Class %s F1 Measure = %s" % (label, metrics.fMeasure(label, beta=1.0)))
    '''
def entrenar_spam(sc, sql_context, dir_spam, dir_no_spam, num_trees=20, max_depth=8):
    input_spam = sc.textFile(dir_spam)
    input_no_spam = sc.textFile(dir_no_spam)

    spam = sql_context.read.json(input_spam).select("text").withColumn("label", F.lit(1.0))
    no_spam = sql_context.read.json(input_no_spam).select("text").withColumn("label", F.lit(0.0))

    training_data = spam.unionAll(no_spam)

    tokenizer = Tokenizer(inputCol="text", outputCol="words")
    wordsData = tokenizer.transform(training_data)

    hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=140)
    featurizedData = hashingTF.transform(wordsData)

    """idf = IDF(inputCol="rawFeatures", outputCol="features")
    idfModel = idf.fit(featurizedData)
    rescaledData = idfModel.transform(featurizedData)"""

    seed = 1800009193L
    (split_20_df, split_80_df) = featurizedData.randomSplit([20.0, 80.0], seed)

    test_set_df = split_20_df.cache()
    training_set_df = split_80_df.cache()

    rf = RandomForestClassifier().setLabelCol("label") \
                                 .setPredictionCol("predicted_label") \
                                 .setFeaturesCol("rawFeatures") \
                                 .setSeed(100088121L) \
                                 .setMaxDepth(max_depth) \
                                 .setNumTrees(num_trees)

    rf_pipeline = Pipeline()
    rf_pipeline.setStages([rf])

    reg_eval = MulticlassClassificationEvaluator(
        predictionCol="predicted_label", labelCol="label", metricName="accuracy")

    crossval = CrossValidator(estimator=rf_pipeline, evaluator=reg_eval, numFolds=5)
    param_grid = ParamGridBuilder().addGrid(rf.maxBins, [50, 100]).build()
    crossval.setEstimatorParamMaps(param_grid)

    modelo = crossval.fit(training_set_df).bestModel

    predictions_and_labels_df = modelo.transform(test_set_df)
    accuracy = reg_eval.evaluate(predictions_and_labels_df)

    return modelo, accuracy
def main():
    # Initialize the SparkContext
    sc = spark_context(spark_master)

    # Read the input file
    data = sc.textFile(hdfs_path)

    # Tokenize
    documents = data.map(tokenize)
    documents.cache()

    # TF
    hashingTF = HashingTF()
    tf = hashingTF.transform(documents)

    # IDF
    idf = IDF(minDocFreq=2).fit(tf)

    # TF-IDF
    tfidf = idf.transform(tf)

    # Connect to MongoDB
    from pymongo import MongoClient
    mongo_client = MongoClient(mongo_host)
    mongo_client.admin.authenticate(mongo_user, mongo_pass, mechanism='SCRAM-SHA-1')
    clear_mongodb(mongo_client)

    # zip
    term_tfidf = documents.zip(tfidf).map(doc_tfidf)
    articles = term_tfidf.flatMap(lambda i: i).reduceByKey(lambda x, y: x + y)
    for article in articles.collect():
        item = {}
        item['text'] = article[0].encode('utf-8')
        item['size'] = int(article[1] * 10)
        send_mongodb(mongo_client, item)
def tfidf(self):
    self._create_rdd()
    hashingTF = HashingTF()
    tf = hashingTF.transform(self.token_rdd)
    idf = IDF(minDocFreq=2).fit(tf)
    tfidf = idf.transform(tf)
    return tfidf
def createHashData(rdd):
    original = rdd.map(lambda line: line.split(", "))

    # load up the json string
    data = rdd.map(lambda line: line.split(", ")).collect()

    def fn(line):
        label = 0.0
        if line[9] == 'title':
            label = 1.0
        # use the first nine fields of the current line as the features
        # (the original sliced the collected list `data[0:9]`, i.e. whole rows,
        # instead of the fields of this line)
        return (label, line[0:9])

    # create paired data
    data_pared = original.map(fn)
    print data_pared

    htf = HashingTF(100)

    # hash data
    data_hashed = data_pared.map(lambda (label, f): LabeledPoint(label, htf.transform(f)))

    return data_hashed
def main(sc):
    stopset = set(stopwords.words('english'))

    tweets = sc.textFile('hdfs:/adi/sample.txt')
    words = tweets.map(lambda word: word.split(" "))
    wordArr = []
    for wArr in words.collect():
        tempArr = []
        for w in wArr:
            if not w in stopset:
                tempArr.append(w)
        wordArr.append(tempArr)
    # print wordArr
    # tokens = sc.textFile("hdfs:/adi/tokens1.txt")

    # Load documents (one per line).
    documents = sc.textFile("hdfs:/adi/tokens1.txt").map(lambda line: line.split(" "))

    numDims = 100000
    hashingTF = HashingTF(numDims)
    tf = hashingTF.transform(documents)
    tf.cache()
    idf = IDF().fit(tf)
    tfidf = idf.transform(tf)
    tfidf.count()

    model = KMeans.train(tfidf, 5)
    model.save(sc, "tweetModel1")
    print("Final centers: " + str(model.clusterCenters))
    # print("Total Cost: " + str(model.computeCost(data)))
    sc.stop()
def process_data(data):

    print("Processing data ...")

    if (not data.isEmpty()):
        nbModel = bc_model.value
        hashingTF = HashingTF(100000)
        tf = hashingTF.transform(data.map(lambda x: x[0].encode('utf-8', 'ignore')))
        tf.cache()
        idf = IDF(minDocFreq=2).fit(tf)
        tfidf = idf.transform(tf)
        tfidf.cache()
        prediction = nbModel.predict(tfidf)

        temp = []
        i = 0
        for p, q, r in data.collect():
            temp.append([])
            temp[i].append(p.encode('utf-8', 'ignore'))
            temp[i].append(q)
            temp[i].append(r)
            i += 1
        i = 0
        for p in prediction.collect():
            temp[i].append(p)
            i += 1

        print(temp)

        for i in temp:
            insert_tweet(str(i[0]), str(i[1]), "0", int(i[3]), int(i[2]))
    else:
        print("Empty RDD !!!")
        pass
def TFIDF(source, destination):

    if destination[-1] != '/':
        destination = destination + '/'

    ## typically define the source message
    rdd = sc.wholeTextFiles(source).map(lambda (name, text): text.split())
    tf = HashingTF()
    tfVectors = tf.transform(rdd).cache()
    a = tfVectors.collect()

    # Storing the TF values above in individual files, one per link
    ind = 0
    for vector in a:
        dest_path = destination + "TF_%d" % ind + ".txt"
        ind = ind + 1
        file = open(dest_path, 'w')
        file.write(str(vector))
        file.close()

    # Calculating IDF Values for each case.
    idf = IDF()
    idfModel = idf.fit(tfVectors)
    tfIdfVectors = idfModel.transform(tfVectors)

    # Writing TF-IDF values to a single file.
    file = open(destination + "TF-IDF.txt", 'w')
    file.write(str(tfIdfVectors.collect()))

    try:
        for i in range(0, 100):
            print ""  # Testing Printing
    except KeyboardInterrupt:
        pass
def run_tf_idf_spark_mllib(df, numFeatures=1 << 20):
    tokenizer = Tokenizer(inputCol="body", outputCol="words")
    wordsData = tokenizer.transform(df)

    words = wordsData.select("words").rdd.map(lambda x: x.words)

    hashingTF = MllibHashingTF(numFeatures)
    tf = hashingTF.transform(words)
    tf.cache()

    idf = MllibIDF().fit(tf)
    tfidf = idf.transform(tf)

    # @TODO make this nicer
    tmp = sqlContext.createDataFrame(wordsData.rdd.zip(tfidf), ["data", "features"])
    tmp.registerTempTable("tmp")
    old_columns = ', '.join(map(lambda x: 'data.%s' % x, wordsData.columns))
    with_features = sqlContext.sql("SELECT %s, features FROM tmp" % old_columns)

    tmp = sqlContext.createDataFrame(with_features.rdd.zip(tf), ["data", "rawFeatures"])
    tmp.registerTempTable("tmp")
    old_columns = ', '.join(map(lambda x: 'data.%s' % x, with_features.columns))

    return sqlContext.sql("SELECT %s, rawFeatures FROM tmp" % old_columns)
def generatedHashedFeatures(tweet):
    # The original stub referenced an undefined `text` variable; here we assume
    # `tweet` is the raw tweet text and use a placeholder label of 0.0.
    label = 0.0                # get label from tweet (placeholder)
    text = tweet.split(" ")    # get text from tweet (split on spaces)
    htf = HashingTF(50000)
    lp = LabeledPoint(label, htf.transform(text))
    return lp
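A minimal usage sketch for the helper above, assuming a local SparkContext and treating each input string as one tweet (the RDD contents and app name are illustrative, not from the original):

from pyspark import SparkContext
from pyspark.mllib.feature import HashingTF       # needed by generatedHashedFeatures
from pyspark.mllib.regression import LabeledPoint

sc = SparkContext(appName="tweet-features-demo")
tweets = sc.parallelize(["spark streaming is fun", "just had lunch"])
labeled_points = tweets.map(generatedHashedFeatures)
print(labeled_points.first())   # e.g. (0.0,(50000,[...],[...]))
sc.stop()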
def main():
    # Reading the json file
    reviews_data = sqlContext.read.json(input)
    reviews = reviews_data.select('reviewText')
    reviews_rdd = reviews.rdd.cache()
    rdd_data = reviews_rdd.map(lambda line: str(line.reviewText))
    transformed_data = rdd_data.map(transform_data)

    # Finding the TF-IDF representation
    hashingTF = HashingTF()
    tf = hashingTF.transform(transformed_data)
    tf.cache()
    idf = IDF().fit(tf)
    tfidf = idf.transform(tf).collect()

    # Normalization
    # tfidf = idf.transform(tf)
    # normalizer1 = Normalizer()
    # normalized_vector = normalizer1.transform(tfidf).collect()

    score_rdd = reviews_data.rdd.map(lambda line: str(line.overall)).cache().collect()
    dates_rdd = reviews_data.rdd.map(lambda line: str(line.reviewTime)).map(lambda line: line.split(", ")).map(lambda (a, b): b).cache().collect()

    combinedList = zip(tfidf, score_rdd, dates_rdd)
    combinedRDD = sc.parallelize(combinedList).cache()

    TrainRDD = combinedRDD.filter(lambda (x, y, z): z != '2014').map(lambda (x, y, z): (x, y))
    TestRDD = combinedRDD.filter(lambda (x, y, z): z == '2014').map(lambda (x, y, z): (x, y))

    # Saving test and training data
    TrainRDD.saveAsPickleFile(output + '/Train_data_unnormalized.pickle')
    TestRDD.saveAsPickleFile(output + '/Test_data_unnormalized.pickle')
def get_feature_vectors(sc, input_file, feature_dimensions):
    """Get feature vector from the lines in input_file_obj using TF/IDF.

    Returns:
        vectors RDD
    """

    # Load documents (one per line).
    tweet_file = sc.textFile(input_file)
    input_text_rdd = tweet_file.map(lambda line: _tokenize(line))
    input_text_rdd.cache()

    # The default feature dimension is 2^20; for a corpus with million
    # tweets recommended dimensions are 50000 or 100000. Use higher
    # dimensions for larger corpus of tweets.
    hashing_tf = HashingTF(feature_dimensions)
    tf = hashing_tf.transform(input_text_rdd)
    tf.cache()
    idf = IDF(minDocFreq=2).fit(tf)
    tfidf = idf.transform(tf)
    tfidf.cache()

    return input_text_rdd, tfidf
def main():
    conf = SparkConf().setAppName("twitterclassifier")
    sc = SparkContext(conf=conf)
    ssc = StreamingContext(sc, 10)

    tweets = ssc.socketTextStream("localhost", PORT) \
                .map(lambda x: json.loads(x)) \
                .filter(lambda x: 'text' in x) \
                .map(lambda x: x['text'].encode('utf-8'))

    hasher = HashingTF(DIM)
    features = tweets.map(lambda x: (x, hasher.transform(featurize(x)))).cache()

    # We create a model with random clusters and specify the number of clusters to find
    # decay = 1: total memory; decay = 0: no memory
    model = StreamingKMeans(k=N, decayFactor=0.1).setRandomCenters(DIM, 1.0, 0)
    model.trainOn(features.map(lambda x: x[1]))
    results = model.predictOnValues(features).cache()

    # Need a closure over i here.
    def print_group(i):
        results.filter(lambda x: x[1] == i).map(lambda x: '%i: %s' % (x[1], x[0])).pprint(3)

    for i in xrange(N):
        print_group(i)

    ssc.start()
    ssc.awaitTermination()
def tfidf(rdd_doc):
    hashingTF = HashingTF()
    trainTf = hashingTF.transform(rdd_doc)
    trainTf.cache()
    idf = IDF().fit(trainTf)
    trainTfidf = idf.transform(trainTf)
    trainTfidf.cache()
    return trainTfidf, lambda x: hashingTF.indexOf(x)
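A possible follow-up showing how the second return value (the term-to-bucket index function) can be used to read a single term's TF-IDF weight back out of the vectors; the SparkContext, app name and document contents below are illustrative assumptions, not part of the original snippet:

from pyspark import SparkContext

sc = SparkContext(appName="tfidf-lookup-demo")
rdd_doc = sc.parallelize([["spark", "hashing", "tf"], ["spark", "idf", "idf"]])
trainTfidf, index_of = tfidf(rdd_doc)
first_vector = trainTfidf.first()
# Look up the hash bucket of "spark" and read its weight in the first document.
print(first_vector.toArray()[index_of("spark")])
sc.stop()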
def transform(idf, article):
    """ transform article to a sparse vector """
    token = tokenizing(article)
    hashingTF = HashingTF()
    tf_test = hashingTF.transform(token)
    return idf.transform(tf_test)
def get_tfidf_features(txt):
    hashingTF = HashingTF()
    tf = hashingTF.transform(txt)
    tf.cache()
    idf = IDF().fit(tf)
    tfidf = idf.transform(tf)

    return tfidf
def generate_gender_tf(twProfilesRdd, numFe):
    """
    Generate Term Frequency tuple (gender, sparse vector) from an rdd containing following tuples:
    (gender, (clean words tuple))
    """
    tf = HashingTF(numFeatures=numFe)
    return twProfilesRdd.map(lambda genderDescrTuple: (genderDict[genderDescrTuple[0]], tf.transform(genderDescrTuple[1])))
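One way the (gender, term-frequency vector) pairs produced above could feed a classifier; a hedged sketch that assumes `twProfilesRdd` and `genderDict` from the original snippet already exist and that the mapped gender labels are numeric:

from pyspark.mllib.classification import NaiveBayes
from pyspark.mllib.regression import LabeledPoint

# Turn each (label, sparse vector) tuple into a LabeledPoint and fit Naive Bayes
# (term frequencies are non-negative, which Naive Bayes requires).
gender_tf = generate_gender_tf(twProfilesRdd, 10000)
labeled = gender_tf.map(lambda pair: LabeledPoint(pair[0], pair[1]))
model = NaiveBayes.train(labeled, 1.0)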
def create_bayes(self):
    """ Create the Naive Bayes training model """
    if self._check_traning_exists():
        return

    # Build an RDD from the positive-sentiment text
    positive_file = os.path.join(settings.DATA_DIR, '分类词库/positive.txt')
    positive_data = self.sc.textFile(positive_file)
    # Deduplicate the data
    positive_data = positive_data.distinct()
    positive_data = positive_data.map(
        lambda line: line.split('###')).filter(lambda line: len(line) == 2)

    # Build an RDD from the negative-sentiment text
    negative_file = os.path.join(settings.DATA_DIR, '分类词库/negative.txt')
    negative_data = self.sc.textFile(negative_file)
    negative_data = negative_data.distinct()
    negative_data = negative_data.map(
        lambda line: line.split('###')).filter(lambda line: len(line) == 2)

    # Merge the training sets
    all_data = negative_data.union(positive_data)
    all_data.repartition(1)
    # The ratings were preprocessed in advance, so they are only -1 or 1
    rate = all_data.map(lambda s: s[0])
    document = all_data.map(lambda s: s[1])

    words = document.map(lambda w: "/".join(jieba.cut_for_search(w))).map(lambda line: line.split("/"))

    # Build the term-frequency matrix
    hashingTF = HashingTF()
    tf = hashingTF.transform(words)

    # Compute the TF-IDF matrix
    idfModel = IDF().fit(tf)
    tfidf = idfModel.transform(tf)
    tf.cache()

    # Build the training and test sets
    zipped = rate.zip(tfidf)
    data = zipped.map(lambda line: LabeledPoint(line[0], line[1]))
    training, test = data.randomSplit([0.6, 0.4], seed=0)

    # Train the Naive Bayes classification model
    NBmodel = NaiveBayes.train(training, 1.0)
    predictionAndLabel = test.map(lambda p: (NBmodel.predict(p.features), p.label))
    accuracy = 1.0 * predictionAndLabel.filter(
        lambda x: 1.0 if x[0] == x[1] else 0.0).count() / test.count()

    # Persist the words rdd
    words.repartition(1).saveAsTextFile(self.training_words_dir)

    # Persist the Naive Bayes model with pickle
    with open(self.NBmodel, 'w') as f:
        pickle.dump(NBmodel, f)
def tfidf(self, tokenizer):
    """
    Get TFIDF matrix rdd with spark tfidf functions
    """
    self._create_rdd(tokenizer)
    hashingTF = HashingTF()
    tf = hashingTF.transform(self.token_rdd)
    idf = IDF(minDocFreq=2).fit(tf)
    tfidf = idf.transform(tf)
    return self.rdd, idf, tfidf
def tf_idf(sc, title_token):
    hashingTF = HashingTF(100)
    title_token = sc.parallelize(title_token)
    tf = hashingTF.transform(title_token)
    print tf, ' tf'

    idf = IDF().fit(tf)
    tfidf = idf.transform(tf)
    return tfidf
def vectorize(sc, rdd_words, size=0):
    '''
    Vectorize the words with TF.
    The dimension of the vectors has to be set; it defaults to 2^20.
    '''
    if not size:
        size = rdd_words.flatMap(lambda x: x).distinct().count() + 10000
    hashingTF = HashingTF(size)
    tf = hashingTF.transform(rdd_words)
    return tf
def mySpark(minFreq, keyWord):

    # text cleaning function
    def removePunctuation(text):
        res = text.lower().strip()
        res = re.sub("[^0-9a-zA-Z ]", "", res)
        return res.split(" ")

    # Function for printing each element in RDD
    def println(x):
        for i in x:
            print i

    # Boilerplate Spark stuff:
    conf = SparkConf().setMaster("local").setAppName("SparkTFIDF")
    sc = SparkContext(conf=conf)

    # Load documents content (one per line) + cleaning.
    rawData = sc.textFile("list_berita-30.tsv")
    fields = rawData.map(lambda x: x.split("\t"))
    documents = fields.map(lambda x: removePunctuation(x[3]))

    # Get documents content without word mapping
    documentNames = fields.map(lambda x: x[3])

    # TF processing
    hashingTF = HashingTF(100000)  # 100K hash buckets just to save some memory
    tf = hashingTF.transform(documents)

    # IDF & TF-IDF processing
    tf.cache()
    idf = IDF(minDocFreq=int(minFreq)).fit(tf)
    tfidf = idf.transform(tf)

    # Get keyword relevance with content and zip it
    keywordTF = hashingTF.transform(removePunctuation(keyWord))
    keywordHashValue = int(keywordTF.indices[0])
    keywordRelevance = tfidf.map(lambda x: x[keywordHashValue])
    zippedResults = keywordRelevance.zip(documentNames)

    # print result
    print "Best document for keywords is:"
    print zippedResults.max()
def extractKeywords_Train(self):
    documents = self.sc.textFile(self.trainingfile).map(lambda line: line.split(" ")[1:])

    hashingTF = HashingTF()
    tf = hashingTF.transform(documents)
    tf.cache()

    idfIgnore = IDF(minDocFreq=2).fit(tf)
    tfidfIgnore = idfIgnore.transform(tf)

    tfidfIgnore.saveAsTextFile("AAA")
def tf_idf_cal(words_rdd):
    hashingTF = HashingTF()
    tf = hashingTF.transform(words_rdd)

    idf = IDF().fit(tf)
    tfidf = idf.transform(tf).cache()

    tfidf_str = tfidf.map(lambda line: str(line)).cache()

    return tfidf_str
def test_binary_term_freqs(self):
    hashingTF = HashingTF(100).setBinary(True)
    doc = "a a b c c c".split(" ")
    n = hashingTF.numFeatures
    output = hashingTF.transform(doc).toArray()
    expected = Vectors.sparse(n, {hashingTF.indexOf("a"): 1.0,
                                  hashingTF.indexOf("b"): 1.0,
                                  hashingTF.indexOf("c"): 1.0}).toArray()
    for i in range(0, n):
        self.assertAlmostEqual(output[i], expected[i], 14, "Error at " + str(i) +
                               ": expected " + str(expected[i]) + ", got " + str(output[i]))
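Outside of the unit test, the effect of setBinary(True) can be seen directly, since HashingTF.transform also accepts a plain Python list of terms; a small sketch (assuming the three terms do not collide in 100 buckets):

from pyspark.mllib.feature import HashingTF

doc = "a a b c c c".split(" ")
raw_tf = HashingTF(100)                      # default: raw term counts
binary_tf = HashingTF(100).setBinary(True)   # presence/absence only

print(raw_tf.transform(doc).toArray()[raw_tf.indexOf("c")])        # 3.0
print(binary_tf.transform(doc).toArray()[binary_tf.indexOf("c")])  # 1.0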
def predictSentiment(tweetText):
    nbModel = bc_model.value

    hashingTF = HashingTF()
    tf = hashingTF.transform(tweetText)
    tf.cache()
    idf = IDF(minDocFreq=2).fit(tf)
    tfidf = idf.transform(tf)
    tfidf.cache()
    prediction = nbModel.predict(tfidf)

    print "Predictions for this window :"
    for i in range(0, prediction.count()):
        print prediction.collect()[i], tweetText.collect()[i]
def calcTfidf(doc, source):
    """
    This method computes TF-IDF scores for the given document.
    While applying HashingTF only needs a single pass to the data, applying IDF needs two passes:
    first to compute the IDF vector and second to scale the term frequencies by IDF.
    """
    hashingTF = HashingTF(200000)
    tf = hashingTF.transform(doc)
    print "TF calculated for " + source.split('/')[-1]
    tf.cache()
    idf = IDF().fit(tf)
    ## idf = IDF(minDocFreq=2).fit(tf)
    tfidf = idf.transform(tf)
    print "TF-IDF calculated for " + source.split('/')[-1]

    return hashingTF, tfidf
def run():
    with pyspark.SparkContext('local', 'mapAndPartition') as sc:
        spam = sc.textFile('spam.txt')
        normal = sc.textFile('normal.txt')

        htf = HashingTF(numFeatures=10000)

        spamFeatures = spam.map(lambda email: htf.transform(email.split(' ')))
        normalFeatures = normal.map(lambda email: htf.transform(email.split(' ')))

        positiveExamples = spamFeatures.map(lambda features: LabeledPoint(1, features))
        negativeExamples = normalFeatures.map(lambda features: LabeledPoint(0, features))

        trainingData = positiveExamples.union(negativeExamples)
        trainingData.cache()

        model = LogisticRegressionWithSGD.train(trainingData)

        posTest = htf.transform('D M G GET cheap stuff by sending money to ...'.split(' '))
        negTest = htf.transform('Hi Dad, I started studying Spark the other ...'.split(' '))

        print('Prediction for positive test example: {}'.format(model.predict(posTest)))
        print('Prediction for negative test example: {}'.format(model.predict(negTest)))
def vectorize(training):
    hashingTF = HashingTF()
    tf_training = training.map(lambda tup: hashingTF.transform(tup[1]))
    idf_training = IDF().fit(tf_training)
    tfidf_training = idf_training.transform(tf_training)

    tfidf_idx = tfidf_training.zipWithIndex()
    training_idx = training.zipWithIndex()
    idx_training = training_idx.map(lambda line: (line[1], line[0]))
    idx_tfidf = tfidf_idx.map(lambda l: (l[1], l[0]))

    joined_tfidf_training = idx_training.join(idx_tfidf)
    training_labeled = joined_tfidf_training.map(lambda tup: tup[1])
    labeled_training_data = training_labeled.map(lambda k: LabeledPoint(k[0][0], k[1]))

    return labeled_training_data
def main():
    sc = SparkContext(appName="BayesClassifer")
    htf = HashingTF(50000)
    data = sc.textFile('/home/varshav/work/PycharmProjects/Sentiment/1.csv')
    data_cleaned = data.map(lambda line: line.split(","))

    # Create an RDD of LabeledPoints using category labels as labels and tokenized, hashed text as feature vectors
    data_hashed = data_cleaned.map(lambda (label, text): LabeledPoint(label, htf.transform(text)))
    data_hashed.persist()
    # data = sc.textFile('/home/admin/work/spark-1.4.1-bin-hadoop2.4/data/mllib/sample_naive_bayes_data.txt').map(parseLine)
    # print data

    # Split data approximately into training (70%) and test (30%)
    training, test = data_hashed.randomSplit([0.70, 0.30], seed=0)

    # Train a naive Bayes model.
    model = NaiveBayes.train(training, 1.0)

    # Save and load model
    model.save(sc, "/home/varshav/Desktop/Bangalore")
    sameModel = NaiveBayesModel.load(sc, "/home/varshav/Desktop/Bangalore")
    print "----------"
    print model.predict(htf.transform("posts jump in net profit"))

    # Make prediction and test accuracy.
    predictionAndLabel = test.map(lambda p: (sameModel.predict(p.features), p.label))
    predictionAndLabel1 = training.map(lambda p: (sameModel.predict(p.features), p.label))
    prediction = 1.0 * predictionAndLabel.filter(lambda (x, v): x == v).count() / test.count()
    # buy_buy = 1.0 * predictionAndLabel.filter(lambda (x, v): x == 1 and v == 1).count()
    # print buy_buy
    prediction1 = 1.0 * predictionAndLabel1.filter(lambda (x, v): x == v).count() / training.count()

    print prediction
    print prediction1
    sc.stop()
def main():
    # Initialize the SparkContext
    sc = spark_context(spark_master)

    # Load the data
    data = sc.textFile(hdfs_path)

    # Compute term frequencies
    documents = data.map(tokenize)
    hashingTF = HashingTF(2 << 10)
    tf = hashingTF.transform(documents)

    # Index the document term-frequency vectors
    corpus = tf.zipWithIndex().map(lambda x: [x[1], x[0]]).cache()

    # Mapping between indices and terms
    mapping = hashing_term_mapping(documents)
    mapping.cache()

    # Train the LDA model
    ldaModel = LDA.train(corpus, k=3)

    # Connect to MongoDB
    from pymongo import MongoClient
    mongo_client = MongoClient(mongo_host)
    mongo_client.admin.authenticate(mongo_user, mongo_pass, mechanism="SCRAM-SHA-1")
    clear_mongodb(mongo_client)

    # Save the results to MongoDB
    topics = ldaModel.describeTopics(maxTermsPerTopic=10)
    for topic in range(3):
        doc = {}
        doc["name"] = "topic " + str(topic)
        doc["terms"] = []
        for i in range(10):
            term_index = topics[topic][0][i]
            for term in mapping.lookup(term_index):
                doc["terms"].append([term.encode("utf8"), topics[topic][1][i]])
        send_mongodb(mongo_client, doc)
def featurize(tweet_tuple):
    """
    generate features for this tweet text

    returns: dict holding the original tweet and the feature vector for the tweet
    """
    ID_FIELD_IDX = 0
    CREATED_AT_IDX = 1
    TIMESTAMP_MS = 2
    LANG_FIELD_IDX = 3
    LON_FIELD_IDX = 4
    LAT_FIELD_IDX = 5
    TEXT_IDX = 6

    TWEET_IDX = 1

    # split the tweet into components id, lang, text, lon, lat etc
    tweet_attrib_list = tweet_tuple[TWEET_IDX].split(",")

    # get the text
    text = tweet_attrib_list[TEXT_IDX]

    # tokenize the text
    word_list = tokenize(text)

    # remove stop words
    word_list = removeStopWords(word_list)

    # remove punctuations
    word_list = removePunctuation(word_list)

    # stem the tokens
    word_list = stemmed_tokens(word_list)

    st = " ".join(word_list)

    # hash the words (the original hashed the raw `text` string, which would hash
    # individual characters; the processed token list is hashed here instead)
    htf = HashingTF(50000)
    hashedfeatures = htf.transform(word_list)

    tweet = tweet_tuple[TWEET_IDX]
    results = {'tweet': tweet, 'features': hashedfeatures}
    return results
# Initialize a SparkContext
sc = SparkContext()

# Import full dataset of newsgroup posts as text file
# data_raw = sc.textFile('hdfs://ec2-54-213-237-76.us-west-2.compute.amazonaws.com:9000/trainingdata/trainingdata/bbcjsontxt')
data_raw = sc.textFile('bbcdataset.json')

# Parse JSON entries in dataset
data = data_raw.map(lambda line: json.loads(line))

# Extract relevant fields in dataset -- category label and text content
data_pared = data.map(lambda line: (line['label'], line['text']))

# Temporary print statement for testing partial script
print data_pared.first()

# Prepare text for analysis using our tokenize function to clean it up
data_cleaned = data_pared.map(lambda (label, text): (label, tokenize(text)))

# Hashing term frequency vectorizer with 50k features
htf = HashingTF(50000)

# Create an RDD of LabeledPoints using category labels as labels and tokenized, hashed text as feature vectors
data_hashed = data_cleaned.map(lambda (label, text): LabeledPoint(hash(label), htf.transform(text)))

# Ask Spark to persist the RDD so it won't have to be re-created later
data_hashed.persist()

# Train a Naive Bayes model on the training data
model = NaiveBayes.train(data_hashed)

# model.save(sc, "hdfs://ec2-54-213-237-76.us-west-2.compute.amazonaws.com:9000/trainingdata/trainingdata/bbcmodela")
model.save(sc, "bbcmodel")
def getTFVector(review):
    htf = HashingTF(1000)
    doc = review.split()
    return htf.transform(doc).toArray()
print x

# Boilerplate Spark stuff:
conf = SparkConf().setMaster("local").setAppName("SparkTFIDF")
sc = SparkContext(conf=conf)

# Load documents (one per line).
rawData = sc.textFile("subset-small.tsv")
fields = rawData.map(lambda x: x.split("\t"))
documents = fields.map(lambda x: x[3].split(" "))

documentNames = fields.map(lambda x: x[1])

hashingTF = HashingTF(100000)  # 100K hash buckets just to save some memory
tf = hashingTF.transform(documents)

tf.cache()
idf = IDF(minDocFreq=2).fit(tf)
tfidf = idf.transform(tf)

keywordTF = hashingTF.transform(["Apollo"])
keywordHashValue = int(keywordTF.indices[0])

keywordRelevance = tfidf.map(lambda x: x[keywordHashValue])

zippedResults = keywordRelevance.zip(documentNames)

print "Best document for keywords is:"
print zippedResults.max()
from pyspark import SparkContext
from pyspark.mllib.clustering import KMeans, KMeansModel
from pyspark.mllib.feature import HashingTF, IDF

sc = SparkContext()
rdd = sc.wholeTextFiles("/usr/local/Cellar/BigDataAdvanced/Assignment1/TwitterStuff/TweetData").map(lambda (name, text): text.split())
tf = HashingTF()
tfVectors = tf.transform(rdd).cache()
a = tfVectors.collect()
count = 0
for vec in a:
    print vec
    count = count + 1
    with open("TF_Tweet" + str(count) + ".txt", "w") as f:
        f.write(str(vec))
    f.close()
idf = IDF()
idfModel = idf.fit(tfVectors)
tfIdfVectors = idfModel.transform(tfVectors)
file = open("TF-IDF_tweet.txt", 'w')
file.write(str(tfIdfVectors.collect()))
# count = 0
# output = tfIdfVectors.collect()
# for vec in output:
#     print vec
#     count = count + 1
#     with open("TF_Wiki" + str(count) + ".txt", "w") as f:
#         f.write(str(vec))
# collectVocab = vocab.collect()

# remove top 3 lines from document
doc_wo_counters = documents.mapPartitionsWithIndex(lambda i, iter: islice(iter, 3, None) if i == 0 else iter)

final_doc = doc_wo_counters.map(lambda x: (int(x[0]), doc_to_words(int(x[1]), int(x[2])).encode("utf8"))).reduceByKey(lambda x, y: x + " " + y)

vect_rep = final_doc.map(lambda x: x[1])

raw_document = sc.textFile("test.txt")
vect_rep = raw_document.map(lambda line: line.encode("utf8").split(" "))

# TfIDF
hashingTF = HashingTF()
tf = hashingTF.transform(vect_rep)
tf.cache()
idf = IDF().fit(tf)
tfidf_vectors = idf.transform(tf)

# Build the model (cluster the data)
clusters = KMeans.train(tfidf_vectors, 10, maxIterations=100)

# Evaluate clustering by computing Within Set Sum of Squared Errors
def error(point):
    center = clusters.centers[clusters.predict(point)]
    return sqrt(sum([x**2 for x in (point.toArray() - center)]))

WSSSE = tfidf_vectors.map(lambda point: error(point)).reduce(lambda x, y: x + y)
print("Within Set Sum of Squared Error = " + str(WSSSE))
# Export data to csv
train.to_csv("product_train.csv", ",", index=False)
# all.to_csv("product_all.csv", ",", index=False)

# Read the training data file created above into an RDD
train = sc.textFile("product_train.csv").map(lambda line: (line.split(',')))
header = train.first()  # extract header
train2 = train.filter(lambda x: x != header)
train_title = sc.textFile("product_train.csv").map(lambda line: (line.split(',')[1]))

hashingTF = HashingTF(50000)
tf = train_title.map(lambda title: hashingTF.transform(title.split(" ")))
tf.cache()
idf = IDF().fit(tf)
tfidf = idf.transform(tf)
print(tfidf.first())

data_pared = train2.map(lambda line: (line[0], line[1]))
data_pared2 = train2.map(lambda line: (line[0]))
train_cleaned = data_pared.map(lambda (label, text): (label, tokenize(text)))

# parsedData = train_cleaned.map(lambda (label, text): LabeledPoint(label, idf.transform(text)))
parsedData = train_cleaned.map(lambda (label, text): LabeledPoint(label, hashingTF.transform(text)))

# Split the data into two RDDs. 70% for training and 30% test data sets
(trainingData, testData) = parsedData.randomSplit([0.7, 0.3])
from pyspark.mllib.linalg import SparseVector
from pyspark.mllib.feature import HashingTF
from pyspark.mllib.feature import IDF
import math

dim = math.pow(2, 16)
hashingTF = HashingTF(dim)

tokens = manytokens_final.map(lambda l: [k for (k, v) in l])

tf = hashingTF.transform(tokens)
tf.cache()

idf = IDF().fit(tf)
tfidf = idf.transform(tf)

# print(tfidf.count())  # = 11314
from pyspark import SparkContext
from pyspark.mllib.feature import HashingTF
from pyspark.mllib.feature import IDF

# Load documents (one per line).
sc = SparkContext()
documents = sc.textFile("training/bigdata_documents_cat.txt").map(lambda line: line.split(" "))

hashingTF = HashingTF()
tf = hashingTF.transform(documents)

# ... continue from the previous example
tf.cache()
idf = IDF().fit(tf)
tfidf = idf.transform(tf)

# save the matrix
# tfidf.saveAsSequenceFile("training/matrix.txt")

tfidfmatrix = tfidf.collect()
count = str(tfidf.count())

givenIndex = 0
givenDocumentMatrix = tfidfmatrix[givenIndex]

def similarity(x):
    return givenDocumentMatrix.dot(x)

sim = tfidf.map(similarity)
indexedsim = sim.zipWithIndex().map(lambda keyval: (keyval[1], keyval[0]))
from pyspark import SparkConf, SparkContext
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.feature import HashingTF
from pyspark.mllib.classification import LogisticRegressionWithSGD

conf = SparkConf().setMaster("local").setAppName("My App")
sc = SparkContext(conf=conf)

spam = sc.textFile("/home/sakib/spark-1.3.1/spark_workspace/data/spam.txt")
normal = sc.textFile("/home/sakib/spark-1.3.1/spark_workspace/data/ham.txt")

# Create a HashingTF instance to map email text to vectors of 10,000 features.
tf = HashingTF(numFeatures=10000)

# Each email is split into words, and each word is mapped to one feature.
spamFeatures = spam.map(lambda email: tf.transform(email.split(" ")))
normalFeatures = normal.map(lambda email: tf.transform(email.split(" ")))

# Create LabeledPoint datasets for positive (spam) and negative (normal) examples.
positiveExamples = spamFeatures.map(lambda features: LabeledPoint(1, features))
negativeExamples = normalFeatures.map(lambda features: LabeledPoint(0, features))
trainingData = positiveExamples.union(negativeExamples)
trainingData.cache()  # Cache since Logistic Regression is an iterative algorithm.

# Run Logistic Regression using the SGD algorithm.
model = LogisticRegressionWithSGD.train(trainingData)

# Test on a positive example (spam) and a negative one (normal). We first apply
# the same HashingTF feature transformation to get vectors, then apply the model.
posTest = tf.transform("O M G GET cheap stuff by sending money to ...".split(" "))
negTest = tf.transform("Hi Dad, I started studying Spark the other ...".split(" "))
print "Prediction for positive test example: %g" % model.predict(posTest)
print "Prediction for negative test example: %g" % model.predict(negTest)
# Regular expressions to find all the links and text from XML files and storing them in two lists
def text_parsing(text_content):  # signature inferred from the map(text_parsing) call below
    TEXT_RE = re.compile(r'<text.+>([\s\S]*)<\/text>')
    liste = TEXT_RE.findall(text_content)
    str1 = re.split('[^a-zA-Z.]', liste[0].lower())
    str2 = filter(None, str1)
    return str2

splitRDD = dataRDD.values().map(text_parsing)

# Building tf-idf
hashingTF = HashingTF()
tf = hashingTF.transform(splitRDD)

from pyspark.mllib.feature import IDF

# ... from tf create IDF
tf.cache()
idf = IDF().fit(tf)
tfidf = idf.transform(tf)

zipped = splitRDD.zip(tfidf)

fRDD = splitRDD.flatMap(lambda x: x).distinct()
# print fRDD.count()
wordRDD = fRDD.map(lambda x: (x, hashingTF.indexOf(x)))
listW = wordRDD.collect()