def compTF(self, rdd):
    # Hash each document in the RDD into a 150,000-dimension term-frequency vector.
    tf = HashingTF(150000)
    return tf.transform(rdd)
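# A minimal usage sketch for compTF (hedged: `pipeline` as an instance of the
# enclosing class and the file name are illustrative, not from the source):
#
#   docs = sc.textFile("docs.txt").map(lambda line: line.split(" "))
#   tf_vectors = pipeline.compTF(docs)  # RDD of 150,000-dim SparseVectors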
# coding=UTF-8
from pyspark.mllib.feature import HashingTF, IDF
from pyspark import SparkContext

sentence = "hello hello world"
words = sentence.split()  # split the sentence into a list of words
tf = HashingTF(10000)  # create vectors of size S = 10,000
aa = tf.transform(words)
print(aa)

# Read several text files in as TF vectors
sc = SparkContext('local')
rdd = sc.wholeTextFiles('P51FeatureExtraction.py').map(
    lambda text: text[1].split())
tfVectors = tf.transform(rdd)  # transform the entire RDD
for v in tfVectors.collect():
    print(v)

# Using TF-IDF in Python
idf = IDF()
idfModel = idf.fit(tfVectors)
tfIdfVectors = idfModel.transform(tfVectors)
print(tfIdfVectors)
for v in tfIdfVectors.collect():
    print(v)

# Scaling vectors in Python
print('--Scaling vectors in Python--')
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.feature import StandardScaler
def produce_tfidf(x):
    # x: RDD of token lists; returns an RDD of TF-IDF SparseVectors.
    tf = HashingTF().transform(x)
    idf = IDF(minDocFreq=5).fit(tf)
    tfidf = idf.transform(tf)
    return tfidf
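# A minimal usage sketch for produce_tfidf (hedged: the file name and a live
# SparkContext `sc` are assumptions for illustration):
#
#   docs = sc.textFile("corpus.txt").map(lambda line: line.split(" "))
#   tfidf_vectors = produce_tfidf(docs)  # terms in fewer than 5 docs get IDF 0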
TschemaP = sqlContext.createDataFrame(people1, Tschema)
TschemaPeople = TschemaP.withColumn("label",
                                    TschemaP["label"].cast(DoubleType()))

regexTokenizer = RegexTokenizer(inputCol="FileContent", outputCol="words",
                                pattern="\\W")
stopwordsRemover = StopWordsRemover(
    inputCol="words", outputCol="filtered").setStopWords(nltkstop)
regexer = regexTokenizer.transform(TschemaPeople)
stop = stopwordsRemover.transform(regexer)
#tokenizer = Tokenizer(inputCol="filtered", outputCol="words")
#wordsData = tokenizer.transform(stop)

hashingTF = HashingTF(inputCol="filtered", outputCol="rawFeatures")
#hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=20)
featurizedData = hashingTF.transform(stop)

idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
seenData = idfModel.transform(featurizedData)

(trainingData1, testData1) = seenData.randomSplit([0.6, 0.4], seed=100)

lr = LogisticRegression(maxIter=20, regParam=0.3, elasticNetParam=0)
lrModel = lr.fit(trainingData1)
predictions1 = lrModel.transform(testData1)
predictions1.select("FileContent", "label", "prediction")
Words = Row('label', 'words')
words = reviews.map(lambda r: Words(*r))
words_df = spark.createDataFrame(words)

# review tokenization
token = RegexTokenizer(minTokenLength=2, pattern="[^A-Za-z]+",
                       inputCol="words", outputCol="token", toLowercase=True)
token_filtered = token.transform(words_df)

# stopwords elimination
remover = StopWordsRemover(inputCol="token", outputCol="stopwords",
                           caseSensitive=False)
stopwords_filtered = remover.transform(token_filtered)

prep_filtered = (stopwords_filtered.select('stopwords').rdd).map(lambda x: x[0])

# tf-idf calculation
tf = HashingTF(numFeatures=numFeatures).transform(
    prep_filtered.map(porter_stem, preservesPartitioning=True))
idf = IDF().fit(tf)
train_tfidf = idf.transform(tf)

# set training data with label
training = review_labels.zip(train_tfidf).map(
    lambda x: LabeledPoint(x[0], x[1]))

# train classifier model
model = DecisionTree.trainClassifier(training, numClasses=2,
                                     categoricalFeaturesInfo={},
                                     impurity='gini', maxDepth=8, maxBins=32)

# save model to HDFS
output_dir = "hdfs://VM10-1-0-14:9000/classifier/" + model_name
model.save(sc, output_dir)

end = time.time()
print("Total Records : ", reviews.count(), " , Processing Time : ", (end - start))
def main():
    #==========================================================================
    # Specify aws credentials
    #==========================================================================
    sc = SparkContext(conf=SparkConf().setAppName("Random Forest"))
    sqlContext = SQLContext(sc)
    sc._jsc.hadoopConfiguration().set('fs.s3n.awsAccessKeyId', sys.argv[5])
    sc._jsc.hadoopConfiguration().set('fs.s3n.awsSecretAccessKey', sys.argv[6])

    #==========================================================================
    # Specify file paths on s3
    #==========================================================================
    bytePath = sys.argv[1]
    namePath = sys.argv[2]
    nameTestPath = sys.argv[3]
    classPath = sys.argv[4]

    #==========================================================================
    # Section 1: GETTING TF OF .BYTE FILES, O/P OF SECTION: (FILE NAME, TF)
    #
    # Clean all the byte files (remove \r\n and the initial number token on
    # each line), then generate the TF of all cleaned byte files (covers both
    # training and testing byte files).
    #==========================================================================
    # O/P: (FILENAME, TEXT)
    docData = sc.wholeTextFiles(bytePath, 25).map(
        lambda (x, y): (x.encode("utf-8"), y.encode("utf-8")))
    # O/P: (FILENAME, CLEANED TEXT)
    cleanDocData = docData.map(lambda (x, y): (x, clean(y.split())))
    # HashingTF argument -> 256 + 1 (for "??") features
    x = 16**2 + 1
    hashingTF = HashingTF(x)
    # O/P: (FILENAME, TF)
    tfDocData = cleanDocData.map(lambda (x, y): (x, hashingTF.transform(y)))
    # cache or persist the output of section 1
    tfDocData.persist()
    print(tfDocData.take(1))

    #==========================================================================
    # Section 2: GETTING LABELS OF TRAINING DATA, O/P OF SECTION:
    # (FILE NAME, LABEL) OF TRAINING DATA
    #==========================================================================
    # O/P: (INDEX, FILENAME)
    nameData = sc.textFile(namePath, 25).map(
        lambda x: bytePath + "/" + x + ".bytes").zipWithIndex().map(
            lambda (x, y): (y, x))
    # O/P: (INDEX, LABEL)
    labelData = sc.textFile(classPath, 25).zipWithIndex().map(
        lambda (x, y): (y, str(int(x) - 1)))
    # O/P: (FILENAME, LABEL)
    joinNameLabel = nameData.join(labelData).map(lambda (x, y): y)
    # Cache/persist output of section 2
    joinNameLabel.persist()

    #==========================================================================
    # Section 3: Join the TF of byte files with labels and convert into
    # LabeledPoint to feed the classifier. O/P of section: LabeledPoint(LABEL, TF)
    #==========================================================================
    # O/P: (LABEL, TF)
    joinCleanDocLabel = joinNameLabel.join(tfDocData).map(lambda (x, y): y)
    # O/P: LabeledPoint(LABEL, TF)
    hashData = joinCleanDocLabel.map(
        lambda (label, text): LabeledPoint(label, text))
    # Persist/cache the output of section 3
    hashData.persist()

    #==========================================================================
    # Section 4: Apply Random Forest classifier and generate a model on
    # hashData using gini impurity. Determined heuristically that 50 trees
    # with depth of 8 give the best accuracy.
    #==========================================================================
    model = RandomForest.trainClassifier(hashData, numClasses=9,
                                         categoricalFeaturesInfo={},
                                         numTrees=50,
                                         featureSubsetStrategy="auto",
                                         impurity='gini', maxDepth=8,
                                         maxBins=32)

    #==========================================================================
    # Section 5: Generate test data in the format for prediction.
    # O/P of section: (INDEX, (FILENAME, INDEX, TF))
    #==========================================================================
    # O/P: (FILENAME, INDEX)
    nameTestData = sc.textFile(nameTestPath, 25).map(
        lambda x: bytePath + "/" + x + ".bytes").zipWithIndex()
    # O/P: (FILENAME, INDEX, TF)
    joinTestDocLabel = nameTestData.join(tfDocData).map(
        lambda (x, y): (x, y[0], y[1]))
    # O/P: (INDEX, (FILENAME, INDEX, TF))
    joinTestDocLabel1 = joinTestDocLabel.zipWithIndex().map(
        lambda (x, y): (y, x))

    #==========================================================================
    # Section 6: Predict the labels and save the output in an RDD, which is
    # saved as a text file on s3.
    #==========================================================================
    # O/P: Predictions
    prediction = model.predict(joinTestDocLabel1.map(lambda (x, (y, z, w)): w))
# Boilerplate Spark stuff:
conf = SparkConf().setMaster("local[*]").setAppName("SparkTFIDF")
sc = SparkContext(conf=conf)

# Load documents (one per line).
rawData = sc.textFile("F:/Matakuliah/Semester 6/Big Data/Berita.csv")
fields = rawData.map(lambda x: x.split(";"))
documents = fields.map(lambda x: x[4].split(" "))
documentId = fields.map(lambda x: x[0])

# Create the hash table and the TF table
hashingTF = HashingTF(100000)
tf = hashingTF.transform(documents)

# Create the IDF
tf.cache()
idf = IDF(minDocFreq=1).fit(tf)

# Calculate TF/IDF
tfidf = idf.transform(tf)

# Convert the search keyword to a hash value using the hash table above
keywordTF = hashingTF.transform(["pria"])
keywordHashValue = int(keywordTF.indices[0])

# Find its relevance against the tf-idf table built above
obj1 = TweetPreProcessing()

tweet = streamData.map(lambda x: x[1]) \
                  .map(decodeUnicode) \
                  .flatMap(obj1.TweetBuilder)

# RETRIEVING TWEET's TEXT and LABEL,
# ZIPPING EACH TWEET WITH A UNIQUE ID
label = tweet.map(lambda tup: tup[0]) \
             .transform(lambda x: x.zipWithUniqueId()) \
             .map(lambda line: (line[1], int(line[0])))  # int() casts the string 'label' to int

text = tweet.map(lambda tup: tup[1])

# computing TF-IDF for each tweet and classifying it
hashingTF = HashingTF(tf_val)
tfidf_testing = text.map(lambda tup: hashingTF.transform(tup)) \
                    .transform(lambda tup: idf_training.transform(tup))

tweet_classified = tfidf_testing.map(lambda p: int(NBM.predict(p))) \
                                .transform(lambda p: p.zipWithUniqueId()) \
                                .map(lambda line: (line[1], line[0]))
# .pprint()

# Here the ground truth and the predicted class are joined, so for each tweet
# we have the structure (ground truth, predicted class), e.g. (4,0), (0,0)
result = label.join(tweet_classified) \
              .map(lambda tup: tup[1]) \
              .foreachRDD(jdbcInsert)
# $example on$
# Load documents (one per line).
# documents = sc.textFile("*.txt").map(lambda line: line.split(" "))
documents = spark.read.text("*.txt")
documents = documents.withColumn(
    "doc_id", F.row_number().over(Window.orderBy('value')))
documents.printSchema()

# creating tokens/words from the sentence data
tokenizer = Tokenizer(inputCol="value", outputCol="words")
wordsData = tokenizer.transform(documents)

# applying tf on the words data
hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=20)
featurizedData = hashingTF.transform(wordsData)

# calculating the IDF
idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)

# displaying the results
rescaledData.select("doc_id", "features").show(truncate=False)

# closing the spark session
spark.stop()
# hashingTF = HashingTF()
def main(sc, db, tracking_word):
    print('>' * 30 + 'SPARK START' + '>' * 30)

    hashingTF = HashingTF()
    iDF = IDF()

    # Initialize the SparkSQL context; it will be used to query the trends
    # from the result.
    sqlContext = SQLContext(sc)

    # Initialize the Spark streaming context with a batch interval of 10 sec:
    # messages accumulate for 10 seconds and then get processed.
    ssc = StreamingContext(sc, batch_interval)

    # Receive the tweets
    host = socket.gethostbyname(socket.gethostname())
    # Create a DStream that represents streaming data from a TCP source
    socket_stream = ssc.socketTextStream(STREAM_HOST, STREAM_PORT)
    lines = socket_stream.window(window_time)

    # Construct tables
    tmp = [('none', 0)]
    related_keywords_df = sqlContext.createDataFrame(tmp, ['Keyword', 'Count'])
    related_hashtags_df = sqlContext.createDataFrame(tmp, ['Hashtag', 'Count'])

    # Strip punctuation and lowercase the stream (DStream transformations are
    # not in-place, so the result must be reassigned to take effect).
    trans_table = str.maketrans('', '', ',.!?:;"@&()#.-\\/+')
    lines = lines.map(lambda line: line.translate(trans_table).lower())

    # 1) Count the number of tweets
    tweet_cnt_li = tweet_count(lines)
    # 2) Count the number of users
    user_cnt_li = user_count(lines)
    # 3) Find the related keywords
    related_keywords(lines)
    # 4) Find the related hashtags
    related_hashtags(lines)
    # 5) Sentiment analysis
    pos_cnt_li = sentiment_analysis(lines, hashingTF, iDF)

    ###########################################################################
    # Start the streaming process
    ssc.start()

    process_cnt = 0
    start_time = [datetime.datetime.now()]
    # print("Here!!!", process_times, process_cnt)
    while process_cnt < process_times:
        time.sleep(window_time)
        start_time.append(datetime.datetime.now())

        # Find the top related keywords
        if len(sqlContext.tables().filter(
                "tableName LIKE 'related_keywords_tmp'").collect()) == 1:
            top_words = sqlContext.sql(
                'Select Keyword, Count from related_keywords_tmp')
            related_keywords_df = related_keywords_df.unionAll(top_words)
            related_keywords_tb = True
        else:
            related_keywords_tb = False

        # Find the top related hashtags
        if len(sqlContext.tables().filter(
                "tableName LIKE 'related_hashtags_tmp'").collect()) == 1:
            top_hashtags = sqlContext.sql(
                'Select Hashtag, Count from related_hashtags_tmp')
            related_hashtags_df = related_hashtags_df.unionAll(top_hashtags)

        process_cnt += 1

    # Final tables
    if related_keywords_tb:
        related_keywords_df = related_keywords_df.filter(
            related_keywords_df['Keyword'] != 'none')
        # Spark SQL to Pandas Dataframe
        related_keywords_pd = related_keywords_df.toPandas()
        related_keywords_pd = related_keywords_pd[
            related_keywords_pd['Keyword'] != tracking_word]
        related_keywords_pd = related_keywords_pd.groupby(
            related_keywords_pd['Keyword']).sum()
        related_keywords_pd = pd.DataFrame(related_keywords_pd)
        related_keywords_pd = related_keywords_pd.sort_values(
            "Count", ascending=0).iloc[0:min(9, related_keywords_pd.shape[0])]

    # Spark SQL to Pandas Dataframe
    related_hashtags_pd = related_hashtags_df.toPandas()
    related_hashtags_pd = related_hashtags_pd[
        related_hashtags_pd['Hashtag'] != '#' + tracking_word]
    related_hashtags_pd = related_hashtags_pd.groupby(
        related_hashtags_pd['Hashtag']).sum()
    related_hashtags_pd = pd.DataFrame(related_hashtags_pd)
    related_hashtags_pd = related_hashtags_pd.sort_values(
        "Count", ascending=0).iloc[0:min(9, related_hashtags_pd.shape[0])]

    ssc.stop()

    ###########################################################################
    print(">>>tweet_cnt_li:")
    print(tweet_cnt_li)
    print(">>>user_cnt_li:")
    user_cnt_len_li = len(user_cnt_li)
    print(user_cnt_len_li)
    print(">>>start_time:")
    print(start_time)
    print(">>>pos_cnt_li")
    print(pos_cnt_li)
    print(">>>related_keywords_tb")
    print(related_keywords_tb)
    # print(related_keywords_pd.head(10))
    # print(related_hashtags_pd.head(10))

    if related_keywords_tb:
        related_keywords_js = json.loads(
            related_keywords_pd.reset_index().to_json(orient='records'))
    else:
        related_keywords_js = None
    # print(related_keywords_js)
    related_hashtags_js = json.loads(
        related_hashtags_pd.reset_index().to_json(orient='records'))
    # print(related_hashtags_js)

    # Store the data to MongoDB
    data_to_db(db, start_time, tweet_cnt_li, user_cnt_len_li,
               related_keywords_js, related_hashtags_js, pos_cnt_li,
               tracking_word, related_keywords_tb)

    print('>' * 30 + 'SPARK STOP' + '>' * 30)
# Databricks notebook source exported at Thu, 23 Jun 2016 07:23:39 UTC
from pyspark import SparkConf, SparkContext
from pyspark.mllib.feature import HashingTF
from pyspark.mllib.feature import IDF

rawData = sc.textFile(
    "/FileStore/tables/dp736dao1466664806758/subset_small-50f68.tsv")
fields = rawData.map(lambda x: x.split("\t"))
documents = fields.map(lambda x: x[3].split(" "))

# Document names
documentNames = fields.map(lambda x: x[1])

# hash the words in each document to their term frequencies
hashingtf = HashingTF(100000)  # to save memory
tf = hashingtf.transform(documents)  # each value -> term frequency of a unique hash value

# calculating the tf*idf score
idf = IDF(minDocFreq=2).fit(tf)
tfidf = idf.transform(tf)  # each value -> tf*idf of a unique hash value per document

# Test: transform expects a list of terms, so wrap the single word in a list
gettysBurgTF = hashingtf.transform(["Gettysburg"])
gettysburgHashValue = int(gettysBurgTF.indices[0])
gettysburgRelevance = tfidf.map(lambda x: x[gettysburgHashValue])
zippedResults = gettysburgRelevance.zip(documentNames)

# print best result
print(zippedResults.max())
def import_data(self):
    # meta df
    meta_df = pd.read_csv(self.metadata_path,
                          dtype={
                              'pubmed_id': str,
                              'Microsoft Academic Paper ID': str,
                              'doi': str
                          })
    # json
    all_json = glob.glob(f"{self.DEFAULT_INPUT_PATH}/**/*.json", recursive=True)

    dict_ = {'paper_id': [], 'doi': [], 'abstract': [], 'body_text': [],
             'authors': [], 'title': [], 'journal': [], 'abstract_summary': []}
    for idx, entry in enumerate(all_json):
        if idx % (len(all_json) // 10) == 0:
            print(f'Processing index: {idx} of {len(all_json)}')
        try:
            content = FileReader(entry)
        except Exception as e:
            continue  # invalid paper format, skip

        # get metadata information
        meta_data = meta_df.loc[meta_df['sha'] == content.paper_id]
        # no metadata, skip this paper
        if len(meta_data) == 0:
            continue

        dict_['abstract'].append(content.abstract)
        dict_['paper_id'].append(content.paper_id)
        dict_['body_text'].append(content.body_text)

        # also create a column for the summary of the abstract to be used in a plot
        if len(content.abstract) == 0:
            # no abstract provided
            dict_['abstract_summary'].append("Not provided.")
        elif len(content.abstract.split(' ')) > 100:
            # abstract is too long for the plot; take the first 100 words and append "..."
            info = content.abstract.split(' ')[:100]
            summary = self.get_breaks(' '.join(info), 40)
            dict_['abstract_summary'].append(summary + "...")
        else:
            # abstract is short enough
            summary = self.get_breaks(content.abstract, 40)
            dict_['abstract_summary'].append(summary)

        try:
            # if more than one author
            authors = meta_data['authors'].values[0].split(';')
            if len(authors) > 2:
                # more than 2 authors: take them all, with HTML tag breaks in between
                dict_['authors'].append(self.get_breaks('. '.join(authors), 40))
            else:
                # authors will fit in the plot
                dict_['authors'].append(". ".join(authors))
        except Exception as e:
            # only one author, or a null value
            dict_['authors'].append(meta_data['authors'].values[0])

        # add the title information, adding breaks when needed
        try:
            title = self.get_breaks(meta_data['title'].values[0], 40)
            dict_['title'].append(title)
        except Exception as e:
            # title was not provided
            dict_['title'].append(meta_data['title'].values[0])

        # add the journal information
        dict_['journal'].append(meta_data['journal'].values[0])
        # add doi
        dict_['doi'].append(meta_data['doi'].values[0])

    df_covid = pd.DataFrame(dict_,
                            columns=['paper_id', 'doi', 'abstract',
                                     'body_text', 'authors', 'title',
                                     'journal', 'abstract_summary'])
    df_covid['abstract_word_count'] = df_covid['abstract'].apply(
        lambda x: len(x.strip().split()))  # word count in abstract
    df_covid['body_word_count'] = df_covid['body_text'].apply(
        lambda x: len(x.strip().split()))  # word count in body
    df_covid['body_unique_words'] = df_covid['body_text'].apply(
        lambda x: len(set(str(x).split())))  # number of unique words in body

    # remove duplicates
    df_covid.drop_duplicates(['abstract', 'body_text'], inplace=True)
    df_covid['abstract'].describe(include='all')
    df_covid.dropna(inplace=True)

    # handle multiple languages
    # set seed
    DetectorFactory.seed = 0

    # hold label - language
    languages = []

    # go through each text
    for ii in tqdm(range(0, len(df_covid))):
        # split by space into a list, take the first x items, join with space
        text = df_covid.iloc[ii]['body_text'].split(" ")

        lang = "en"
        try:
            if len(text) > 50:
                lang = detect(" ".join(text[:50]))
            elif len(text) > 0:
                lang = detect(" ".join(text[:len(text)]))
        # the beginning of the document was not in a good format
        except Exception as e:
            all_words = set(text)
            try:
                lang = detect(" ".join(all_words))
            # still failing; see if any text in the abstract can be used
            except Exception as e:
                try:
                    # try to label it through the abstract
                    lang = detect(df_covid.iloc[ii]['abstract_summary'])
                except Exception as e:
                    lang = "unknown"

        # record the language
        languages.append(lang)

    languages_dict = {}
    for lang in set(languages):
        languages_dict[lang] = languages.count(lang)

    df_covid['language'] = languages
    # drop non-English documents
    df_covid = df_covid[df_covid['language'] == 'en']

    # change to Spark
    # Enable Arrow-based columnar data transfers
    spark = SparkSession \
        .builder \
        .appName("PySparkKMeans") \
        .config("spark.some.config.option", "some-value") \
        .getOrCreate()
    spark.conf.set("spark.sql.execution.arrow.enabled", "true")

    # Create a Spark DataFrame from a pandas DataFrame using Arrow
    df_english = spark.createDataFrame(df_covid)

    clean_text_df = df_english.withColumn("text",
                                          self.clean_text(col("body_text")))
    tokenizer = Tokenizer(inputCol="text", outputCol="vector")
    vector_df = tokenizer.transform(clean_text_df)

    # remove stopwords
    punctuations = string.punctuation
    stopwords = list(STOP_WORDS)
    custom_stop_words = [
        'doi', 'preprint', 'copyright', 'peer', 'reviewed', 'org', 'https',
        'et', 'al', 'author', 'figure', 'rights', 'reserved', 'permission',
        'used', 'using', 'biorxiv', 'medrxiv', 'license', 'fig', 'fig.',
        'al.', 'elsevier', 'pmc', 'czi', 'www',
        "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m",
        "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z"
    ]
    for w in custom_stop_words:
        if w not in stopwords:
            stopwords.append(w)

    # Define the stop-word list and specify input/output columns
    remover = StopWordsRemover(stopWords=stopwords)
    remover.setInputCol("vector")
    remover.setOutputCol("vector_no_stopw")

    # Transform the existing dataframe with the StopWordsRemover
    vector_no_stopw_df = remover.transform(vector_df)

    # tf-idf (mllib's HashingTF operates on an RDD of token lists, so pull
    # the column out of the DataFrame first)
    hashingTF = HashingTF()
    tf = hashingTF.transform(
        vector_no_stopw_df.select("vector_no_stopw").rdd.map(lambda row: row[0]))

    # While applying HashingTF only needs a single pass over the data,
    # applying IDF needs two passes: first to compute the IDF vector and
    # second to scale the term frequencies by IDF.
    tf.cache()
    idf = IDF().fit(tf)
    tfidf = idf.transform(tf)

    # PCA
    mat = RowMatrix(tfidf)
    # Compute the top 1325 principal components, stored in a local dense matrix.
    pc = mat.computePrincipalComponents(1325)
    # Project the rows onto the linear space spanned by those components.
    projected = mat.multiply(pc)
    # RowMatrix has no toPandas(); collect its rows into a pandas DataFrame
    pd.DataFrame(projected.rows.map(lambda v: v.toArray()).collect()).to_csv(
        f"{self.DEFAULT_OUTPUT_FILE}")
    return projected
def main():
    #==========================================================================
    # Specifying paths
    #==========================================================================
    sc = SparkContext(conf=SparkConf().setAppName("Random Forest"))
    sqlContext = SQLContext(sc)
    bytePath = "/Users/priyanka/Desktop/project2files/train"
    byteTestPath = "/Users/priyanka/Desktop/project2files/test"
    namePath = "/Users/priyanka/Desktop/X_train_small.txt"
    nameTestPath = "/Users/priyanka/Desktop/X_test_small.txt"
    classPath = "/Users/priyanka/Desktop/y_train_small.txt"
    classTestPath = "/Users/priyanka/Desktop/y_test_small.txt"

    #==========================================================================
    # SECTION 1: GETTING TF OF .BYTE FILES, O/P OF SECTION: (FILE NAME, TF)
    #==========================================================================
    # O/P: (FILE NAME, TEXT)
    docData = sc.wholeTextFiles(bytePath, 25).map(
        lambda (x, y): (x.encode("utf-8"), y.encode("utf-8")))
    print("docData done")
    docData.take(1)

    # clean docData here - remove the first word from each line and remove \r\n
    # O/P: (FILE NAME, CLEAN TEXT)
    cleanDocData = docData.map(lambda (x, y): (x, clean(y.split())))

    x = 16**2 + 1
    hashingTF = HashingTF(x)
    # O/P: (FILE NAME, TF)
    tfDocData = cleanDocData.map(lambda (x, y): (x, hashingTF.transform(y)))
    tfDocData.take(1)

    #==========================================================================
    # Section 2: GETTING LABELS, O/P OF SECTION: (FILE NAME, LABEL)
    #==========================================================================
    # O/P: (INDEX, FILE NAME)
    nameData = sc.textFile(namePath, 25).map(
        lambda x: "file:" + bytePath + "/" + x + ".bytes").zipWithIndex().map(
            lambda (x, y): (y, x))
    # O/P: (INDEX, LABEL)
    labelData = sc.textFile(classPath, 25).zipWithIndex().map(
        lambda (x, y): (y, str(int(x) - 1)))
    # O/P: (FILE NAME, LABEL)
    joinNameLabel = nameData.join(labelData).map(lambda (x, y): y)

    #==========================================================================
    # Section 3: Get data and generate LabeledPoint, O/P: (LABEL, TF)
    #==========================================================================
    # O/P: (LABEL, TF)
    joinCleanDocLabel = joinNameLabel.join(tfDocData).map(lambda (x, y): y)
    # O/P: LabeledPoint(LABEL, TF)
    hashData = joinCleanDocLabel.map(
        lambda (label, text): LabeledPoint(label, text))

    #==========================================================================
    # Section 4: Build the classification model on training data
    # (Naive Bayes here; the Random Forest call is kept for reference)
    #==========================================================================
    #model = RandomForest.trainClassifier(hashData, numClasses=9, categoricalFeaturesInfo={}, numTrees=50, featureSubsetStrategy="auto", impurity='gini', maxDepth=8, maxBins=32)
    model = NaiveBayes.train(hashData)

    #==========================================================================
    # Section 5: TEST: GETTING TF OF .BYTE FILES, O/P OF SECTION: (FILE NAME, TF)
    #==========================================================================
    docTestData = sc.wholeTextFiles(byteTestPath, 25).map(
        lambda (x, y): (x.encode("utf-8"), y.encode("utf-8")))
    #docTestData.take(1)
    cleanDocTestData = docTestData.map(lambda (x, y): (x, clean(y.split())))
    tfDocTestData = cleanDocTestData.map(
        lambda (x, y): (x, hashingTF.transform(y)))

    #==========================================================================
    # Section 6: TEST: GETTING LABELS, O/P OF SECTION: (FILE NAME, LABEL)
    #==========================================================================
    nameTestData = sc.textFile(nameTestPath, 25).map(
        lambda x: "file:" + byteTestPath + "/" + x + ".bytes").zipWithIndex().map(
            lambda (x, y): (y, x))
    labelTestData = sc.textFile(classTestPath, 25).zipWithIndex().map(
        lambda (x, y): (y, str(int(x) - 1)))
    joinTestNameLabel = nameTestData.join(labelTestData).map(lambda (x, y): y)

    #==========================================================================
    # Section 7: TEST: O/P: (FILE, LABEL, TF)
    #==========================================================================
    # O/P: (FILE, LABEL, TF)
    joinTestDocLabel = joinTestNameLabel.join(tfDocTestData).map(
        lambda (x, y): (x, y[0], y[1]))
    # O/P: (INDEX, (FILE, LABEL, TF))
    joinTestDocLabel1 = joinTestDocLabel.zipWithIndex().map(
        lambda (x, y): (y, x))

    # Predict on the sparse TF vectors
    prediction = model.predict(joinTestDocLabel1.map(lambda (x, (y, z, w)): w))
        else:
            return ''
    else:
        return ""

data = data.map(combine).filter(lambda x: len(x) > 0)
# returns records of the form [index, content, title]; preprocessing done

#####################################################
################# Step 2 TF-IDF #####################
#####################################################
from pyspark.mllib.feature import HashingTF, IDF

# TF step
tf = HashingTF(50000)  # use 50,000 dimensions
vectors = data.map(lambda line: (line[0], line[2], tf.transform(line[1])))
# records of the form [index, title, tf]

# IDF step
vec = vectors.map(lambda line: line[2])  # keep only the tf results
idf = IDF()
idfmodel = idf.fit(vec)
tfIdfVectors = idfmodel.transform(vec)  # obtain the tf-idf results
tfIdfVectors.cache()  # persist the tf-idf results; TF-IDF step done

#####################################################
################## Step 3 SVD #######################
#####################################################
# compute the SVD
from pyspark.mllib.linalg.distributed import RowMatrix
spark = SparkSession(sc)
df = spark.read.csv('hdfs://192.168.100.6:9000/user/ubuntu/Dataset75.csv',
                    header=True)
data = df.rdd.map(list)
print(data.first())

score = data.map(
    lambda s: 1.0 if s[1].isdigit() and float(s[1]) == 1.0 else 0.0)
comment = data.map(lambda s: s[3])
print(score.count())
print(comment.count())

tf = HashingTF()
tfVectors = tf.transform(comment).cache()
idf = IDF()
idfModel = idf.fit(tfVectors)
tfIdfVectors = idfModel.transform(tfVectors)
#print(tfIdfVectors.take(3))

# Use the RDD zip operator to join the labels with the TF-IDF vectors, then
# convert each pair into the LabeledPoint type used by the classification model.
zip_score_comment = score.zip(tfIdfVectors)
final_data = zip_score_comment.map(lambda line: LabeledPoint(line[0], line[1]))
train_data, test_data = final_data.randomSplit([0.8, 0.2], seed=0)
print(train_data.take(1))

time_start = time.time()
#SVMModel = SVMWithSGD.train(train_data, iterations=100)
def main(sc):
    data = sc.textFile('data/train.txt').map(parseLine)
    #print(data.take(10))

    # Train/Test split
    training, test = data.randomSplit([0.7, 0.3], seed=0)

    # TF-IDF
    # TF: features will be hashed to indexes and the feature (term)
    # frequencies calculated for each training example
    hashingTF = HashingTF()
    tf_training = training.map(lambda tup: hashingTF.transform(tup[1]))

    # IDF: compute the IDF vector
    idf_training = IDF().fit(tf_training)

    # Scale the TF by IDF
    tfidf_training = idf_training.transform(tf_training)

    # (SparseVector(1048576, {110670: 1.5533, ...}), 0)
    tfidf_idx = tfidf_training.zipWithIndex()
    # (['The', 'Da', 'Vinci', 'Code', 'book', 'is', 'just', 'awesome.'], 0)
    training_idx = training.zipWithIndex()

    # Reverse the index and the SparseVector
    idx_training = training_idx.map(lambda line: (line[1], line[0]))
    idx_tfidf = tfidf_idx.map(lambda l: (l[1], l[0]))
    #print(idx_training.take(10))

    # rdd.join: (K,V).join(K,W) -> (K, (V,W))
    # idx_tfidf has no info about labels (0/1), but idx_training has
    joined_tfidf_training = idx_training.join(idx_tfidf)
    training_labeled = joined_tfidf_training.map(lambda tup: tup[1])
    training_labeled = training_labeled.map(
        lambda x: LabeledPoint(x[0][0], x[1]))
    #print(training_labeled.take(10))

    # Train a naive Bayes model
    model = NaiveBayes.train(training_labeled, 1.0)

    # Test the model: scale the test TF with the IDF fitted on the training
    # set (refitting IDF on the test set would yield features inconsistent
    # with the trained model)
    tf_test = test.map(lambda tup: hashingTF.transform(tup[1]))
    tfidf_test = idf_training.transform(tf_test)

    tfidf_idx = tfidf_test.zipWithIndex()
    test_idx = test.zipWithIndex()
    idx_test = test_idx.map(lambda line: (line[1], line[0]))
    idx_tfidf = tfidf_idx.map(lambda l: (l[1], l[0]))
    joined_tfidf_test = idx_test.join(idx_tfidf)
    test_labeled = joined_tfidf_test.map(lambda tup: tup[1])
    labeled_test_data = test_labeled.map(lambda k: LabeledPoint(k[0][0], k[1]))
    #print(labeled_test_data.take(2))

    # Apply the trained model on the test data
    predictionAndLabel = labeled_test_data.map(
        lambda p: (model.predict(p.features), p.label))
    #print(predictionAndLabel.take(10))

    # Calculate the accuracy
    accuracy = 1.0 * predictionAndLabel.filter(
        lambda x: x[0] == x[1]).count() / labeled_test_data.count()
    print('>>> Accuracy')
    print(accuracy)

    #model.save(sc, '/model')
    output = open('src/model/model.ml', 'wb')
    pickle.dump(model, output)
from pyspark.mllib.feature import HashingTF
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.classification import LogisticRegressionWithLBFGS, LogisticRegressionModel
from pyspark import SparkContext, SparkConf

conf = SparkConf().setMaster("local[*]").setAppName("Naive_Bayes")
sc = SparkContext(conf=conf)
print("Running Spark Version %s" % sc.version)

# word to vector space converter, limited to 10,000 words
htf = HashingTF(10000)

# let 1 = positive class, 0 = negative class;
# tokenize sentences and transform them into the vector space model
positiveData = sc.textFile("Positive.txt")
posdata = positiveData.map(lambda text: LabeledPoint(
    1,
    htf.transform(
        text.replace(',', '').replace('.', '').replace('-', '').replace(
            '?', '').replace('!', ' ').lower().split(" "))))
print("No. of Positive Sentences: " + str(posdata.count()))
posdata.persist()

negativeData = sc.textFile("Negative.txt")
negdata = negativeData.map(lambda text: LabeledPoint(
    0,
    htf.transform(
        text.replace(',', '').replace('.', '').replace('-', '').replace(
            '?', '').replace('!', ' ').lower().split(" "))))
print("No. of Negative Sentences: " + str(negdata.count()))
from pyspark.mllib.feature import IDF

# Spark configuration
conf = SparkConf().setMaster("local").setAppName("SparkTFIDF")
sc = SparkContext(conf=conf)

# Load the documents
rawData = sc.textFile("e:/sundog-consult/Udemy/DataScience/subset-small.tsv")
fields = rawData.map(lambda x: x.split("\t"))
documents = fields.map(lambda x: x[3].split(" "))

# Extract the document names
documentNames = fields.map(lambda x: x[1])

# Compute TF. Each word in a document is numericized as a hash value.
hashingTF = HashingTF(100000)  # upper limit on the number of hash values
tf = hashingTF.transform(documents)

# Compute IDF
tf.cache()
idf = IDF(minDocFreq=2).fit(tf)

# Compute TF*IDF for each word in each document
tfidf = idf.transform(tf)

# We now have an RDD of sparse vectors
# (https://spark.apache.org/docs/latest/mllib-data-types.html), where each
# TFxIDF value is stored per document, keyed by hash value.
# The source data contains the article on "Abraham Lincoln", so let's search
# for "Gettysburg" (where Lincoln gave his famous speech).
# First, get the hash value of "Gettysburg".
file.write("- Positive : " + str(num_pos_entropy) + "\n") file.write("- Negative : " + str(num_neg_entropy) + "\n") ########################################################################### ######### Testing on Brexit Labeled Data ######### print("\n========= Test on Brexit labeled data ========= ") text_negative_brexit = sc.textFile("data/brexit_negatif_clean.csv") text_positive_brexit = sc.textFile("data/brexit_positif_clean.csv") test_text_brexit = text_negative_brexit.union(text_positive_brexit) test_tlabels_brexit = text_negative_brexit.map(lambda x: 0.0).union( text_positive_brexit.map(lambda x: 1.0)) tf_test_brexit = HashingTF(numFeatures=100000).transform( test_text_brexit.map(lambda x: x)) tfidf_test_brexit = idf.transform(tf_test_brexit) #decision tree entropy labeled_prediction_entropy = test_tlabels_brexit.zip( model_decision_tree_entropy.predict(tfidf_test_brexit)).map( lambda x: { "actual": x[0], "predicted": x[1] }) accuracy_entropy = 1.0 * labeled_prediction_entropy.filter( lambda doc: doc["actual"] == doc['predicted']).count( ) / labeled_prediction_entropy.count() print('\n== ACCURACY DT ENTROPY : ', accuracy_entropy, '==')
from pyspark.mllib.feature import IDF

# Boilerplate Spark stuff:
conf = SparkConf().setMaster("local").setAppName("SparkTFIDF")
sc = SparkContext(conf=conf)

# Loading documents (one per line).
rawData = sc.textFile("C:/Users/shatak/Desktop/shatak3rd/subset-small.tsv")
fields = rawData.map(lambda x: x.split("\t"))
documents = fields.map(lambda x: x[3].split(" "))

# Store the document names for later:
documentNames = fields.map(lambda x: x[1])

# hashing the words in each document to their term frequencies:
hashingTF = HashingTF(100000)  # 100K hash buckets just to save some memory
tf = hashingTF.transform(documents)

# we now have an RDD of sparse vectors representing each document;
# compute the TF*IDF of each term in each document:
tf.cache()
idf = IDF(minDocFreq=2).fit(tf)
tfidf = idf.transform(tf)

# we now have an RDD of sparse vectors, where each value is the TFxIDF
# of each unique hash value for each document.
# the article for "Abraham Lincoln" is in our data set, so let's search
# for "Gettysburg" (Lincoln gave a famous speech there):
        )
        #print(hashtag_counts_df.show())
        analysis_type = 'hashtag_analysis'
        send_df_to_dashboard(hashtag_counts_df, analysis_type)
    except:
        e = sys.exc_info()[0]
        print("There is an error: %s" % e)

conf = SparkConf()
conf.setAppName("TwitterStreamApp")
sc = SparkContext(conf=conf)
sc.setLogLevel("ERROR")
ssc = StreamingContext(sc, 3)
ssc.checkpoint("checkpoint_models")

htf = HashingTF(50000)
NB_output_dir = '/spark/NaiveBayes'
NB_load_model = NaiveBayesModel.load(sc, NB_output_dir)

# Sentiment Analysis
## 01 read tweets from the stream ##
dataStream = ssc.socketTextStream("localhost", 9009)
## 02 split the text into words ##
words = dataStream.map(lambda x: x.split(" "))
## 03 transform the words into features ##
features = words.map(lambda x: htf.transform(x))
## 04 predict the sentiment ##
prediction = features.map(lambda x: classify(x))
## 05 label the sentiments ##
label_sentiments = prediction.map(
    # the source was cut off after ('positive', 1); the else-branch below is
    # an assumed completion mapping all other predictions to 'negative'
    lambda x: ('positive', 1) if x == 1 else ('negative', 1))
def get_tfidf(idf, sentence: List[str]) -> SparseVector:
    tf = HashingTF().transform(sentence)
    return idf.transform(tf)
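# A minimal usage sketch (hedged: assumes `idf` is an IDFModel fitted
# elsewhere, e.g. by IDF().fit(...) over the hashed corpus):
#
#   vec = get_tfidf(idf, "hello hello world".split())  # -> SparseVector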
g = Goose()
article = g.extract(url=url)
a = article.cleaned_text

html_dict = []
tokenhtml = tokenize(a)
print(tokenhtml)
for i in range(0, len(tokenhtml)):
    body = ''
    body += tokenhtml[i] + ' '
    html_dict.append({"label": "0", "text": body})

sc = SparkContext()
htmldata = sc.parallelize(html_dict)
labels = htmldata.map(lambda doc: doc["label"], preservesPartitioning=True)

start_tfidf = datetime.now()  # assumed timing start for tfidf_time below
tf = HashingTF().transform(
    htmldata.map(lambda doc: doc["text"], preservesPartitioning=True))
idf = IDF().fit(tf)
tfidf = idf.transform(tf)
end_tfidf = datetime.now()
tfidf_time = format(end_tfidf - start_tfidf)

dataset = labels.zip(tfidf).map(lambda x: LabeledPoint(x[0], x[1]))

sameModel = NaiveBayesModel.load(
    sc, "/Users/apple/Dropbox/2016Spring/COSC526/MacHW1/mymodel")

start_predict = datetime.now()
# pair each prediction with its label so accuracy can be computed below
predictionAndLabel = dataset.map(
    lambda p: (sameModel.predict(p.features), p.label))
end_predict = datetime.now()
predict_time = format(end_predict - start_predict)

accuracy = 1.0 * predictionAndLabel.filter(
    lambda xv: xv[0] == xv[1]).count() / dataset.count()
def _compute_idf(texts: RDD) -> IDFModel:
    tf = HashingTF().transform(texts)
    tf.cache()
    idf = IDF().fit(tf)
    return idf
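# A minimal usage sketch combining _compute_idf with a get_tfidf-style helper
# (hedged: the file name and SparkContext `sc` are illustrative):
#
#   texts = sc.textFile("corpus.txt").map(lambda line: line.split(" "))
#   idf_model = _compute_idf(texts)
#   vec = idf_model.transform(HashingTF().transform("a new document".split()))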
from pyspark import SparkConf, SparkContext
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.feature import HashingTF
from pyspark.mllib.classification import LogisticRegressionWithSGD

if __name__ == '__main__':
    conf = SparkConf().setMaster("local").setAppName("spamClassify")
    sc = SparkContext(conf=conf)
    spam = sc.textFile("input/spam.txt")
    normal = sc.textFile("input/normal.txt")

    # Create a HashingTF instance to map email text to vectors of 10,000 features.
    tf = HashingTF(numFeatures=10000)
    # Each email is split into words, and each word is mapped to one feature.
    spamFeatures = spam.map(lambda email: tf.transform(email.split(" ")))
    normalFeatures = normal.map(lambda email: tf.transform(email.split(" ")))

    # Create LabeledPoint datasets for positive (spam) and negative (normal) examples.
    positiveExamples = spamFeatures.map(
        lambda features: LabeledPoint(1, features))
    negativeExamples = normalFeatures.map(
        lambda features: LabeledPoint(0, features))
    trainingData = positiveExamples.union(negativeExamples)
    trainingData.cache()  # Cache since Logistic Regression is an iterative algorithm.

    # Run Logistic Regression using the SGD algorithm.
    model = LogisticRegressionWithSGD.train(trainingData)

    # Test on a positive example (spam) and a negative one (normal). We first apply
    # the same HashingTF feature transformation to get vectors, then apply the model.
    posTest = tf.transform(
        "O M G GET cheap stuff by sending money to ...".split(" "))
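    # The excerpt ends after building posTest; a minimal continuation would
    # score both a spam-like and a normal-like example (hedged: negTest's
    # sample text is illustrative, not from the source):
    #
    #   negTest = tf.transform("Hi Dad, I started studying Spark".split(" "))
    #   print("Prediction for positive example: %g" % model.predict(posTest))
    #   print("Prediction for negative example: %g" % model.predict(negTest))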
def get_tfidf(self, text_str) -> SparseVector:
    tf = HashingTF().transform(Text(text_str).process().words)
    return self.idf.transform(tf)
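# A minimal usage sketch (hedged: assumes `self.idf` holds a fitted IDFModel
# and `Text` is the project's tokenizing helper defined elsewhere):
#
#   vec = pipeline.get_tfidf("hello hello world")  # -> SparseVector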
# END OF IMPORTS
####################################################

# TODO: Change from random split to normal split and CV
# TODO: try DT and SVM algorithms
# TODO: run ready libraries and compare results

# START OF GLOBAL VARIABLES
####################################################
PUNCTUATION = [i for i in string.punctuation]
STOPWORDS = set(stopwords.words('english'))
HEADER = []
PS = PorterStemmer()
conf = SparkConf().setAppName("myFirstApp").setMaster("local")
SC = SparkContext(conf=conf)
HTF = HashingTF(50000)
# END OF GLOBAL VARIABLES
####################################################


# Fix tweet if split by comma
def line_fixer(line, col_count):
    if len(line) > col_count:
        return line[:col_count - 1] + [",".join(line[col_count - 1:])]
    return line


# Ignore unwanted columns like the tweet ID (column 0) and SentimentSource (column 2)
def remove_unwanted_col(line, sentinement_index, sentinement_text_index):
    return line[sentinement_index], line[sentinement_text_index]
def labeled_points(rdd, n, label):
    # Split each document into tokens, hash into n-dimension TF vectors, and
    # attach the given class label.
    rdd_list = rdd.map(lambda x: x.split())
    hTF = HashingTF(n)
    rdd_h = rdd_list.map(lambda x: hTF.transform(x))
    rdd_h_l = rdd_h.map(lambda x: LabeledPoint(label, x))
    return rdd_h_l
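# A minimal usage sketch (hedged: the file names and the 10,000-feature size
# are illustrative, not from the source):
#
#   pos = labeled_points(sc.textFile("pos.txt"), 10000, 1.0)
#   neg = labeled_points(sc.textFile("neg.txt"), 10000, 0.0)
#   training = pos.union(neg)  # ready for an mllib classifier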
#import sys
#import pyspark
from pyspark.mllib.feature import HashingTF, IDF
#from pyspark.sql import SparkSession
from pyspark import SparkContext, SparkConf
#from pyspark.sql.functions import explode

sc = SparkContext("local", "Simple App")

# Load documents (one per line).
documents = sc.textFile("reviews.txt").map(lambda line: line.split(" "))

hashingTF = HashingTF()
tf = hashingTF.transform(documents)

# While applying HashingTF only needs a single pass over the data, applying IDF
# needs two passes: first to compute the IDF vector and second to scale the
# term frequencies by IDF.
tf.cache()
idf = IDF().fit(tf)
tfidf = idf.transform(tf)

# spark.mllib's IDF implementation provides an option for ignoring terms
# which occur in fewer than a minimum number of documents.
# In such cases, the IDF for these terms is set to 0.
# This feature can be used by passing the minDocFreq value to the IDF constructor.
idfIgnore = IDF(minDocFreq=2).fit(tf)
tfidfIgnore = idfIgnore.transform(tf)

# save tf-idf
tfidfIgnore.saveAsTextFile('tfidf')
def getFeatures(data):
    # Hash each token list into a TF vector, then rescale by IDF.
    hashingTF = HashingTF()
    tfData = data.map(lambda tup: hashingTF.transform(tup))
    idfData = IDF().fit(tfData)
    tfidfData = idfData.transform(tfData)
    return tfidfData
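# A minimal usage sketch (hedged: the file name is illustrative; each element
# of `data` must be a list of tokens):
#
#   tokens = sc.textFile("docs.txt").map(lambda line: line.split(" "))
#   features = getFeatures(tokens)  # RDD of TF-IDF SparseVectors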