def textPredict(request):
    """6. Text clustering and popularity prediction."""
    label = request.POST['label']
    title = request.POST['title']

    conf = SparkConf().setAppName('textPredict').setMaster('spark://HP-Pavilion:7077')
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)

    # Process the dataset and build the feature vectors
    dfTitles = sqlContext.read.parquet('data/roll_news_sina_com_cn.parquet')
    print(dfTitles.dtypes)

    tokenizer = Tokenizer(inputCol="title", outputCol="words")
    wordsData = tokenizer.transform(dfTitles)
    hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=20)
    featurizedData = hashingTF.transform(wordsData)
    idf = IDF(inputCol="rawFeatures", outputCol="features")
    idfModel = idf.fit(featurizedData)
    rescaledData = idfModel.transform(featurizedData)
    rescaledData.show()
    for features_label in rescaledData.select("features", "rawFeatures").take(3):
        print(features_label)

    # Train the decision tree model
    labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(rescaledData)
    featureIndexer = \
        VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4).fit(rescaledData)
    (trainingData, testData) = rescaledData.randomSplit([0.7, 0.3])
    dt = DecisionTreeClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures")
    pipeline = Pipeline(stages=[labelIndexer, featureIndexer, dt])
    model = pipeline.fit(trainingData)

    # Test the model
    predictions = model.transform(testData)
    predictions.show()
    predictions.select("prediction", "indexedLabel", "features").show(5)

    # Test with user data: a single news item
    sentenceData = sqlContext.createDataFrame([
        (label, title),
    ], ['label', "title"])
    tokenizer = Tokenizer(inputCol="title", outputCol="words")
    wordsData = tokenizer.transform(sentenceData)
    hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=20)
    featurizedData = hashingTF.transform(wordsData)
    rescaledData = idfModel.transform(featurizedData)
    myprediction = model.transform(rescaledData)
    print("==================================================")
    myprediction.show()
    resultList = convertDfToList(myprediction)

    # Evaluate the model
    evaluator = MulticlassClassificationEvaluator(
        labelCol="indexedLabel", predictionCol="prediction", metricName="accuracy")
    accuracy = evaluator.evaluate(predictions)
    print("Test Error = %g " % (1.0 - accuracy))

    treeModel = model.stages[2]
    print(treeModel)

    sc.stop()
    return render(request, {'resultList': resultList})
def main(): spark = SparkSession.builder.appName("DBPediaSpark").getOrCreate() args = getResolvedOptions(sys.argv, ['S3_INPUT_BUCKET', 'S3_INPUT_KEY_PREFIX', 'S3_OUTPUT_BUCKET', 'S3_OUTPUT_KEY_PREFIX', 'S3_MODEL_BUCKET', 'S3_MODEL_KEY_PREFIX']) # This is needed to save RDDs which is the only way to write nested Dataframes into CSV format spark.sparkContext._jsc.hadoopConfiguration().set("mapred.output.committer.class", "org.apache.hadoop.mapred.FileOutputCommitter") # Defining the schema corresponding to the input data. The input data does not contain the headers schema = StructType([StructField("label", IntegerType(), True), StructField("title", StringType(), True), StructField("abstract", StringType(), True)]) # Download the data from S3 into two separate Dataframes traindf = spark.read.csv(('s3://' + os.path.join(args['S3_INPUT_BUCKET'], args['S3_INPUT_KEY_PREFIX'], 'train.csv')), header=False, schema=schema, encoding='UTF-8') validationdf = spark.read.csv(('s3://' + os.path.join(args['S3_INPUT_BUCKET'], args['S3_INPUT_KEY_PREFIX'], 'test.csv')), header=False, schema=schema, encoding='UTF-8') # Tokenize the abstract column which contains the input text tokenizer = Tokenizer(inputCol="abstract", outputCol="tokenized_abstract") # Save transformed training data to CSV in S3 by converting to RDD. transformed_traindf = tokenizer.transform(traindf) transformed_train_rdd = transformed_traindf.rdd.map(lambda x: (x.label, x.tokenized_abstract)) lines = transformed_train_rdd.map(csv_line) lines.coalesce(1).saveAsTextFile('s3://' + os.path.join(args['S3_OUTPUT_BUCKET'], args['S3_OUTPUT_KEY_PREFIX'], 'train')) # Similar data processing for validation dataset. transformed_validation = tokenizer.transform(validationdf) transformed_validation_rdd = transformed_validation.rdd.map(lambda x: (x.label, x.tokenized_abstract)) lines = transformed_validation_rdd.map(csv_line) lines.coalesce(1).saveAsTextFile('s3://' + os.path.join(args['S3_OUTPUT_BUCKET'], args['S3_OUTPUT_KEY_PREFIX'], 'validation')) # Serialize the tokenizer via MLeap and upload to S3 SimpleSparkSerializer().serializeToBundle(tokenizer, "jar:file:/tmp/model.zip", transformed_validation) # Unzip as SageMaker expects a .tar.gz file but MLeap produces a .zip file. import zipfile with zipfile.ZipFile("/tmp/model.zip") as zf: zf.extractall("/tmp/model") # Write back the content as a .tar.gz file import tarfile with tarfile.open("/tmp/model.tar.gz", "w:gz") as tar: tar.add("/tmp/model/bundle.json", arcname='bundle.json') tar.add("/tmp/model/root", arcname='root') s3 = boto3.resource('s3') file_name = os.path.join(args['S3_MODEL_KEY_PREFIX'], 'model.tar.gz') s3.Bucket(args['S3_MODEL_BUCKET']).upload_file('/tmp/model.tar.gz', file_name)
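# The repackaging step above exists because MLeap serializes to a .zip bundle while
# SageMaker expects a .tar.gz artifact. A small stand-alone sketch of that conversion,
# assuming the bundle keeps the bundle.json / root layout used above (the helper name,
# paths, and workdir are illustrative, not part of the original script):
import os
import tarfile
import zipfile

def zip_bundle_to_targz(zip_path, out_path, workdir="/tmp/bundle_extract"):
    """Repackage an MLeap .zip bundle into the .tar.gz layout SageMaker expects."""
    with zipfile.ZipFile(zip_path) as zf:
        zf.extractall(workdir)
    with tarfile.open(out_path, "w:gz") as tar:
        # An MLeap bundle contains a bundle.json plus a root/ directory.
        tar.add(os.path.join(workdir, "bundle.json"), arcname="bundle.json")
        tar.add(os.path.join(workdir, "root"), arcname="root")

# Usage (illustrative): zip_bundle_to_targz("/tmp/model.zip", "/tmp/model.tar.gz")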
def token(dataframe, in_col, out_col): tokenizer = Tokenizer(inputCol=in_col, outputCol=out_col) dataframe = tokenizer.transform(dataframe) dataframe.printSchema() return dataframe
def run_tf_idf_spark_ml(df, numFeatures=1 << 20): tokenizer = Tokenizer(inputCol="body", outputCol="words") wordsData = tokenizer.transform(df) hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=numFeatures) featurizedData = hashingTF.transform(wordsData) idf = IDF(inputCol="rawFeatures", outputCol="features") idfModel = idf.fit(featurizedData) return idfModel.transform(featurizedData)
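# A minimal usage sketch for run_tf_idf_spark_ml, assuming the function above is in
# scope together with the HashingTF/IDF/Tokenizer imports it relies on. The session
# name, toy rows, and the smaller numFeatures value are illustrative assumptions:
from pyspark.sql import SparkSession
from pyspark.ml.feature import HashingTF, IDF, Tokenizer

spark = SparkSession.builder.appName("tfidf_usage_sketch").master("local[*]").getOrCreate()

# The function only requires a string column named "body".
df = spark.createDataFrame(
    [("post-1", "spark makes tf idf easy"),
     ("post-2", "tf idf weights rare words higher")],
    ["id", "body"])

tfidf_df = run_tf_idf_spark_ml(df, numFeatures=1 << 10)  # smaller hash space for toy data
tfidf_df.select("id", "features").show(truncate=False)

spark.stop()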
def predictLabel(label, title, model):
    """Predict the label of a news item."""
    sentenceData = sqlContext.createDataFrame([
        (label, title),
    ], ['label', "title"])
    tokenizer = Tokenizer(inputCol="title", outputCol="words")
    wordsData = tokenizer.transform(sentenceData)
    hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=20)
    featurizedData = hashingTF.transform(wordsData)
    rescaledData = idfModel.transform(featurizedData)
    myprediction = model.transform(rescaledData)
    return myprediction
def create_features(raw_data): #Create DataFrame data_df = sqlContext.createDataFrame(raw_data.map(lambda r : Row(appid=r[0], price=r[1], sentence=r[2]))) #Transform sentence into words tokenizer = Tokenizer(inputCol='sentence', outputCol='words') words_df = tokenizer.transform(data_df) #Calculate term frequency hashingTF = HashingTF(inputCol='words', outputCol='rawFeatures', numFeatures=5) featurized_df = hashingTF.transform(words_df) #Calculate inverse document frequency idf = IDF(inputCol='rawFeatures', outputCol='features') idfModel = idf.fit(featurized_df) return idfModel.transform(featurized_df)
def preprocessing_titles(path,name): query = preprocessData(path) tokenizer = Tokenizer(inputCol="title", outputCol="tokenized_title") wordsData = tokenizer.transform(query) #after Stopword removal remover = StopWordsRemover(inputCol="tokenized_title", outputCol="filtered") wordsData= remover.transform(wordsData) df = wordsData.map(lambda x:x['id']).zipWithUniqueId().toDF(["id","index"]) df.registerTempTable("indices") wordsData.registerTempTable("words") qr = sqlContext.sql("SELECT index,words.id,filtered FROM indices JOIN words ON words.id = indices.id") if name!='': exportOnS3(qr,"s3a://redit-preprocessed/",name) qr = qr.map(lambda Row:(Row['index'],Row['id'],Row['filtered']))
def get_top_words(dataset, signatures):
    # TODO: Use stemmers for the languages supported by
    # http://www.nltk.org/api/nltk.stem.html#nltk.stem.snowball.SnowballStemmer
    # Or translate comments in other languages using the free Microsoft Translate API.
    sentenceData = dataset.filter(dataset['user_comments'].isNotNull() &
                                  (dataset['useragent_locale'].isNull() |
                                   (functions.instr(dataset['useragent_locale'], 'en') == 1)))

    if sentenceData.rdd.isEmpty():
        return dict()

    # Tokenize comments.
    tokenizer = Tokenizer(inputCol='user_comments', outputCol='words')
    wordsData = tokenizer.transform(sentenceData)

    # Remove duplicate words from comments.
    wordsData = wordsData.rdd.map(lambda p: (p['signature'], list(set(p['words'])))).reduceByKey(lambda x, y: x + y).toDF(['signature', 'words'])

    if wordsData.rdd.isEmpty():
        print("[WARNING]: wordsData is empty, sentenceData wasn't.")
        return dict()

    # Clean comment words by removing punctuation and stemming.
    def clean_word(w):
        return re.sub(r'[,.;:?!\[\]{}/\\]', '', stem(w.lower()))

    wordsData = wordsData.rdd.map(lambda p: (p['signature'], [clean_word(w) for w in p['words']])).toDF(['signature', 'words'])

    # XXX: Useless with TF-IDF?
    remover = StopWordsRemover(inputCol='words', outputCol='filtered')
    cleanWordsData = remover.transform(wordsData)

    cv = CountVectorizer(inputCol='filtered', outputCol='features')
    model = cv.fit(cleanWordsData)
    featurizedData = model.transform(cleanWordsData)

    idf = IDF(inputCol='features', outputCol='tfidf_features')
    idfModel = idf.fit(featurizedData)
    rescaledData = idfModel.transform(featurizedData)

    bests_per_doc = rescaledData.filter(rescaledData.signature.isin(signatures)).rdd.map(lambda p: (p['signature'], sorted(zip(p['tfidf_features'].indices, p['tfidf_features'].values), key=lambda i: i[1], reverse=True)[:10])).collect()

    return dict([(signature, [model.vocabulary[best] for best, val in bests]) for signature, bests in bests_per_doc])
def run_tf_idf_spark_mllib(df, numFeatures=1 << 20): tokenizer = Tokenizer(inputCol="body", outputCol="words") wordsData = tokenizer.transform(df) words = wordsData.select("words").rdd.map(lambda x: x.words) hashingTF = MllibHashingTF(numFeatures) tf = hashingTF.transform(words) tf.cache() idf = MllibIDF().fit(tf) tfidf = idf.transform(tf) # @TODO make this nicer tmp = sqlContext.createDataFrame(wordsData.rdd.zip(tfidf), ["data", "features"]) tmp.registerTempTable("tmp") old_columns = ', '.join(map(lambda x: 'data.%s' % x, wordsData.columns)) with_features = sqlContext.sql("SELECT %s, features FROM tmp" % old_columns) tmp = sqlContext.createDataFrame(with_features.rdd.zip(tf), ["data", "rawFeatures"]) tmp.registerTempTable("tmp") old_columns = ', '.join(map(lambda x: 'data.%s' % x, with_features.columns)) return sqlContext.sql("SELECT %s, rawFeatures FROM tmp" % old_columns)
from pyspark.ml.feature import Tokenizer, RegexTokenizer
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, udf
from pyspark.sql.types import IntegerType

if __name__ == "__main__":
    spark = SparkSession.builder.appName("TokenizerExample").getOrCreate()

    # $example on$
    sentenceDataFrame = spark.createDataFrame(
        [(0, "Hi I heard about Spark"),
         (1, "I wish Java 12 2 could use case classes"),
         (2, "Logistic,regression,models,are,neat")],
        ["id", "sentence"])

    tokenizer = Tokenizer(inputCol="sentence", outputCol="words")

    regexTokenizer = RegexTokenizer(inputCol="sentence", outputCol="words", pattern="\\W")
    # alternatively, pattern="\\w+", gaps(False)

    countTokens = udf(lambda words: len(words), IntegerType())

    tokenized = tokenizer.transform(sentenceDataFrame)
    tokenized.select("sentence", "words") \
        .withColumn("tokens", countTokens(col("words"))).show(truncate=False)

    regexTokenized = regexTokenizer.transform(sentenceDataFrame)
    regexTokenized.select("sentence", "words") \
        .withColumn("tokens", countTokens(col("words"))).show(truncate=False)

    print("What is here")
    # $example off$

    spark.stop()
from pyspark.sql import SparkSession if __name__ == "__main__": spark = SparkSession\ .builder\ .appName("TokenizerExample")\ .getOrCreate() # $example on$ sentenceDataFrame = spark.createDataFrame([ (0, "Hi I heard about Spark"), (1, "I wish Java could use case classes"), (2, "Logistic,regression,models,are,neat") ], ["label", "sentence"]) tokenizer = Tokenizer(inputCol="sentence", outputCol="words") regexTokenizer = RegexTokenizer(inputCol="sentence", outputCol="words", pattern="\\W") # alternatively, pattern="\\w+", gaps(False) tokenized = tokenizer.transform(sentenceDataFrame) for words_label in tokenized.select("words", "label").take(3): print(words_label) regexTokenized = regexTokenizer.transform(sentenceDataFrame) for words_label in regexTokenized.select("words", "label").take(3): print(words_label) # $example off$ spark.stop()
    (20, "apple iphone 6 tmobile 16gb"),
    (20, "Apple iPhone 6 (T Mobile) 16GB"),
    (20, "apple iphone 6 16gb t mobile"),
    (20, "Apple iPhone Apple iPhone 6 16GB 412 2 cell 2895"),
    (20, "iPhone 6 T Mobile 16 GB"),
    (20, "Apple 6 16gb T Mobile")
], ["label", "text"])

# Learn a mapping from words to Vectors.
#word2Vec = Word2Vec(vectorSize=3, minCount=0, inputCol="text", outputCol="textVec")
#model = word2Vec.fit(documentDF)
#result = model.transform(documentDF)
#print(result.take(2))

tokenizer = Tokenizer(inputCol="text", outputCol="tokenizedText")
tokenizedTextData = tokenizer.transform(documentDF)

hashingTF = HashingTF(inputCol="tokenizedText", outputCol="rawFeatures")
featurizedData = hashingTF.transform(tokenizedTextData)

idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
result1 = idfModel.transform(featurizedData)

for features_label in result1.select("label", "features").take(10):
    print(features_label)

wordsvectors = result1["label", "features"].map(lambda row: LabeledPoint(row[0], row[1]))
# COMMAND ---------- from pyspark.ml.feature import OneHotEncoder, StringIndexer lblIndxr = StringIndexer().setInputCol("color").setOutputCol("colorInd") colorLab = lblIndxr.fit(simpleDF).transform(simpleDF.select("color")) ohe = OneHotEncoder().setInputCol("colorInd") ohe.transform(colorLab).show() # COMMAND ---------- from pyspark.ml.feature import Tokenizer tkn = Tokenizer().setInputCol("Description").setOutputCol("DescOut") tokenized = tkn.transform(sales.select("Description")) tokenized.show(20, False) # COMMAND ---------- from pyspark.ml.feature import RegexTokenizer rt = RegexTokenizer()\ .setInputCol("Description")\ .setOutputCol("DescOut")\ .setPattern(" ")\ .setToLowercase(True) rt.transform(sales.select("Description")).show(20, False) # COMMAND ----------
tokenizer = Tokenizer(inputCol='text', outputCol='token_text')
stop_remove = StopWordsRemover(inputCol='token_text', outputCol='stop_token')
count_vec = CountVectorizer(inputCol='stop_token', outputCol='c_vec')
idf = IDF(inputCol='c_vec', outputCol='tf_idf')

# We convert 'spam' and 'ham' into numeric features (zeros and ones)
classes_to_numeric = StringIndexer(inputCol='class', outputCol='label')

from pyspark.ml.feature import VectorAssembler
data_features = VectorAssembler(inputCols=['tf_idf', 'length'], outputCol='features')

data = data.replace(['spam', 'ham'], ['1', '0'])
data = data.withColumn('class_num', data['class'].cast('float'))

data_1 = tokenizer.transform(data)
data_1 = stop_remove.transform(data_1)
data_1 = count_vec.fit(data_1).transform(data_1)
data_1 = idf.fit(data_1).transform(data_1)
data_1 = data_features.transform(data_1)

data_final = data_1.select('class', 'class_num', 'features')
print('Final data set up'.upper())
data_final.show()

train_data, test_data = data_final.randomSplit([0.7, 0.3])

# Whatever ML classification model can be used here
from pyspark.ml.classification import LogisticRegression
log_reg = LogisticRegression(labelCol='class_num', featuresCol='features')
spam_detector = log_reg.fit(train_data)
test_results = spam_detector.transform(test_data)
# -*- coding: utf-8 -*- from pyspark import SparkConf, SparkContext from pyspark.sql import SQLContext, Row, DataFrame from pyspark.ml.feature import HashingTF, IDF, Tokenizer conf = SparkConf().setAppName("tfidf").setMaster("spark://HP-Pavilion:7077") sc = SparkContext(conf=conf) sqlContext = SQLContext(sc) dfTitles = sqlContext.read.parquet("roll_news_sina_com_cn.parquet") print(dfTitles.dtypes) tokenizer = Tokenizer(inputCol="title", outputCol="words") wordsData = tokenizer.transform(dfTitles) hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=20) featurizedData = hashingTF.transform(wordsData) idf = IDF(inputCol="rawFeatures", outputCol="features") idfModel = idf.fit(featurizedData) rescaledData = idfModel.transform(featurizedData) rescaledData.show() for features_label in rescaledData.select("features", "rawFeatures").take(3): print(features_label) sc.stop()
info = lines.filter(lambda line: ' : ' in line or ' ----- ' in line).map(log_to_row)
df = spark.createDataFrame(
    info,
    "date_str string, thread_num int, operation int, level string, context string"
)

#######################################################################
def words_padding(x):
    return x.strftime("%b %d %H")

tokenizer = Tokenizer(inputCol="context", outputCol="context_words")
df = tokenizer.transform(df)
# indexer = StringIndexer(inputCol="context_words", outputCol="context_words_label")
# indexed = indexer.fit(df).transform(df)

#####################################################################
# Encode the log content. Currently unused; FP-Growth turns out to work without encoding.
def context_process(x):
    return x.split(' ')[0]

# Mapping over a DataFrame requires withColumn or select + udf; with select, the column must be renamed.
context_process = udf(context_process, StringType())
df = df.withColumn('one_context', context_process(df['context']))
from pyspark.sql import SQLContext from pyspark.sql.functions import desc, explode from pyspark.sql.types import * from storage import Sqlite PARTITIONS = 500 THRESHOLD = 50 if __name__ == "__main__": conf = SparkConf().setAppName("reddit") conf.set('spark.serializer', 'org.apache.spark.serializer.KryoSerializer') conf.set('spark.local.dir', '/mnt/work') conf.set('spark.driver.maxResultSize', '12g') sc = SparkContext(conf=conf) sqlContext = SQLContext(sc) fields = [StructField("subreddit", StringType(), True), StructField("body", StringType(), True)] rawDF = sqlContext.read.json("file:///mnt/s3/2015/*", StructType(fields)) # split comments into words tokenizer = Tokenizer(inputCol="body", outputCol="words") wordsDataFrame = tokenizer.transform(rawDF) remover = StopWordsRemover(inputCol="words", outputCol="filtered") filteredDataFrame = remover.transform(wordsDataFrame) # explode terms into individual rows termDataFrame = filteredDataFrame.select(['subreddit', explode(filteredDataFrame.filtered).alias("term")]) # group by subreddit and term, then count occurence of term in subreddits countsDataFrame = termDataFrame.groupBy(['subreddit', 'term']).count() db = Sqlite() countsDataFrame.select(['subreddit', 'term', 'count']).filter('count > {}'.format(THRESHOLD)).foreachPartition(db.saveSubredditWords)
from __future__ import print_function from pyspark.ml.feature import HashingTF, IDF, Tokenizer, NGram from pyspark.sql import SparkSession # Create the Spark session spark = SparkSession.builder.appName("Ngrams").getOrCreate() # Create the dataframe with five text abstracts abstracts = spark.read.text('abs*.txt') # Tokenize the abstract texts tokenizer = Tokenizer(inputCol="value", outputCol="words") wordsData = tokenizer.transform(abstracts) # Creating n-grams with n=5 ngram = NGram(n=5, inputCol="words", outputCol="ngrams") ngramDataFrame = ngram.transform(wordsData) # Apply topic frequency on the abstracts hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=40) featurizedData = hashingTF.transform(ngramDataFrame) # Calculate the inverse document frequency idf = IDF(inputCol="rawFeatures", outputCol="features") idfModel = idf.fit(featurizedData) rescaledData = idfModel.transform(featurizedData) # Display the results rescaledData.select("features").show(20, truncate=False)
    .builder \
    .appName("ml_classification") \
    .getOrCreate()

###########################################################################
#########        Tokenizing Training and Test Set                 #########

# test set
test_text = sc.textFile("data/test_clean" + str(part) + ".csv")
test_df = test_text.map(lambda x: (0, x)).toDF(["nothing", "sentence"])  # (0, x) is a workaround to add a dummy label column
tokenizer_test = Tokenizer(inputCol="sentence", outputCol="words")
wordsData_test = tokenizer_test.transform(test_df)
df_test = wordsData_test
nb_features_test = df_test.rdd.map(lambda x: len(x["words"])).sum()

# training set
text_positive = sc.textFile("data/training_positif_clean.csv")
text_negative = sc.textFile("data/training_negatif_clean.csv")

pos_labels = text_positive.map(lambda x: 1.0).zip(text_positive.map(lambda x: x))
neg_labels = text_negative.map(lambda x: 0.0).zip(text_negative.map(lambda x: x))

pos_df = pos_labels.toDF(["label", "sentence"])
neg_df = neg_labels.toDF(["label", "sentence"])
text_df = neg_df.union(pos_df)
def get_trending_news(rdd): if not rdd.isEmpty(): spark = getSparkSessionInstance(rdd.context.getConf()) df = spark.createDataFrame(rdd) # Append the title and summary together df_news_concat = df.withColumn("news_content", fn.concat_ws(" ", df.title, df.summary)) df_punc_removed = df_news_concat.withColumn( "news_content_removed", fn.regexp_replace(df_news_concat.news_content, "\p{Punct}", "")) udf_remove_unicode = fn.udf( lambda x: x.encode("ascii", "ignore").decode("ascii")) df_news_content_ascii = df_punc_removed.withColumn( "news_content_ascii", udf_remove_unicode(df_punc_removed.news_content_removed)) # insert raw data to the cassandra table df_news_content_ascii.select("id", "news_provider", "published", "summary", "title") \ .write \ .format("org.apache.spark.sql.cassandra") \ .mode("append") \ .options(table="travel_news_data", keyspace="news_stream_analysis") \ .save(mode="append") tokenizer = Tokenizer(inputCol="news_content_ascii", outputCol="content_words") df_tokenized_content = tokenizer.transform(df_news_content_ascii).drop( "news_content") remover = StopWordsRemover(inputCol="content_words", outputCol="filtered_words") stop_words = remover.loadDefaultStopWords("english") stop_words.extend([ '', "travel", "trip", "submitted", "abc", "reditt", "by", "time", "timing", "comments", "comment", "thank", "link", "im", "thanks", "would", "like", "get", "good", "go", "may", "also", "going", "dont", "want", "see", "take", "looking", "" ]) remover.setStopWords(stop_words) df_stop_words_removed = remover.transform(df_tokenized_content).drop( "content_words") cv = CountVectorizer(inputCol="filtered_words", outputCol="rawFeatures") cv_model = cv.fit(df_stop_words_removed) df_tf_data = cv_model.transform(df_stop_words_removed) df_features = df_tf_data.select( df_tf_data.rawFeatures.alias("features")) def convert_term_indices_to_term(term_indices, vocab): terms = [] for t in term_indices: terms.append(vocab[t]) return str(terms) # LDA lda = LDA(k=5, maxIter=50, learningOffset=8192.0, learningDecay=0.50) model = lda.fit(df_features) df_topics = model.describeTopics() fn_term_indices_to_term = fn.udf(convert_term_indices_to_term) vocab_lit = fn.array(*[fn.lit(k) for k in cv_model.vocabulary]) df_lda_result = df_topics.withColumn( "terms", fn_term_indices_to_term("termIndices", vocab_lit)) df_lda_result.select("topic", "termIndices", "terms").show(truncate=False) df_lda_result.cache() lda_terms = df_lda_result.select("terms").collect() lda_terms_list = [str(i.terms) for i in lda_terms] # based on model terms choose news stories for term_list in lda_terms_list: s = [] topic_words = term_list[1:-1].split(",") for term in topic_words: term = term.split("'")[1] s.append(r"(^|\W)" + str(term) + r"($|\W)") rx = '|'.join('(?:{0})'.format(x.strip()) for x in s) df_results = df_news_content_ascii.filter( df_news_content_ascii['news_content_ascii'].rlike(rx)) df_results = df_results.withColumn("topic_words", fn.lit(str(topic_words)[1:-1])) df_results = df_results.withColumn("results_date", fn.lit(datetime.datetime.now())) # insert results with the raw data to the cassandra table df_results.select("id", "news_provider", "published", "summary", "title", "topic_words", "results_date") \ .write \ .format("org.apache.spark.sql.cassandra") \ .mode("append") \ .options(table="travel_news_data_results", keyspace="news_stream_analysis") \ .save(mode="append")
cleanTweets = tweets.withColumn('hashtags',extractEntitiesUDF('text',F.lit(hashtagRegex)))\ .withColumn('mentions',extractEntitiesUDF('text',F.lit(mentionRegex)))\ .withColumn('cleanText',cleanTextUDF('text')) cleanTweets.select('text', 'cleanText', 'hashtags', 'mentions').show(5) # ## Step 6. Tokenize tweet text # We now want to take the cleansed tweet text and transform it into an array of tokens. To # do this, we need to: # * Tokenize each tweet text # * Remove any stop words in the text # * Stem any remaining words in the text # We will use Spark NLP functions to tokenize and remove stopwords, and NLTK to stem the words tokenizer = Tokenizer(inputCol="cleanText", outputCol="words") tokenizedTweets = tokenizer.transform(cleanTweets) swRemover = StopWordsRemover(inputCol="words", outputCol="nonStopWords") tokenizedTweetsNoSW = swRemover.transform(tokenizedTweets) def stem(words): stemmer = PorterStemmer() stemmed = [stemmer.stem(word) for word in words] return stemmed stemUDF = F.udf(stem, ArrayType(StringType())) stemmedTweets = tokenizedTweetsNoSW.withColumn('stemmedWords', stemUDF('nonStopWords'))
reviews.show()

%pyspark
reviews.createOrReplaceTempView("reviews2")
reviewdf = sqlContext.sql("SELECT CASE WHEN helpful/total_votes > 0.6 THEN 1 ELSE 0 END AS Helpful_Score, reviewText, reviewLength, summaryLength, category, reviewCount, overall FROM reviews2")
reviewdf.show(5)

%pyspark
import pyspark.sql.functions as func
from pyspark.ml.feature import HashingTF, IDF, Tokenizer
from pyspark.ml.feature import StopWordsRemover

tokenizer = Tokenizer(inputCol="reviewText", outputCol="words")
wordsData = tokenizer.transform(reviewdf)

# StopWords exclusion
remover = StopWordsRemover(inputCol="words", outputCol="filtered")
wordsData = remover.transform(wordsData)
wordsData = wordsData.select('filtered', 'summaryLength', 'reviewLength', 'Helpful_Score', 'overall', 'reviewCount')
wordsData.show(5)

%pyspark
# Hash TF on tokenized data
hashingTF = HashingTF(inputCol="filtered", outputCol="rawFeatures", numFeatures=50)
featurizedData = hashingTF.transform(wordsData)

# TF-IDF Vectorizer
idf = IDF(inputCol="rawFeatures", outputCol="features")
def preprocess_files(bucket_name, file_name): raw_data = sql_context.read.json("s3a://{0}/{1}".format( bucket_name, file_name)) # Clean article text print(colored("[PROCESSING]: Cleaning article text", "green")) clean_body = F.udf(lambda body: filter_body(body), StringType()) clean_article_data = raw_data.withColumn("cleaned_body", clean_body("text")) # Tokenize article text print(colored("[PROCESSING]: Tokenizing text vector...", "green")) tokenizer = Tokenizer(inputCol="cleaned_body", outputCol="text_body_tokenized") tokenized_data = tokenizer.transform(clean_article_data) # Remove stop words print(colored("[PROCESSING]: Removing stop words", "green")) stop_words_remover = StopWordsRemover( inputCol="text_body_tokenized", outputCol="text_body_stop_words_removed") stop_words_removed_data = stop_words_remover.transform(tokenized_data) # Stem words print(colored("Stemming tokenized text", "green")) stem = F.udf(lambda tokens: lemmatize(tokens), ArrayType(StringType())) stemmed_data = stop_words_removed_data.withColumn( "text_body_stemmed", stem("text_body_stop_words_removed")) # Shingle resulting body print(colored("Shingling resulting text", "green")) shingle = F.udf(lambda tokens: get_n_gram_shingles(tokens, 3), StringType()) shingled_data = stemmed_data.withColumn("text_body_shingled", shingle("text_body_stemmed")) shingle_table = shingled_data.select('id', 'text_body_shingled') print(colored("Adding category/id mappings to Redis", "green")) # Create a mapping of article categories to article id's that fall under that category. Each key is an article category and the values the list of article id's. cat_id_map = raw_data.select( F.explode('categories').alias('category'), 'id').groupBy(F.col('category')).agg( F.collect_list('id').alias('ids_list')).where( F.size(F.col('ids_list')) < 200).withColumn( 'ids', to_str_udf('ids_list')) print(colored("Beginning writing category/id mapping to Redis", "green")) def write_cat_id_map_to_redis(rdd): rdb = redis.StrictRedis(config.REDIS_SERVER, port=6379, db=0) for row in rdd: rdb.sadd('cat:{}'.format(row.category), row.ids) cat_id_map.foreachPartition(write_cat_id_map_to_redis) print(cat_id_map.show(5, True)) print(colored("Finished writing category/id mapping to Redis", "green")) #Minhash calculations k = 100 random_seed = 50 masks = (np.random.RandomState(seed=random_seed).randint( np.iinfo(np.int64).min, np.iinfo(np.int64).max, k)) def update_min_hash_signature(word, min_hash_signature): root_hash = mmh3.hash64(pickle.dumps(word))[0] word_hashes = np.bitwise_xor( masks, root_hash ) # XOR root hash with k randomly generated integers to simulate k hash functions min_hash_signature = np.minimum(min_hash_signature, word_hashes) return min_hash_signature def calc_min_hash_signature(tokens): min_hash_signature = np.empty(k, dtype=np.int64) min_hash_signature.fill(np.iinfo(np.int64).max) for token in tokens: min_hash_signature = update_min_hash_signature( token, min_hash_signature) return min_hash_signature def compute_minhash(df): calc_min_hash_udf = F.udf( lambda x: str( list(map(lambda x: int(x), calc_min_hash_signature(x)))), StringType()) df = df.withColumn("min_hash", calc_min_hash_udf("text_body_shingled")).select( 'id', 'min_hash') return df print(colored("Computing minhash values", "green")) minhash_df = compute_minhash(shingle_table) print(colored("Finished computing minhash values", "green")) print(colored("Beginning writing minhash data to Redis", "green")) # Write minhash data to redis. 
    # If pipeline=True, use the pipeline method of inserting data into Redis.
    def write_minhash_data_to_redis(rdd):
        rdb = redis.StrictRedis(config.REDIS_SERVER, port=6379, db=0)
        for row in rdd:
            rdb.sadd('id:{}'.format(row.id), row.min_hash)

    #print(minhash_df.show(5, True))
    minhash_df.foreachPartition(write_minhash_data_to_redis)
    print(colored("Finished writing minhash data to Redis", "green"))

    print(colored("[UPLOAD]: Writing preprocessed data to database...", "green"))
    # write_aws_s3(config.S3_BUCKET, config.S3_FOLDER_PREPROCESSED, shingled_data)

    cf = configparser.ConfigParser()
    cf.read('../config/db_properties.ini')
from pyspark.ml.feature import Tokenizer
from pyspark.sql import SparkSession

spark = SparkSession \
    .builder \
    .appName("tokenizer_sample") \
    .master("local[*]") \
    .getOrCreate()

data = [(0, "Tokenization is the process"), (1, "Refer to the Tokenizer")]
inputDF = spark.createDataFrame(data).toDF("id", "input")

tokenizer = Tokenizer(inputCol="input", outputCol="output")
outputDF = tokenizer.transform(inputDF)

outputDF.printSchema()
outputDF.show()

spark.stop()
label_message = label_message.withColumn( "msg", regexp_replace("msg", "sad|happy", "")) # union_message.show(100) # Hasing message # hashingTF = HashingTF(2000) # hashingTF = HashingTF() # hash_message = label_message.rdd.map(lambda row: (hashingTF.transform(row[0]), row[1])) # hash_message = spark.createDataFrame(hash_message, ["hash_msg", "is_happy"]) # hash_message = hash_message.withColumnRenamed("_1", "hash_msg") \ # .withColumnRenamed("_2", "is_happy") # label_message = label_message.withColumn("hash", hashingTF.transform(label_message.msg)) # label_message.show(100) # hash_message.show() tokenizer = Tokenizer(inputCol="msg", outputCol="token_msg") hash_message = tokenizer.transform(label_message) hasingTF = HashingTF(inputCol="token_msg", outputCol="hash_msg", numFeatures=2000) hash_message = hasingTF.transform(hash_message) # hash_message = label_message # Split messages into training and validation set label_indexer = StringIndexer(inputCol="is_happy", outputCol="indexed_label").fit(hash_message) feature_indexer = VectorIndexer(inputCol="hash_msg", outputCol="indexed_hash_msg").fit(hash_message) validation_set, training_set = hash_message.randomSplit([0.3, 0.7]) validation_set.show() training_set.show()
# Spark provides rich text analytics capabilities including nGram extraction, TF-IDF, # stop words removal, vectorization, and more that can be used to build machine learning # models based on textual data. # ### Sample of Maintenance Logs maintenance = spark.read.format("com.databricks.spark.csv").option("delimiter", "|")\ .load("maintenance/maintenance_logs.txt")\ .withColumnRenamed('_c0','date')\ .withColumnRenamed('_c1','note')\ .withColumnRenamed('_c2','duration')\ .withColumn('note', F.lower(F.regexp_replace('note', '[.!?-]', '')))\ .select(F.col('date').cast('date'), 'note', F.col('duration').cast('int')) maintenance.show(5, truncate=False) # ### Sample of 2-word nGrams on Maintenance Notes tk = Tokenizer(inputCol="note", outputCol="words") # Tokenize maintTokenized = tk.transform(maintenance) swr = StopWordsRemover(inputCol="words", outputCol="filtered") # Remove stop-words maintFiltered = swr.transform(maintTokenized) ngram = NGram(n=2, inputCol="filtered", outputCol="ngrams") # 2-word nGrams maintNGrams = ngram.transform(maintFiltered) maintNGrams.select('ngrams').show(5, truncate=False) # ### Topic Clustering using Latent Dirichlet Allocation (LDA) # LDA is a form of un-supervised machine learning that identifies clusters, or topics, # in the data cv = CountVectorizer(inputCol="ngrams", outputCol="features", vocabSize=50)\ .fit(maintNGrams) # CountVectorize converts nGram array into a vector of counts maintVectors = cv.transform(maintNGrams) vocabArray = cv.vocabulary lda = LDA(k=3, maxIter=10) ldaModel = lda.fit(maintVectors)
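# One possible follow-up to the LDA fit above (not part of the original notebook):
# map each topic's term indices back to the nGram vocabulary captured in vocabArray.
# The UDF-based lookup is an assumption about how one might display the topics.
from pyspark.sql import functions as F
from pyspark.sql.types import ArrayType, StringType

terms_udf = F.udf(lambda indices: [vocabArray[int(i)] for i in indices],
                  ArrayType(StringType()))

topics = ldaModel.describeTopics(maxTermsPerTopic=5) \
    .withColumn("terms", terms_udf(F.col("termIndices")))
topics.select("topic", "terms", "termWeights").show(truncate=False)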
from pyspark.ml.feature import HashingTF, IDF, Tokenizer
from pyspark.sql import SparkSession

spark = SparkSession \
    .builder \
    .appName("tf_idf_sample") \
    .master("local[*]") \
    .getOrCreate()

df1 = spark.createDataFrame([
    (0, "a a a b b c"),
    (0, "a b c"),
    (1, "a c a a d")
]).toDF("label", "sentence")

# Split each sentence into words
tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
df2 = tokenizer.transform(df1)

hashingTF = HashingTF(inputCol="words", outputCol="TF-Features", numFeatures=20)
df3 = hashingTF.transform(df2)

df3.cache()

idf = IDF(inputCol="TF-Features", outputCol="Final-Features")
idfModel = idf.fit(df3)
rescaledData = idfModel.transform(df3)

rescaledData.select("words", "TF-Features", "Final-Features").show()

spark.stop()
def sentimental_analysis(time, rdd):
    rdd = rdd.map(lambda x: json.loads(x[1]))
    text_array = rdd.collect()
    text_array = [element["text"].lower() for element in text_array if "text" in element]
    rdd = sc.parallelize(text_array)

    # Strip punctuation and noise tokens from the tweets
    rdd = rdd.map(lambda x: x.replace(',', ' ')).map(
        lambda x: x.replace('/', ' ')).map(lambda x: x.replace('?', ' ')).map(
        lambda x: x.replace('...', ' ')).map(lambda x: x.replace('-', ' '))
    rdd = rdd.map(lambda x: x.replace('.', ' ')).map(
        lambda x: x.replace('(', ' ')).map(lambda x: x.replace(')', ' ')).map(
        lambda x: x.replace('!', ' ')).map(lambda x: x.replace('|', ' '))
    rdd = rdd.map(lambda sn: ' '.join(
        filter(lambda x: x.startswith(('@', 'http', '"', '&', 'rt')) == False, sn.split())))

    tweets_MAGA = rdd.filter(lambda x: "maga" in x).map(lambda x: [x, "MAGA"])
    tweets_DICTATOR = rdd.filter(lambda x: "dictator" in x).map(lambda x: [x, "DICTATOR"])
    tweets_IMPEACH = rdd.filter(lambda x: "impeach" in x).map(lambda x: [x, "IMPEACH"])
    tweets_DRAIN = rdd.filter(lambda x: "drain" in x).map(lambda x: [x, "DRAIN"])
    tweets_SWAMP = rdd.filter(lambda x: "swamp" in x).map(lambda x: [x, "SWAMP"])
    tweets_COMEY = rdd.filter(lambda x: "comey" in x).map(lambda x: [x, "COMEY"])

    tweets = tweets_DICTATOR.union(tweets_IMPEACH).union(tweets_DRAIN).union(
        tweets_SWAMP).union(tweets_COMEY).union(tweets_MAGA)

    set_tweets = tweets.map(
        lambda x: Row(sentence=str.strip(x[0]), label=x[1], date_time=time))

    spark = getSparkSessionInstance(rdd.context.getConf())
    partsDF = spark.createDataFrame(set_tweets)

    tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
    tokenized = tokenizer.transform(partsDF)

    # Define the parameters of the StopWordsRemover function
    remover = StopWordsRemover(inputCol="words", outputCol="base_words")
    base_words = remover.transform(tokenized)

    train_data_row = base_words.select("base_words", "label", "date_time")

    word2vec = Word2Vec(vectorSize=3, minCount=0, inputCol="base_words", outputCol="features")
    model = word2vec.fit(train_data_row)
    final_train_data = model.transform(train_data_row)

    resul_analysis = classifier.transform(final_train_data)
    resul_analysis = resul_analysis.select("label", "date_time", "prediction")

    resul_analysis.createOrReplaceTempView("sentimental_analysis")
    resul_analysisDF = spark.sql(
        "select label, date_time, prediction, count(*) as total_label from sentimental_analysis group by label, date_time, prediction order by total_label"
    )
    resul_analysisDF.write.mode("append").saveAsTable("sentimental_analysis")
def training_data():
    rdd = sc.textFile("/user/sentimental_analysis/Subset100k.csv")
    #header = data.first()
    #rdd = data.filter(lambda row: row != header)
    r = rdd.mapPartitions(lambda x: csv.reader(x))
    #r2 = r.map(lambda x: (x[3], int(x[1])))

    # Apply a schema and build a DataFrame
    part = r.map(lambda x: Row(sentence=str.strip(x[3]), label=int(x[1])))
    spark = getSparkSessionInstance(rdd.context.getConf())
    partsDF = spark.createDataFrame(part)
    #partDF.show()

    # Define the parameters of the Tokenizer function
    tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
    # The tokenizer splits the sentences row by row
    tokenized = tokenizer.transform(partsDF)
    #tokenized.show()

    # Define the parameters of the StopWordsRemover function
    remover = StopWordsRemover(inputCol="words", outputCol="base_words")
    base_words = remover.transform(tokenized)
    #base_words.show()

    train_data_row = base_words.select("base_words", "label")

    word2vec = Word2Vec(vectorSize=3, minCount=0, inputCol="base_words", outputCol="features")
    model = word2vec.fit(train_data_row)
    final_train_data = model.transform(train_data_row)
    #final_train_data.show()
    final_train_data = final_train_data.select("label", "features")

    lr = LogisticRegression(maxIter=10000, regParam=0.001, elasticNetParam=0.0001)
    lrModel = lr.fit(final_train_data)
    #lrModel.transform(final_train_data).show()
    return lrModel


#------------------------------------------------ Validation data ---------------------------------------#
rdd = sc.textFile("/user/sentimental_analysis/Subset100k.csv")
r = rdd.mapPartitions(lambda x: csv.reader(x))
part = r.map(lambda x: Row(sentence=str.strip(x[3]), label=int(x[1])))
spark = getSparkSessionInstance(rdd.context.getConf())
partsDF = spark.createDataFrame(part)

tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
tokenized = tokenizer.transform(partsDF)

# Define the parameters of the StopWordsRemover function
remover = StopWordsRemover(inputCol="words", outputCol="base_words")
base_words = remover.transform(tokenized)

train_data_row = base_words.select("base_words", "label")

word2vec = Word2Vec(vectorSize=3, minCount=0, inputCol="base_words", outputCol="features")
model = word2vec.fit(train_data_row)
final_train_data = model.transform(train_data_row)
final_train_data = final_train_data.select("label", "features")
#lrModel.transform(final_train_data).show()

print("********************************** Everything under control ***********************************")
    .appName("SimpleApplication") \
    .getOrCreate()

print("Loading data...")
input_data = spark.sparkContext.wholeTextFiles(PATH)

print("Preparing data...")
prepared_data = input_data.map(lambda x: (get_patent_name(x[1]), get_claims(x[1]))) \
    .map(lambda x: (x[0], remove_punctuation(x[1]))) \
    .map(lambda x: (x[0], remove_linebreaks(x[1])))
prepared_df = prepared_data.toDF().selectExpr('_1 as patent_name', '_2 as patent_claims')

print("Splitting the text into tokens...")
tokenizer = Tokenizer(inputCol="patent_claims", outputCol="words")
words_data = tokenizer.transform(prepared_df)

print("Filtering tokens...")
filtered_words_data = words_data.rdd.map(lambda x: (x[0], x[1], get_only_words(x[2])))
filtered_df = filtered_words_data.toDF().selectExpr('_1 as patent_name', '_2 as patent_claims', '_3 as words')

print("Removing stop words...")
remover = StopWordsRemover(inputCol='words', outputCol='filtered')
filtered = remover.transform(filtered_df)

print("Computing features...")
vectorizer = CountVectorizer(inputCol='filtered', outputCol='raw_features').fit(filtered)
featurized_data = vectorizer.transform(filtered)
featurized_data.cache()

print("Computing relative feature frequencies...")
    .appName("e8_3") \
    .config("spark.some.config.option", "some-value") \
    .getOrCreate()

sc = spark.sparkContext
sqlContext = SQLContext(sc)

spamFile = sc.textFile("Spam.csv")
fileLineTokens = spamFile.map(lambda line: (line.split(",", 1)[0], line.split(",", 1)[1]))
fileDf = sqlContext.createDataFrame(fileLineTokens, ["categ", "text"])

training_data, testing_data = fileDf.randomSplit([0.8, 0.2])

categoryIndexerUnit = StringIndexer(inputCol="categ", outputCol="label")

tokenizerUnit = Tokenizer(inputCol="text", outputCol="words")
tokenizedWords = tokenizerUnit.transform(fileDf)

stoRemover = StopWordsRemover(inputCol="words", outputCol="clean_content")
tweetWordsData = stoRemover.transform(tokenizedWords)

hashingTermFrequecies = HashingTF(inputCol="clean_content", outputCol="rawFeatures", numFeatures=500)
featuredData = hashingTermFrequecies.transform(tweetWordsData)

idfData = IDF(inputCol="rawFeatures", outputCol="features")

naiveBaysClassifier = NaiveBayes(smoothing=1.0, modelType="multinomial")
# I didn't use the IDF model here because there is only one main document, and with
# hashed term frequencies Naive Bayes can already identify spam vs. non-spam.

pipeline = Pipeline(stages=[
    categoryIndexerUnit, tokenizerUnit, stoRemover, hashingTermFrequecies,
    idfData, naiveBaysClassifier
])
    #normalized = " ".join(lemma.lemmatize(word,'v') for word in words)
    text_out = " ".join(lemma.lemmatize(word, 'v') for word in text_out)
    return text_out

udf_cleantext = udf(cleanup_text, StringType())
clean_text = data.withColumn("clean_comm", udf_cleantext(data.comment_text))
#clean_text.select("clean_comm").show(3)

tokenizer = Tokenizer(inputCol="clean_comm", outputCol="tokens")
tokenized = tokenizer.transform(clean_text)
e = tokenized.select("clean_comm", "tokens")
#tokenized.select("clean_comm", "tokens").show(1)

remover = StopWordsRemover(inputCol="tokens", outputCol="tokens_Stop")
t = remover.transform(e)
tt = t.select('tokens_Stop')
#tt.show(2)

tt = tt.select('tokens_Stop')
tt = tt.limit(650000)  # you can choose the number of comments you want to run LDA on
tt.count()

#tt.toPandas().to_csv('cleaning.csv')  # Uncomment if you want to save the cleaned comments
spark_train = spark_train.filter(~isnull('reflection_period')) spark_train = spark_train.filter(~isnull('cleaned_hm')) spark_train = spark_train.filter(~isnull('num_sentence')) spark_train = spark_train.filter(~isnull('predicted_category')) #For mapping labels prediction_scores = spark_train.groupBy("predicted_category").count().orderBy(col("count").desc()) pd_df_train =prediction_scores.toPandas() pd_df_train['predict_score'] = np.arange(len(pd_df_train)) spark_df = spark.createDataFrame(pd_df_train) spark_df = spark_df.drop('count') spark_df = spark_df.selectExpr("predicted_category as predicted_category_table", "predict_score as predict_score") #Tokenizing and Vectorizing tok = Tokenizer(inputCol="cleaned_hm", outputCol="words") review_tokenized = tok.transform(spark_train) stopword_rm = StopWordsRemover(inputCol='words', outputCol='words_nsw') review_tokenized = stopword_rm.transform(review_tokenized) cv = CountVectorizer(inputCol='words_nsw', outputCol='tf') cvModel = cv.fit(review_tokenized) count_vectorized = cvModel.transform(review_tokenized) idf_ngram = IDF().setInputCol('tf').setOutputCol('tfidf') tfidfModel_ngram = idf_ngram.fit(count_vectorized) tfidf_df = tfidfModel_ngram.transform(count_vectorized) word_indexer_pc = StringIndexer(inputCol="predicted_category", outputCol="predicted_category_new", handleInvalid="error") #Splitting the training data into training data and validation data
outputCol="originalCategory") converted = converter.transform(dataIndexed) converted.show() labelLookup = converted.dropDuplicates(['indexedLabel', 'originalCategory' ]).select('indexedLabel', 'originalCategory') # something tokenizer = Tokenizer(inputCol="value", outputCol="words") remover = StopWordsRemover(inputCol=tokenizer.getOutputCol(), outputCol="filtered") tokenized = tokenizer.transform(converted) removed = remover.transform(tokenized) tokenized.show() removed.show() # Extracting Features hashingTF = HashingTF(inputCol=remover.getOutputCol(), outputCol="features", numFeatures=20) hashed = hashingTF.transform(removed) hashed.show() # Making Labeled Point
{'field': 'manufacturer', 'type': 'String'}, {'field': 'model', 'type': 'String'}, {'field': 'family', 'type': 'String'}, ] gazetteer = Gazetteer(fields) # read in listings from json file # specifying fields makes the parsing more efficient in Spark listing_fields = [StructField("title", StringType(), True), StructField("manufacturer", StringType(), True), StructField("currency", StringType(), True), StructField("price", StringType(), True), ] listings = sqlContext.read.json(LISTINGS_PATH, StructType(listing_fields)).distinct() # break listing title into words tokenizer = Tokenizer(inputCol="title", outputCol="words") listings = tokenizer.transform(listings) # read in products from json file product_fields = [StructField("product_name", StringType(), True), StructField("manufacturer", StringType(), True), StructField("family", StringType(), True), StructField("model", StringType(), True), ] products = sqlContext.read.json(PRODUCTS_PATH, StructType(product_fields))\ .fillna({'family': ''}) # replace nulls in family fields products_df, products_dict = canonical_format(products, Product) listings_df, listings_dict = canonical_format(listings, Listing) products_training_dict = json.load(open(PRODUCTS_TRAINING_PATH)) listings_training_dict = json.load(open(LISTINGS_TRAINING_PATH))
def transform(spark, s3_input_data, s3_output_train_data, s3_output_validation_data, s3_output_test_data): print('Processing {} => {}'.format(s3_input_data, s3_output_train_data, s3_output_validation_data, s3_output_test_data)) schema = StructType([ # StructField('is_positive_sentiment', IntegerType(), True), StructField('marketplace', StringType(), True), StructField('customer_id', StringType(), True), StructField('review_id', StringType(), True), StructField('product_id', StringType(), True), StructField('product_parent', StringType(), True), StructField('product_title', StringType(), True), StructField('product_category', StringType(), True), StructField('star_rating', IntegerType(), True), StructField('helpful_votes', IntegerType(), True), StructField('total_votes', IntegerType(), True), StructField('vine', StringType(), True), StructField('verified_purchase', StringType(), True), StructField('review_headline', StringType(), True), StructField('review_body', StringType(), True), StructField('review_date', StringType(), True) ]) df_csv = spark.read.csv(path=s3_input_data, sep='\t', schema=schema, header=True, quote=None) df_csv.show() # This dataset should already be clean, but always good to double-check print('Showing null review_body rows...') df_csv.where(col('review_body').isNull()).show() df_csv_cleaned = df_csv.na.drop(subset=['review_body']) df_csv_cleaned.where(col('review_body').isNull()).show() tokenizer = Tokenizer(inputCol='review_body', outputCol='words') wordsData = tokenizer.transform(df_csv_cleaned) hashingTF = HashingTF(inputCol='words', outputCol='raw_features', numFeatures=1000) featurizedData = hashingTF.transform(wordsData) # While applying HashingTF only needs a single pass to the data, applying IDF needs two passes: # 1) compute the IDF vector # 2) scale the term frequencies by IDF # Therefore, we cache the result of the HashingTF transformation above to speed up the 2nd pass featurizedData.cache() # spark.mllib's IDF implementation provides an option for ignoring terms # which occur in less than a minimum number of documents. # In such cases, the IDF for these terms is set to 0. # This feature can be used by passing the minDocFreq value to the IDF constructor. 
idf = IDF(inputCol='raw_features', outputCol='features') #, minDocFreq=2) idfModel = idf.fit(featurizedData) features_df = idfModel.transform(featurizedData) features_df.select('star_rating', 'features').show() # TODO: Use SVD instead # features_vector_rdd = features_df.select('features').rdd.map( lambda row: Vectors.fromML(row.getAs[MLVector]('features') ) # features_vector_rdd.cache() # mat = RowMatrix(features_vector_rdd) # k = 300 # svd = mat.computeSVD(k, computeU=True) # TODO: Reconstruct num_features = 300 pca = PCA(k=num_features, inputCol='features', outputCol='pca_features') pca_model = pca.fit(features_df) pca_features_df = pca_model.transform(features_df).select( 'star_rating', 'pca_features') pca_features_df.show(truncate=False) standard_scaler = StandardScaler(inputCol='pca_features', outputCol='scaled_pca_features') standard_scaler_model = standard_scaler.fit(pca_features_df) standard_scaler_features_df = standard_scaler_model.transform( pca_features_df).select('star_rating', 'scaled_pca_features') standard_scaler_features_df.show(truncate=False) expanded_features_df = (standard_scaler_features_df.withColumn( 'f', to_array(col('scaled_pca_features'))).select( ['star_rating'] + [col('f')[i] for i in range(num_features)])) expanded_features_df.show() train_df, validation_df, test_df = expanded_features_df.randomSplit( [0.9, 0.05, 0.05]) # Removed overwrite to test for this issue # https://stackoverflow.com/questions/51050591/spark-throws-java-io-ioexception-failed-to-rename-when-saving-part-xxxxx-gz train_df.write.csv(path=s3_output_train_data, header=None, quote=None) #, # mode='overwrite') print('Wrote to output file: {}'.format(s3_output_train_data)) # Removed overwrite to test for this issue # https://stackoverflow.com/questions/51050591/spark-throws-java-io-ioexception-failed-to-rename-when-saving-part-xxxxx-gz validation_df.write.csv(path=s3_output_validation_data, header=None, quote=None) #, # mode='overwrite') print('Wrote to output file: {}'.format(s3_output_validation_data)) # Removed overwrite to test for this issue # https://stackoverflow.com/questions/51050591/spark-throws-java-io-ioexception-failed-to-rename-when-saving-part-xxxxx-gz test_df.write.csv(path=s3_output_test_data, header=None, quote=None) #, # mode='overwrite') print('Wrote to output file: {}'.format(s3_output_test_data))
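# A self-contained sketch of the minDocFreq option mentioned in the comments above:
# terms that appear in fewer than minDocFreq documents receive an IDF of 0. The toy
# documents, column names, and session name here are illustrative assumptions.
from pyspark.sql import SparkSession
from pyspark.ml.feature import HashingTF, IDF, Tokenizer

spark = SparkSession.builder.appName("idf_min_doc_freq_sketch").master("local[*]").getOrCreate()

docs = spark.createDataFrame(
    [("this term is rare",),
     ("common words appear in every doc",),
     ("common words appear again",)],
    ["review_body"])

words = Tokenizer(inputCol="review_body", outputCol="words").transform(docs)
tf = HashingTF(inputCol="words", outputCol="raw_features", numFeatures=1 << 8).transform(words)
tf.cache()  # IDF makes two passes over the term frequencies

idf_model = IDF(inputCol="raw_features", outputCol="features", minDocFreq=2).fit(tf)
idf_model.transform(tf).select("features").show(truncate=False)

spark.stop()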
def makeWord2VecModel():
    cursor = News.find({})
    text = ""
    for news in cursor:
        text += news['text']
    with open(os.path.join(os.getcwd(), 'word2Vec.txt'), 'w', encoding='utf-8') as inputFile:
        inputFile.writelines(text)

    spark = SparkSession.builder.appName("SimpleApplication").getOrCreate()

    # Load the file into an RDD line by line
    input_file = spark.sparkContext.textFile('word2Vec.txt')
    print(input_file.collect())
    prepared = input_file.map(lambda x: ([x]))
    df = prepared.toDF()
    prepared_df = df.selectExpr('_1 as text')

    # Split into tokens
    tokenizer = Tokenizer(inputCol='text', outputCol='words')
    words = tokenizer.transform(prepared_df)

    # Remove stop words
    stop_words = StopWordsRemover.loadDefaultStopWords('russian')
    remover = StopWordsRemover(inputCol='words', outputCol='filtered', stopWords=stop_words)
    filtered = remover.transform(words)

    # Print the stop words for Russian
    print(stop_words)

    # Show the filtered table
    filtered.show()

    # Show the "words" column with the tokens before stop-word removal
    words.select('words').show(truncate=False, vertical=True)

    # Show the "filtered" column with the tokens after stop-word removal
    filtered.select('filtered').show(truncate=False, vertical=True)

    # Compute the TF values
    vectorizer = CountVectorizer(inputCol='filtered', outputCol='raw_features').fit(filtered)
    featurized_data = vectorizer.transform(filtered)
    featurized_data.cache()
    vocabulary = vectorizer.vocabulary

    # Show the table with the term-frequency values
    featurized_data.show()

    # Show the "raw_features" column of featurized_data
    featurized_data.select('raw_features').show(truncate=False, vertical=True)

    # Print the terms in the vocabulary
    print(vocabulary)

    # Compute the IDF values
    idf = IDF(inputCol='raw_features', outputCol='features')
    idf_model = idf.fit(featurized_data)
    rescaled_data = idf_model.transform(featurized_data)

    # Show the rescaled_data table
    rescaled_data.show()

    # Show the "features" column
    rescaled_data.select('features').show(truncate=False, vertical=True)

    # Build a Word2Vec model
    word2Vec = Word2Vec(vectorSize=3, minCount=0, inputCol='words', outputCol='result')
    model = word2Vec.fit(words)
    w2v_df = model.transform(words)
    w2v_df.show()

    persons = []
    cPersons = db.Persones.find({})
    for secName in cPersons:
        persons.append(secName['sName'])

    # findSynonyms returns a DataFrame with "word" and "similarity" columns
    synonyms = model.findSynonyms('погибла', 2)
    for word, cosine_distance in synonyms.collect():
        print(str(word))

    spark.stop()
sqlContext = SQLContext(sc) pdDF = pd.read_csv('Megadados-Projeto2/lyrics.csv') mySchema = StructType([ StructField("index", LongType(), True)\ ,StructField("song", StringType(), True)\ ,StructField("year", IntegerType(), True)\ ,StructField("artist", StringType(), True)\ ,StructField("genre", StringType(), True)\ ,StructField("lyrics", StringType(), True)]) df = sqlContext.createDataFrame(pdDF, schema=mySchema) tokenizer = Tokenizer(inputCol="lyrics", outputCol="words") wordsDataFrame = tokenizer.transform(df) #remove 20 most occuring documents, documents with non numeric characters, and documents with <= 3 characters cv_tmp = CountVectorizer(inputCol="words", outputCol="tmp_vectors") cv_tmp_model = cv_tmp.fit(wordsDataFrame) top20 = list(cv_tmp_model.vocabulary[0:20]) more_then_3_charachters = [ word for word in cv_tmp_model.vocabulary if len(word) <= 3 ] contains_digits = [ word for word in cv_tmp_model.vocabulary if any(char.isdigit() for char in word) ] stopwords = [] #Add additional stopwords in this list
def preprocess_tweets(tweets): tokenizer = Tokenizer(inputCol="text", outputCol="words") tweets = tokenizer.transform(tweets) remover = StopWordsRemover(inputCol="words", outputCol="filtered") tweets = remover.transform(tweets) return tweets
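# A minimal usage sketch for preprocess_tweets, assuming the function above is in scope.
# The tweets DataFrame, its rows, and the session name are illustrative assumptions;
# the function only needs a string column named "text".
from pyspark.sql import SparkSession
from pyspark.ml.feature import Tokenizer, StopWordsRemover

spark = SparkSession.builder.appName("preprocess_tweets_sketch").master("local[*]").getOrCreate()

tweets = spark.createDataFrame(
    [(1, "loving the new spark release"),
     (2, "the weather is not great today")],
    ["id", "text"])

cleaned = preprocess_tweets(tweets)
cleaned.select("text", "words", "filtered").show(truncate=False)

spark.stop()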
inferSchema=True, escape='"', multiLine=True) #filter all null values in company dataset companies = companies.filter(companies.description.isNotNull()) companies = companies.filter(companies.industry.isNotNull()) #join the two datasets joined_df = companies.join(all_data, companies['company name'] == all_data.company) joined_df.show() #generate tokenizer tokenizer = Tokenizer(inputCol='position', outputCol='token') all_data = tokenizer.transform(all_data) #generate ngrams ngram = NGram(n=2, inputCol="token", outputCol="ngrams") all_data = ngram.transform(all_data) #explode, split , group and count all_data.select(['ngrams', 'location']).select('location', F.explode('ngrams').alias('ngrams')) cities = all_data.select(['ngrams', 'location']).select( F.explode('ngrams').alias('ngrams'), F.split(all_data['location'], ',')[0].alias('city')) cities.groupBy(['ngrams', 'city']).count().orderBy("count", ascending=False).show()
return row_with_index(*[uid] + [row_dict.get(c) for c in columns]) return _make_row f = make_row(df.columns) indexed = (df.rdd .zipWithUniqueId() .map(lambda x: f(*x)) .toDF(StructType([StructField("id", LongType(), False)] + df.schema.fields))) ## tokenizing the reviews, removing stopwords, stemming and storing the results in a dataframe # tokenize tokenizer = Tokenizer(inputCol="text", outputCol="tokens") tokenized = tokenizer.transform(indexed) print 'done' # remove stop words stopwordList = ['','get','got','also','really','would','one','good','like','great','tri','love','two','three','took','awesome', 'me','bad','horrible','disgusting','terrible','fabulous','amazing','terrific','worst','best','fine','excellent','acceptable', 'my','exceptional','satisfactory','satisfying','super','awful','atrocious','unacceptable','poor','sad','gross','authentic', 'myself','cheap','expensive','we', 'our', 'ours', 'ourselves', 'you', 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', 'her', 'hers', 'herself', 'it', 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', 'should', 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', 'couldn', 'didn', 'doesn', 'hadn', 'hasn', 'haven', 'isn', 'ma', 'mightn', 'mustn', 'needn', 'shan', 'shouldn', 'wasn',
    # and return the result.
    return " ".join(meaningful_words)

stops = set(stopwords.words("english"))
lines = sc.textFile("s3://spark-project-data/unlabeledTrainData.tsv")
rows = lines.zipWithIndex().filter(lambda (row, index): index > 0).keys()
parts = rows.map(lambda l: l.split("\t"))
review = parts.map(lambda p: Row(id=p[0], label=float(p[1]), review=review_to_words(p[2])))
schemeReview = sqlContext.createDataFrame(review)

tokenizer = Tokenizer(inputCol="review", outputCol="words")
wordsData = tokenizer.transform(schemeReview)
hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=300)
featurizedData = hashingTF.transform(wordsData)
idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)

selectData = rescaledData.select("label", "features")
lp = selectData.map(lambda x: LabeledPoint(x.label, x.features))
(trainingData, testData) = lp.randomSplit([0.6, 0.4])
nb = NaiveBayes.train(trainingData, 1.0)
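# A minimal evaluation sketch (assumed, not part of the original snippet): score the held-out
# split with the trained mllib NaiveBayes model and report accuracy.
predictionAndLabel = testData.map(lambda p: (nb.predict(p.features), p.label))
accuracy = predictionAndLabel.filter(lambda pl: pl[0] == pl[1]).count() / float(testData.count())
print("Test accuracy = %g" % accuracy)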
parts = lines.map(lambda l: l.split(","))
f = parts.map(lambda p: Row(tindex=int(p[0]), packageName=p[1], packagePermissions=p[2],
                            label=int(float(p[3])), training=1))
linest = sc.textFile("/Users/admin/Desktop/KBSApp/KBSApp/permissionsData/dataSets/SVMDataGroundTruth.txt")
partst = linest.map(lambda l: l.split(","))
ft = partst.map(lambda p: Row(tindex=int(p[0]), packageName=p[1], packagePermissions=p[2],
                              label=int(float(p[3])), training=0))
alldata = f.union(ft)
schemaApp = sqlContext.createDataFrame(alldata)
schemaApp.registerTempTable("data")

tokenizer = Tokenizer(inputCol="packagePermissions", outputCol="perms")
permsData = tokenizer.transform(schemaApp)
hashingTF = HashingTF(inputCol="perms", outputCol="rawFeatures")
featurizedData = hashingTF.transform(permsData)
idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)

wordsvectors = rescaledData["label", "features"].map(lambda row: LabeledPoint(row[0], row[1]))
model = LogisticRegressionWithLBFGS.train(wordsvectors, iterations=100)
labelsAndPreds = wordsvectors.map(lambda p: (p.label, model.predict(p.features)))
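# A minimal sketch (assumed continuation): compute the training error from labelsAndPreds.
trainErr = labelsAndPreds.filter(lambda lp: lp[0] != lp[1]).count() / float(wordsvectors.count())
print("Training error = %g" % trainErr)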
from pyspark.sql import SQLContext

def preProcess(doc):
    clean = doc.replace("<br /><br />", " ")
    return clean.lower()

rdd = labeledRdd.map(lambda doc: (preProcess(doc[0]), doc[1]))
sqlContext = SQLContext(sc)
df = sqlContext.createDataFrame(rdd, ['review', 'label'])
dfTrain, dfTest = df.randomSplit([0.8, 0.2])

from pyspark.ml.feature import Tokenizer
tokenizer = Tokenizer(inputCol='review', outputCol='words')
dfTrainTok = tokenizer.transform(dfTrain)

# build a word -> index dictionary from the tokenized training set
import itertools
lists = dfTrainTok.map(lambda r: r.words).collect()
dictWords = set(itertools.chain(*lists))
dictionaryWords = {}
for i, word in enumerate(dictWords):
    dictionaryWords[word] = i
dict_broad = sc.broadcast(dictionaryWords)

from pyspark.mllib.linalg import SparseVector
def vectorize(row, dico):
    vector_dict = {}
    for w in row.words:
        if w in dico:
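# The snippet cuts off inside vectorize(); a plausible completion is sketched below (an
# assumption, not the original code): count in-dictionary words and emit a SparseVector sized
# to the broadcast dictionary.
from pyspark.mllib.linalg import SparseVector

def vectorize_sketch(row, dico):
    vector_dict = {}
    for w in row.words:
        if w in dico:
            vector_dict[dico[w]] = vector_dict.get(dico[w], 0) + 1.0
    return (row.label, SparseVector(len(dico), vector_dict))

# Example: build (label, sparse vector) pairs for the training split.
dfTrainVec = dfTrainTok.map(lambda row: vectorize_sketch(row, dict_broad.value))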
f = indexedTweets.map(lambda p: Row(tindex=int(p[1]), tweet=p[0][0], label=int(float(p[0][1])), training=1))
#f = parts.map(lambda p: Row(tweet=p[0], label=int(p[1])))
linest = sc.textFile("/home/ankita/MLProject/SVM/GroundTruth.txt")
partst = linest.map(lambda l: l.split(","))
indexedTweetst = partst.zipWithIndex().map(lambda (a, b): (a, b + trainingCount))
ft = indexedTweetst.map(lambda p: Row(tindex=int(p[1]), tweet=p[0][1], label=int(float(p[0][0])), training=0))
alldata = f.union(ft)
schemaTweets = sqlContext.createDataFrame(alldata)
schemaTweets.registerTempTable("data")

tokenizer = Tokenizer(inputCol="tweet", outputCol="words")
wordsData = tokenizer.transform(schemaTweets)
hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures")
featurizedData = hashingTF.transform(wordsData)
idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)
#rescaledData.collect()

wordsvectors = rescaledData.filter(rescaledData.training == 1)["label", "features"].map(
    lambda row: LabeledPoint(row[0], row[1]))
model = LogisticRegressionWithLBFGS.train(wordsvectors, iterations=100)
    review_text = BeautifulSoup(raw_review).text
    #
    # 2. Remove non-letters
    letters_only = re.sub("[^a-zA-Z]", " ", review_text)
    #
    # 3. Convert to lower case, split into individual words
    words = letters_only.lower().split()
    #
    # 4. Remove stop words
    meaningful_words = [w for w in words if not w in stops]
    #
    # 5. Join the words back into one string separated by space,
    # and return the result.
    return " ".join(meaningful_words)

stops = set(stopwords.words("english"))
lines = sc.textFile("s3://spark-project-data/labeledTrainData.tsv")
rows = lines.zipWithIndex().filter(lambda (row, index): index > 0).keys()
parts = rows.map(lambda l: l.split("\t"))
review = parts.map(lambda p: Row(id=p[0], label=float(p[1]), review=review_to_words(p[2])))
schemeReview = sqlContext.createDataFrame(review)

tokenizer = Tokenizer(inputCol="review", outputCol="words")
wordsData = tokenizer.transform(schemeReview)
hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=300)
featurizedData = hashingTF.transform(wordsData)
idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)
selectData = rescaledData.select("label", "features")
def trainModel(self):
    logger.info("Training the model...")
    query = '''select page_id, max(page_title) as page_title
               from cooladata
               where date_range(all) and page_id is not null
               group by page_id;'''

    def SQLtoURL(query):
        data = query.replace('\n', ' ').replace('\t', ' ').replace('   ', ' ').replace('  ', ' ')
        return data

    def QueryXXXXX(query, file=None):
        session = Session()
        response = session.post(data={'tq': query},
                                url='https://app.XXXXXX.com/api/v2/projects/115659/cql/',
                                headers={'Authorization': 'Token dtQvPVejNcSebX1EkU0AqB2TJRXznIgZiDvDu3HR'})
        return response.content

    table = json.loads(codecs.decode(QueryXXXXX(SQLtoURL(query)), 'utf-8'))['table']
    title_list = [x['c'] for x in table['rows']]
    table_cols = [d['label'] for d in table['cols']]

    def convert_row(row):
        rowlist = [d['v'] for d in row]
        return rowlist

    rd = self.sc.parallelize(title_list).map(convert_row)
    titleData = self.spark.createDataFrame(rd, table_cols)
    titleData = titleData.dropna()

    hebrew_stopwords = stop_words()

    def rmv(words):
        for punc in punctuation:
            words = words.replace(punc, "")
        for hword in hebrew_stopwords:
            words = words.replace(hword, " ")
        return words

    self.spark.udf.register("rmv", rmv, StringType())
    titleData.registerTempTable("wordstable")
    cleanedSentenceData = self.spark.sql(
        "select page_id, page_title, rmv(page_title) as cleanedSentence from wordstable")

    tokenizer = Tokenizer(inputCol="cleanedSentence", outputCol="words")
    wordsData = tokenizer.transform(cleanedSentenceData)
    cv = CountVectorizer(inputCol="words", outputCol="rawFeatures", minDF=2.0)
    cvModel = cv.fit(wordsData)
    featurizedData = cvModel.transform(wordsData)
    idf = IDF(inputCol="rawFeatures", outputCol="features")
    idfModel = idf.fit(featurizedData)
    rescaledData = idfModel.transform(featurizedData)

    lda = LDA(k=100)
    ldaModel = lda.fit(rescaledData)
    postFactorizedData = ldaModel.transform(rescaledData)

    norm = Normalizer(inputCol="topicDistribution", outputCol="normTopicDist")
    scaledFactorizedNormalizedData = norm.transform(postFactorizedData)

    self.model = scaledFactorizedNormalizedData
    logger.info("model is built!")
# limitations under the License.
#

from __future__ import print_function

from pyspark import SparkContext
from pyspark.sql import SQLContext
# $example on$
from pyspark.ml.feature import Tokenizer, RegexTokenizer
# $example off$

if __name__ == "__main__":
    sc = SparkContext(appName="TokenizerExample")
    sqlContext = SQLContext(sc)

    # $example on$
    sentenceDataFrame = sqlContext.createDataFrame([
        (0, "Hi I heard about Spark"),
        (1, "I wish Java could use case classes"),
        (2, "Logistic,regression,models,are,neat")
    ], ["label", "sentence"])
    tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
    wordsDataFrame = tokenizer.transform(sentenceDataFrame)
    for words_label in wordsDataFrame.select("words", "label").take(3):
        print(words_label)
    regexTokenizer = RegexTokenizer(inputCol="sentence", outputCol="words", pattern="\\W")
    # alternatively, pattern="\\w+", gaps(False)
    # $example off$

    sc.stop()
# LOADING DATA FROM HDFS TO SPARK DATAFRAME
df0 = spark.read.option("sep", "\t").option('header', True).csv(
    'hdfs://192.168.50.93:9000/user/hadoop/books2/amazon_reviews_us_Wireless_v1_00.tsv')
df0.printSchema()

# FILTERING FOR EMPTY VALUES
df01 = df0.filter((col("review_body").isNotNull()) & (col("verified_purchase").isNotNull()))

# ENCODING LABEL
stage_string = StringIndexer(inputCol="verified_purchase", outputCol="class_res")
ppl = Pipeline(stages=[stage_string])
df1 = ppl.fit(df01).transform(df01)

# CREATING TF_IDF
tokenizer = Tokenizer(inputCol="review_body", outputCol="words")
wordsData = tokenizer.transform(df1)
hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=20)
featurizedData = hashingTF.transform(wordsData)
idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)

# NAIVEBAYES
nb = NaiveBayes(featuresCol="features", labelCol="class_res")
# Model training
model = nb.fit(rescaledData)
# Model Saving
model.write().overwrite().save("./NB_model")
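# A minimal sketch (assumed, not in the original): reload the saved model and score new text.
# New reviews need the same Tokenizer/HashingTF/IDF transformations before scoring; reusing
# idfModel from above and the sample review text are assumptions for illustration.
from pyspark.ml.classification import NaiveBayesModel

nb_model = NaiveBayesModel.load("./NB_model")
new_reviews = spark.createDataFrame([("great phone, works as described",)], ["review_body"])
new_feats = idfModel.transform(hashingTF.transform(tokenizer.transform(new_reviews)))
nb_model.transform(new_feats).select("prediction", "probability").show(truncate=False)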
print "Create dataframe"
t0 = time()
df = sqlContext.createDataFrame(rdd, ['review', 'label'])
print "Showing first example : "
print
print df.first()
tt = time() - t0
print
print "Dataframe created in {} second".format(round(tt, 3))


# In[314]:

from pyspark.ml.feature import Tokenizer
tokenizer = Tokenizer(inputCol='review', outputCol='words')
dfTok = tokenizer.transform(df)


# In[315]:

from pyspark.ml.feature import NGram
bigram = NGram(inputCol="words", outputCol="bigrams")
dfBigram = bigram.transform(dfTok)


# In[317]:

print "Start tokenizing, computing bigrams and splitting between test and train"
t0 = time()
dfTrain, dfTest = dfBigram.randomSplit([0.8, 0.2])
dfTrain.take(1)
data = df.rdd.map(list)
print(data.first())
score = data.map(lambda s: 1.0 if s[1].isdigit() and float(s[1]) == 1.0 else 0.0)
comment = data.map(lambda s: s[3])
#words = comment.map(lambda w: "/".join(jieba.cut_for_search(w))).map(lambda line: line.split("/"))
split_neg_data2 = score.zip(comment)
tranform_data = split_neg_data2.map(lambda p: (p[0], p[1]))  #.toDF()#.withColumnRenamed('_1','label')
#tranform_data.show()
#sentenceData = spark.createDataFrame([(0, "I heard about Spark and I love Spark"), (0, "I wish Java could use case classes"), (1, "Logistic regression models are neat")]).toDF("label", "sentence")
sentenceData = spark.createDataFrame(tranform_data, ["label", "sentence"])

tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
wordsData = tokenizer.transform(sentenceData)

# compute TF-IDF
hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=3000)
featurizedData = hashingTF.transform(wordsData)
idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)

forData = StringIndexer().setInputCol("label").setOutputCol("indexed").fit(rescaledData).transform(rescaledData)
(trainingData, testData) = forData.randomSplit([0.8, 0.2], seed=0)
print(trainingData.take(1))

rfClassifier = RandomForestClassifier(numTrees=10,
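# The snippet cuts off in the middle of the RandomForestClassifier constructor. A plausible
# continuation is sketched below (an assumption, not the original code): train on the indexed
# label and evaluate accuracy on the held-out split.
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

rf = RandomForestClassifier(numTrees=10, labelCol="indexed", featuresCol="features")
rfModel = rf.fit(trainingData)
predictions = rfModel.transform(testData)
evaluator = MulticlassClassificationEvaluator(labelCol="indexed", predictionCol="prediction",
                                              metricName="accuracy")
print("Test accuracy = %g" % evaluator.evaluate(predictions))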
print "################"

# Step 0: clean the data
print "check the types"
#print fulldata.dtypes
print "################"

print "CLEANING Data:"
fulldata["product_title_clean"] = fulldata["product_title"].apply(cf.rmP)
#print fulldata["product_title_clean"].head()

# TF-IDF features
# Step 1: split text field into words
print "STEP 1################"
tokenizer = Tokenizer(inputCol="product_title_clean", outputCol="words_title")
fulldata = tokenizer.transform(fulldata)
print "Tokenized Title:"
print fulldata.head()
print "################"

# Step 2: compute term frequencies
hashingTF = HashingTF(inputCol="words_title", outputCol="tf")
fulldata = hashingTF.transform(fulldata)
print "TERM frequencies:"
print fulldata.head()
print "################"

# Step 3: compute inverse document frequencies
idf = IDF(inputCol="tf", outputCol="tf_idf")
idfModel = idf.fit(fulldata)
fulldata = idfModel.transform(fulldata)
print "IDF :"
print fulldata.head()
if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("TfIdfExample")\
        .getOrCreate()

    # $example on$
    sentenceData = spark.createDataFrame([
        (0, "Hi I heard about Spark"),
        (0, "I wish Java could use case classes"),
        (1, "Logistic regression models are neat")
    ], ["label", "sentence"])
    tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
    wordsData = tokenizer.transform(sentenceData)
    hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=20)
    featurizedData = hashingTF.transform(wordsData)
    # alternatively, CountVectorizer can also be used to get term frequency vectors
    idf = IDF(inputCol="rawFeatures", outputCol="features")
    idfModel = idf.fit(featurizedData)
    rescaledData = idfModel.transform(featurizedData)
    for features_label in rescaledData.select("features", "label").take(3):
        print(features_label)
    # $example off$

    spark.stop()
airportCleanDF = airportCleanDF.withColumn("airport_staff_ratingf", fn.col("airport_staff_rating").cast("float"))
airportCleanDF = airportCleanDF.withColumn("recommendedi", fn.col("recommended").cast("integer"))
airportCleanDF = reduce(DataFrame.drop,
                        ['overall_rating', 'queuing_rating', 'terminal_cleanliness_rating',
                         'terminal_seating_rating', 'terminal_signs_rating', 'food_beverages_rating',
                         'airport_shopping_rating', 'wifi_connectivity_rating', 'airport_staff_rating',
                         'recommended'],
                        airportCleanDF)


# In[7]:

from pyspark.ml.feature import Tokenizer
tokenizer = Tokenizer().setInputCol('content').setOutputCol('words')


# In[8]:

airlineCleanDF = airlineCleanDF.na.drop(subset=["content"])  # Remove rows with NULL in column 'content'
tokenizer.transform(airlineCleanDF)


# In[7]:

#tokenizer.transform(airlineCleanDF).show(5)


# In[5]:

from pyspark.ml.feature import CountVectorizer


# In[9]:

count_vectorizer_estimator = CountVectorizer().setInputCol('words').setOutputCol('features')
# COMMAND ----------

summary = model.summary
print model.weights
model.gaussiansDF.show()
summary.cluster.show()
summary.clusterSizes
summary.probability.show()


# COMMAND ----------

from pyspark.ml.feature import Tokenizer, CountVectorizer
tkn = Tokenizer().setInputCol("Description").setOutputCol("DescOut")
tokenized = tkn.transform(sales.drop("features"))
cv = CountVectorizer()\
  .setInputCol("DescOut")\
  .setOutputCol("features")\
  .setVocabSize(500)\
  .setMinTF(0)\
  .setMinDF(0)\
  .setBinary(True)
cvFitted = cv.fit(tokenized)
prepped = cvFitted.transform(tokenized)


# COMMAND ----------

from pyspark.ml.clustering import LDA
lda = LDA().setK(10).setMaxIter(5)
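# COMMAND ----------

# A plausible continuation (an assumption, not the original notebook cell): fit the LDA model
# on the CountVectorizer output and map topic term indices back to vocabulary words.
ldaModel = lda.fit(prepped)
ldaModel.describeTopics(3).show()
vocab = cvFitted.vocabulary  # index -> term lookup for interpreting describeTopics output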
# TF: Term Frequency ---> importance of the term within a given document
# IDF: importance of the term in the corpus (full dictionary of words and documents)
spark = SparkSession.builder.appName('NLP').getOrCreate()

sen_df = spark.createDataFrame([(0, 'Hi I heard about Spark'),
                                (1, 'I whish java could use case classes'),
                                (2, 'Logistic,regression,models,are,neat')],
                               ['id', 'sentence'])
sen_df.show()

tokenizer = Tokenizer(inputCol='sentence', outputCol='words')
regex_tokenizer = RegexTokenizer(inputCol='sentence', outputCol='words', pattern='\\W')
count_tokens = udf(lambda words: len(words), IntegerType())

tokenized = tokenizer.transform(sen_df)
tokenized.withColumn('tokens', count_tokens(col("words"))).show()

# Remove commas INSIDE the words
rg_tokenized = regex_tokenizer.transform(sen_df)
rg_tokenized.withColumn('tokens', count_tokens(col('words'))).show()

# Remove stop words
sentence_df = spark.createDataFrame([(0, ['I', 'saw', 'the', 'green', 'horse']),
                                     (1, ['Mary', 'had', 'a', 'little', 'lamb'])],
                                    ['id', 'tokens'])
sentence_df.show()
remover = StopWordsRemover(inputCol='tokens', outputCol='filtered')
remover.transform(sentence_df).show()

# n-gram
wordDataFrame = spark.createDataFrame([
import argparse

from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext, Row
from pyspark.ml.feature import Tokenizer


def filter_comments(df):
    return df.filter(df['author'] != '[deleted]') \
             .filter(df['body'] != '[deleted]') \
             .filter(df['body'] != '[removed]')


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Reddit Comment Prediction')
    parser.add_argument('-i', '--input_file', type=str,
                        help="""The CSV input data file that contains the raw comment data""")
    args = parser.parse_args()

    sc = SparkContext("local", "Prediction")
    sqlContext = SQLContext(sc)

    df = sqlContext.read.json(args.input_file)
    print 'Loaded input file {} with {} total comments'.format(args.input_file, df.count())

    filtered = filter_comments(df)
    print '{} comments after filtering'.format(filtered.count())

    tokenizer = Tokenizer(inputCol="body", outputCol="words")
    wordsDataFrame = tokenizer.transform(filtered)
    wordsDataFrame.select("body", "words").show()
from pyspark.ml.feature import StopWordsRemover
from pyspark.ml.feature import Tokenizer, CountVectorizer
from pyspark.ml.clustering import LDA
from pyspark.sql import functions as F
from pyspark.sql.types import StructType, StructField, LongType
from pyspark.sql.functions import col

spark = SparkSession.builder.appName('Project').getOrCreate()
dataset = spark.read.csv("reviews.tbl", inferSchema=True, header=True, sep='|')
dataset.createTempView("product_reviews")

q = "SELECT CASE pr_rating WHEN 1 THEN 'NEG' WHEN 2 THEN 'NEG' WHEN 3 THEN 'NEU' WHEN 4 THEN 'POS' WHEN 5 THEN 'POS' END AS pr_r_rating, pr_content FROM product_reviews WHERE pmod(pr_review_id, 5) IN (1,2,3)"
df = spark.sql(q).toDF("label", "sentence")

tokenizer = Tokenizer(inputCol="sentence", outputCol="tokens")
wordsData = tokenizer.transform(df)

# remove stop words
remover = StopWordsRemover(inputCol="tokens", outputCol="words")
cleaned = remover.transform(wordsData)

# vectorize
cv = CountVectorizer(inputCol="words", outputCol="features")
count_vectorizer_model = cv.fit(cleaned)
result = count_vectorizer_model.transform(cleaned)
#corpus = result.select('vectors').rdd.map(lambda x: Row(x[0])).toDF()
#corpus = corpus.select(col("_1").alias("features"))

ldaModel = LDA(k=4, maxIter=100)
model = ldaModel.fit(result)

# extracting topics
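# A minimal sketch (assumed continuation) of the "extracting topics" step: list the top terms
# per LDA topic by mapping term indices back through the CountVectorizer vocabulary.
vocab = count_vectorizer_model.vocabulary
topics = model.describeTopics(maxTermsPerTopic=10)
for row in topics.collect():
    terms = [vocab[i] for i in row['termIndices']]
    print("Topic {}: {}".format(row['topic'], ", ".join(terms)))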
    hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=20)
    featurizedData = hashingTF.transform(wordsData)
    rescaledData = idfModel.transform(featurizedData)
    myprediction = model.transform(rescaledData)
    return myprediction

"""Connect to the master"""
conf = SparkConf().setAppName('tfidf').setMaster('spark://HP-Pavilion:7077')
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)

"""Process the dataset and generate feature vectors"""
dfTitles = sqlContext.read.parquet('roll_news_sina_com_cn.parquet')
print(dfTitles.dtypes)
tokenizer = Tokenizer(inputCol="title", outputCol="words")
wordsData = tokenizer.transform(dfTitles)
hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=20)
featurizedData = hashingTF.transform(wordsData)
idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)
rescaledData.show()
for features_label in rescaledData.select("features", "rawFeatures").take(3):
    print(features_label)

"""Train the decision tree model"""
# Index labels, adding metadata to the label column.
# Fit on whole dataset to include all labels in index.
labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(rescaledData)
# Automatically identify categorical features, and index them.
# We specify maxCategories so features with > 4 distinct values are treated as continuous.
featureIndexer =\
def strip_tags(html):
    return parser.unescape(expression.sub('', html))

strip_tags_udf = udf(strip_tags)

tokenizer = Tokenizer(inputCol="comment_clean", outputCol="words")
stopWordsRemover = StopWordsRemover(inputCol="words", outputCol="tokens")

# Load data
comments = sqlContext.read.json(fn)

# Calculate the tokens dataframe as one pipeline
tokens = stopWordsRemover.transform(
        tokenizer.transform(
            comments.withColumn("comment_clean", strip_tags_udf(comments["comment_text"]))
        )
    )\
    .select(explode("tokens").alias("token"))\
    .groupBy("token")\
    .count()\
    .orderBy("count", ascending=False)\
    .select("count")\
    .limit(1000)

# Switch to Pandas
tokens_pdf = tokens.toPandas()
tokens_pdf = tokens_pdf.iloc[1:]  # drop the first row; .iloc replaces the deprecated .ix
tokens_pdf["rank"] = range(1, tokens_pdf.shape[0] + 1)
print(tokens_pdf.head())