def tf_idf_feature(wordsData):
    hashingTF = HashingTF(inputCol="filtered", outputCol="rawFeatures", numFeatures=20)
    featurizedData = hashingTF.transform(wordsData)
    idf = IDF(inputCol="rawFeatures", outputCol="features")
    idfModel = idf.fit(featurizedData)
    rescaledData = idfModel.transform(featurizedData)
    for features_label in rescaledData.select("features", "id").take(3):
        print(features_label)
def textPredict(request):
    """6. Text clustering and popularity prediction."""
    label = request.POST['label']
    title = request.POST['title']

    conf = SparkConf().setAppName('textPredict').setMaster('spark://HP-Pavilion:7077')
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)

    """Process the dataset and generate feature vectors."""
    dfTitles = sqlContext.read.parquet('data/roll_news_sina_com_cn.parquet')
    print(dfTitles.dtypes)

    tokenizer = Tokenizer(inputCol="title", outputCol="words")
    wordsData = tokenizer.transform(dfTitles)
    hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=20)
    featurizedData = hashingTF.transform(wordsData)
    idf = IDF(inputCol="rawFeatures", outputCol="features")
    idfModel = idf.fit(featurizedData)
    rescaledData = idfModel.transform(featurizedData)
    rescaledData.show()
    for features_label in rescaledData.select("features", "rawFeatures").take(3):
        print(features_label)

    """Train the decision tree model."""
    labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(rescaledData)
    featureIndexer = \
        VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4).fit(rescaledData)
    (trainingData, testData) = rescaledData.randomSplit([0.7, 0.3])
    dt = DecisionTreeClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures")
    pipeline = Pipeline(stages=[labelIndexer, featureIndexer, dt])
    model = pipeline.fit(trainingData)

    """Test the model."""
    predictions = model.transform(testData)
    predictions.show()
    predictions.select("prediction", "indexedLabel", "features").show(5)

    """Predict on user data: a single news item."""
    sentenceData = sqlContext.createDataFrame([
        (label, title),
    ], ['label', "title"])
    tokenizer = Tokenizer(inputCol="title", outputCol="words")
    wordsData = tokenizer.transform(sentenceData)
    hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=20)
    featurizedData = hashingTF.transform(wordsData)
    rescaledData = idfModel.transform(featurizedData)
    myprediction = model.transform(rescaledData)
    print("==================================================")
    myprediction.show()
    resultList = convertDfToList(myprediction)

    """Evaluate the model."""
    evaluator = MulticlassClassificationEvaluator(labelCol="indexedLabel",
                                                  predictionCol="prediction",
                                                  metricName="precision")
    accuracy = evaluator.evaluate(predictions)
    print("Test Error = %g " % (1.0 - accuracy))

    treeModel = model.stages[2]
    print(treeModel)

    sc.stop()
    return render(request, {'resultList': resultList})
def extract_idf_features(p_df, input_col, output_col):
    """
    Extracts IDF features.
    :param p_df: A DataFrame.
    :param input_col: Name of the input column.
    :param output_col: Name of the output column.
    :return: A DataFrame.
    """
    idf = IDF(inputCol=input_col, outputCol=output_col)
    idfModel = idf.fit(p_df)
    return idfModel.transform(p_df)
def run_tf_idf_spark_ml(df, numFeatures=1 << 20):
    tokenizer = Tokenizer(inputCol="body", outputCol="words")
    wordsData = tokenizer.transform(df)
    hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=numFeatures)
    featurizedData = hashingTF.transform(wordsData)
    idf = IDF(inputCol="rawFeatures", outputCol="features")
    idfModel = idf.fit(featurizedData)
    return idfModel.transform(featurizedData)
def tfidf(dataframe, in_col1, out_col1, in_col2, out_col2, n):
    global idfModel
    hashingTF = HashingTF(inputCol=in_col1, outputCol=out_col1, numFeatures=n)
    featurizedData = hashingTF.transform(dataframe)
    idf = IDF(inputCol=in_col2, outputCol=out_col2)
    idfModel = idf.fit(featurizedData)
    dataframe = idfModel.transform(featurizedData)
    return dataframe
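# A minimal usage sketch for the tfidf() helper above. `raw_df` is an assumed
# DataFrame with a free-text column named "text"; the column names and the
# 2**18 feature count are illustrative, not taken from the original code. The
# fitted IDFModel is kept in the module-level `idfModel` so the same IDF
# weights can later be applied to unseen data.
from pyspark.ml.feature import Tokenizer

tokenized_df = Tokenizer(inputCol="text", outputCol="words").transform(raw_df)
tfidf_df = tfidf(tokenized_df, "words", "rawFeatures", "rawFeatures", "features", 1 << 18)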
def create_features(raw_data):
    # Create DataFrame
    data_df = sqlContext.createDataFrame(
        raw_data.map(lambda r: Row(appid=r[0], price=r[1], sentence=r[2])))
    # Transform sentence into words
    tokenizer = Tokenizer(inputCol='sentence', outputCol='words')
    words_df = tokenizer.transform(data_df)
    # Calculate term frequency
    hashingTF = HashingTF(inputCol='words', outputCol='rawFeatures', numFeatures=5)
    featurized_df = hashingTF.transform(words_df)
    # Calculate inverse document frequency
    idf = IDF(inputCol='rawFeatures', outputCol='features')
    idfModel = idf.fit(featurized_df)
    return idfModel.transform(featurized_df)
def tf_feature_vectorizer(df, no_of_features, ip_col):
    # from pyspark.sql.functions import udf
    # from pyspark.sql.types import *
    output_raw_col = ip_col + "raw_features"
    output_col = ip_col + "features"
    hashingTF = HashingTF(inputCol=ip_col, outputCol=output_raw_col, numFeatures=no_of_features)
    featurizedData = hashingTF.transform(df)
    idf = IDF(inputCol=output_raw_col, outputCol=output_col)
    idfModel = idf.fit(featurizedData)
    rescaled_data = idfModel.transform(featurizedData)
    rescaled_data.show(5)
    print(rescaled_data.count())
    return rescaled_data
def makeTFIDF(sc, spark, reviews):
    # count vectorizer and tfidf
    # cv = CountVectorizer(inputCol='words_clean', outputCol='tf')
    # cvModel = cv.fit(reviews)
    # reviews = cvModel.transform(reviews)

    # HashingTF for fewer dimensions:
    hashingtf = HashingTF(inputCol='words_clean', outputCol='tf', numFeatures=1000)
    reviews = hashingtf.transform(reviews)

    # create TF-IDF matrix
    idf = IDF().setInputCol('tf').setOutputCol('tfidf')
    tfidfModel = idf.fit(reviews)
    reviews = tfidfModel.transform(reviews)
    # return the transformed DataFrame so callers can use the 'tfidf' column
    return reviews
def test_idf(self):
    dataset = self.spark.createDataFrame([
        (DenseVector([1.0, 2.0]),),
        (DenseVector([0.0, 1.0]),),
        (DenseVector([3.0, 0.2]),)], ["tf"])
    idf0 = IDF(inputCol="tf")
    self.assertListEqual(idf0.params, [idf0.inputCol, idf0.minDocFreq, idf0.outputCol])
    idf0m = idf0.fit(dataset, {idf0.outputCol: "idf"})
    self.assertEqual(idf0m.uid, idf0.uid,
                     "Model should inherit the UID from its parent estimator.")
    output = idf0m.transform(dataset)
    self.assertIsNotNone(output.head().idf)
    # Test that parameters transferred to Python Model
    check_params(self, idf0m)
def tf_idf(df, column):
    """
    Compute TF-IDF of a corpus.
    Transformation: array<string> --> vector
    """
    df = preprocess(df, column)  # text to list of terms
    (df, voc) = count(df, column)
    # creates a TF-IDF model and uses it to compute the feature vector.
    idf = IDF(inputCol=column, outputCol='_' + column)
    model = idf.fit(df)
    df = model.transform(df)
    df = replace(df, column, '_' + column)
    return (df, voc)
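# The preprocess/count/replace helpers used by tf_idf() are not shown in this
# snippet. Below is a minimal sketch of what count() is assumed to do, based on
# how it is called above (wrap CountVectorizer, write the term-count vector
# back into the same column, and return the vocabulary); this is an
# illustrative guess, not the original implementation.
from pyspark.ml.feature import CountVectorizer


def count(df, column):
    cv = CountVectorizer(inputCol=column, outputCol='_cv_' + column)
    model = cv.fit(df)
    counted = model.transform(df).drop(column).withColumnRenamed('_cv_' + column, column)
    return (counted, model.vocabulary)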
def append_tf_idf(self, df):
    """
    Calculate term frequency and inverse document frequency
    based on at least 1 visit hourly in this case. Compares how often the
    tokens appeared at least once per hour compared to other tokens.
    Not used for the main purpose of the project.

    Args:
        :param df: Dataframe parameter.

    Returns:
        :return: Dataframe with term frequency and inverse document frequency
        added in the columns 'rawFeatures' and 'features' respectively.
    """
    # Create TF column.
    hashingTF = HashingTF(inputCol="tokens", outputCol="rawFeatures", numFeatures=100000)
    tf = hashingTF.transform(df)
    tf.persist(StorageLevel.MEMORY_AND_DISK)
    # Create IDF column.
    idf = IDF(inputCol="rawFeatures", outputCol="features")
    idfModel = idf.fit(tf)
    tfidf = idfModel.transform(tf)
    return tfidf
def get_top_words(dataset, signatures):
    # TODO: Use stemmers for the languages supported by
    # http://www.nltk.org/api/nltk.stem.html#nltk.stem.snowball.SnowballStemmer
    # Or translate comments in other languages using the free Microsoft Translate API.
    sentenceData = dataset.filter(
        dataset['user_comments'].isNotNull() &
        (dataset['useragent_locale'].isNull() |
         (functions.instr(dataset['useragent_locale'], 'en') == 1)))

    if sentenceData.rdd.isEmpty():
        return dict()

    # Tokenize comments.
    tokenizer = Tokenizer(inputCol='user_comments', outputCol='words')
    wordsData = tokenizer.transform(sentenceData)

    # Remove duplicate words from comments.
    wordsData = wordsData.rdd.map(
        lambda p: (p['signature'], list(set(p['words'])))
    ).reduceByKey(lambda x, y: x + y).toDF(['signature', 'words'])

    if wordsData.rdd.isEmpty():
        print("[WARNING]: wordsData is empty, sentenceData wasn't.")
        return dict()

    # Clean comment words by removing punctuation and stemming.
    def clean_word(w):
        return re.sub('\,|\.|\;|\:|\;|\?|\!|\[|\]|\}|\{|\/|\\\\', '', stem(w.lower()))

    wordsData = wordsData.rdd.map(
        lambda p: (p['signature'], [clean_word(w) for w in p['words']])
    ).toDF(['signature', 'words'])

    # XXX: Useless with TF-IDF?
    remover = StopWordsRemover(inputCol='words', outputCol='filtered')
    cleanWordsData = remover.transform(wordsData)

    cv = CountVectorizer(inputCol='filtered', outputCol='features')
    model = cv.fit(cleanWordsData)
    featurizedData = model.transform(cleanWordsData)

    idf = IDF(inputCol='features', outputCol='tfidf_features')
    idfModel = idf.fit(featurizedData)
    rescaledData = idfModel.transform(featurizedData)

    bests_per_doc = rescaledData.filter(rescaledData.signature.isin(signatures)).rdd.map(
        lambda p: (p['signature'],
                   sorted(zip(p['tfidf_features'].indices, p['tfidf_features'].values),
                          key=lambda i: i[1], reverse=True)[:10])
    ).collect()

    return dict([(signature, [model.vocabulary[best] for best, val in bests])
                 for signature, bests in bests_per_doc])
def train_model_sentences_with_person(): sentences_with_person_collection = get_db_collection_object( 'SentencesWithPerson') with open("sentences_with_person.txt", "w", encoding='utf-8') as file_sentences_with_person: for sen in sentences_with_person_collection.find(): file_sentences_with_person.write('{0}\n'.format(sen['sentence'])) spark = SparkSession \ .builder \ .appName("SentenceProcessor") \ .getOrCreate() input_data = spark.sparkContext.textFile('./sentences_with_person.txt') prepared_data = input_data.map(lambda x: (x, len(x))) prepared_data = prepared_data.filter(lambda x: x[1] > 0) prepared_df = prepared_data.toDF().selectExpr('_1 as sentence', '_2 as length') # prepared_df.show(truncate=False) tokenizer = Tokenizer(inputCol="sentence", outputCol="words") words_data = tokenizer.transform(prepared_df) # words_data.show(truncate=False) # Отфильтровать токены, оставив только слова filtered_words_data = words_data.rdd.map( lambda x: (x[0], x[1], get_only_words(x[2]))) filtered_df = filtered_words_data.toDF().selectExpr( '_1 as sentence', '_2 as length', '_3 as words') # filtered_df.show() # Удалить стоп-слова (союзы, предлоги, местоимения и т.д.) stop_words = stopwords.words('russian') remover = StopWordsRemover(inputCol='words', outputCol='filtered', stopWords=stop_words) filtered = remover.transform(filtered_df) # normalize_words_data = filtered.rdd.map( lambda x: (x[0], x[1], x[2], normalization_sentence(x[3]))) normalized_df = normalize_words_data.toDF().selectExpr( '_1 as sentence', '_2 as length', '_3 as words', '_4 as normalize_words') # normalized_df.show() # vectorizer = CountVectorizer(inputCol='normalize_words', outputCol='raw_features').fit(normalized_df) featurized_data = vectorizer.transform(normalized_df) featurized_data.cache() # idf = IDF(inputCol='raw_features', outputCol='features') idf_model = idf.fit(featurized_data) rescaled_data = idf_model.transform(featurized_data) # Построить модель Word2Vec word2Vec = Word2Vec(vectorSize=300, minCount=0, inputCol='normalize_words', outputCol='result') doc2vec_pipeline = Pipeline(stages=[tokenizer, word2Vec]) model = word2Vec.fit(rescaled_data) w2v_df = model.transform(rescaled_data) # w2v_df.show(truncate=False) # print(model.findSynonyms('бочаров', 2).show()) # sc = spark.sparkContext path = './models/model_person' # # print(sc, path) model.write().overwrite().save(path) #m = Word2Vec.load('./models/model_person/') # pickle.dump(model, './models/model_person/mp.model') spark.stop()
# Step 1: split text field into words
tokenizer = Tokenizer(inputCol="text_clean", outputCol="text_token")
fulldata = tokenizer.transform(fulldata)
print("Tokenized Text:")
print(fulldata.head())
print("################")

# Step 2: compute term frequencies
hashingTF = HashingTF(inputCol="text_token", outputCol="tf", numFeatures=10000)
fulldata = hashingTF.transform(fulldata)
print("TERM frequencies:")
print(fulldata.head())
print("################")

# Step 3: compute inverse document frequencies
idf = IDF(inputCol="tf", outputCol="tf_idf")
idfModel = idf.fit(fulldata)
fulldata = idfModel.transform(fulldata)
print("IDF :")
print(fulldata.head())
print("################")

# OK, we do the same for the search term
# Step 1: split text field into words
tokenizer = Tokenizer(inputCol="search_term_clean", outputCol="search_token")
fulldata = tokenizer.transform(fulldata)
print("Tokenized Search:")
print(fulldata.head())
print("################")

# Step 2: compute term frequencies
hashingTF = HashingTF(inputCol="search_token", outputCol="tf_s",
    .load(os.path.realpath("Womens Clothing E-Commerce Reviews.csv"))

reviews = data.map(lambda x: x['Review Text']).filter(lambda x: x is not None)

tokens = reviews \
    .map(lambda document: document.strip().lower()) \
    .map(lambda document: re.split(" ", document)) \
    .map(lambda word: [x for x in word if x.isalpha()]) \
    .map(lambda word: [x for x in word if len(x) > 3]) \
    .map(lambda word: [x for x in word if x not in StopWords]) \
    .zipWithIndex()

# Build a DataFrame of (list_of_words, index) rows from the tokenized RDD
df_txts = sqlContext.createDataFrame(tokens, ["list_of_words", "index"])

# Term counts; the count column feeds the IDF stage below
cv = CountVectorizer(inputCol="list_of_words", outputCol="raw_features", vocabSize=3, minDF=2.0)
model = cv.fit(df_txts)
result_cv = model.transform(df_txts)
result_cv.show(truncate=False)

idf = IDF(inputCol="raw_features", outputCol="features")
idfModel = idf.fit(result_cv)
result_tfidf = idfModel.transform(result_cv)
def transform(spark, s3_input_data, s3_output_train_data, s3_output_validation_data, s3_output_test_data):
    print('Processing {} => {}, {}, {}'.format(s3_input_data, s3_output_train_data,
                                               s3_output_validation_data, s3_output_test_data))

    schema = StructType([
        StructField('marketplace', StringType(), True),
        StructField('customer_id', StringType(), True),
        StructField('review_id', StringType(), True),
        StructField('product_id', StringType(), True),
        StructField('product_parent', StringType(), True),
        StructField('product_title', StringType(), True),
        StructField('product_category', StringType(), True),
        StructField('star_rating', IntegerType(), True),
        StructField('helpful_votes', IntegerType(), True),
        StructField('total_votes', IntegerType(), True),
        StructField('vine', StringType(), True),
        StructField('verified_purchase', StringType(), True),
        StructField('review_headline', StringType(), True),
        StructField('review_body', StringType(), True),
        StructField('review_date', StringType(), True)
    ])

    df_csv = spark.read.csv(path=s3_input_data, sep='\t', schema=schema, header=True, quote=None)
    df_csv.show()

    # This dataset should already be clean, but always good to double-check
    print('Showing null review_body rows...')
    df_csv.where(col('review_body').isNull()).show()

    df_csv_cleaned = df_csv.na.drop(subset=['review_body'])
    df_csv_cleaned.where(col('review_body').isNull()).show()

    tokenizer = Tokenizer(inputCol='review_body', outputCol='words')
    wordsData = tokenizer.transform(df_csv_cleaned)

    hashingTF = HashingTF(inputCol='words', outputCol='raw_features', numFeatures=1000)
    featurizedData = hashingTF.transform(wordsData)

    # While applying HashingTF only needs a single pass to the data, applying IDF needs two passes:
    # 1) compute the IDF vector
    # 2) scale the term frequencies by IDF
    # Therefore, we cache the result of the HashingTF transformation above to speed up the 2nd pass
    featurizedData.cache()

    # spark.mllib's IDF implementation provides an option for ignoring terms
    # which occur in less than a minimum number of documents.
    # In such cases, the IDF for these terms is set to 0.
    # This feature can be used by passing the minDocFreq value to the IDF constructor.
    idf = IDF(inputCol='raw_features', outputCol='features')  # , minDocFreq=2)
    idfModel = idf.fit(featurizedData)
    features_df = idfModel.transform(featurizedData)
    features_df.select('star_rating', 'features').show()

    num_features = 300
    pca = PCA(k=num_features, inputCol='features', outputCol='pca_features')
    pca_model = pca.fit(features_df)
    pca_features_df = pca_model.transform(features_df).select('star_rating', 'pca_features')
    pca_features_df.show(truncate=False)

    standard_scaler = StandardScaler(inputCol='pca_features', outputCol='scaled_pca_features')
    standard_scaler_model = standard_scaler.fit(pca_features_df)
    standard_scaler_features_df = standard_scaler_model.transform(pca_features_df).select(
        'star_rating', 'scaled_pca_features')
    standard_scaler_features_df.show(truncate=False)

    expanded_features_df = (standard_scaler_features_df.withColumn(
        'f', to_array(col('scaled_pca_features'))).select(
            ['star_rating'] + [col('f')[i] for i in range(num_features)]))
    expanded_features_df.show()

    train_df, validation_df, test_df = expanded_features_df.randomSplit([0.9, 0.05, 0.05])

    train_df.write.csv(path=s3_output_train_data, header=None, quote=None)  # ,
    print('Wrote to output file: {}'.format(s3_output_train_data))

    validation_df.write.csv(path=s3_output_validation_data, header=None, quote=None)  # ,
    print('Wrote to output file: {}'.format(s3_output_validation_data))

    test_df.write.csv(path=s3_output_test_data, header=None, quote=None)
    print('Wrote to output file: {}'.format(s3_output_test_data))
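# The comments in transform() above mention IDF's minDocFreq option. A minimal,
# self-contained sketch (illustrative only, not part of the original pipeline)
# showing how terms appearing in fewer than 2 documents get an IDF weight of 0.
# It assumes a SparkSession named `spark` is available at module scope.
from pyspark.ml.feature import HashingTF, IDF, Tokenizer

demo_df = spark.createDataFrame(
    [(0, 'spark spark hadoop'), (1, 'spark flink'), (2, 'hadoop hive')],
    ['id', 'text'])
demo_words = Tokenizer(inputCol='text', outputCol='words').transform(demo_df)
demo_tf = HashingTF(inputCol='words', outputCol='raw_features', numFeatures=1000).transform(demo_words)
idf_min2 = IDF(inputCol='raw_features', outputCol='features', minDocFreq=2)
idf_min2.fit(demo_tf).transform(demo_tf).select('id', 'features').show(truncate=False)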
def main(root_path): timeStamp = str(int(time())) # todo change this for full run num = 1000 # 128915 is the total out_file_name = '../out/output-' + timeStamp + "-" + str(num) + '.txt' out_file = open(out_file_name, 'w') start = time() spark = init_spark() json_files = read_json_files(root_path, spark, num) data = get_body_text(spark, json_files) print("data reading done") # clean the data word_clean_up_F = F.udf(lambda x: clean_up(x), StringType()) data = data.withColumn("body_text_cleaned", word_clean_up_F("body_text")) data = data.select("body_text_cleaned") print("data processing done") tokenizer = Tokenizer(inputCol="body_text_cleaned", outputCol="words") token_DataFrame = tokenizer.transform(data) token_DataFrame = token_DataFrame.select("words") # Remove stopwords remover = StopWordsRemover(inputCol="words", outputCol="filtered") cleaned_DataFrame = remover.transform(token_DataFrame) cleaned_DataFrame = cleaned_DataFrame.select('filtered') # Count vectorizer cv_tmp = CountVectorizer(inputCol="filtered", outputCol="count_features") cvmodel = cv_tmp.fit(cleaned_DataFrame) count_dataframe = cvmodel.transform(cleaned_DataFrame) count_dataframe = count_dataframe.select('count_features') # TF-IDF Vectorizer tfidf = IDF(inputCol="count_features", outputCol="features") tfidfmodel = tfidf.fit(count_dataframe) tfidf_dataframe = tfidfmodel.transform(count_dataframe).select("features") print("Ready to fit with the LDA model") # Fit the LDA Model num_topics = 5 max_iterations = 20 lda_start = time() lda = LDA(seed=1, optimizer="em", k=num_topics, maxIter=max_iterations) lda_model = lda.fit(tfidf_dataframe) lda_transformed = lda_model.transform(tfidf_dataframe) lda_end = time() print("LDA complete") # joblib.dump(lda_model, 'lda.csv') # Get terms per topic topics = lda_model.topicsMatrix() vocabArray = cvmodel.vocabulary wordNumbers = 15 # number of words per topic topicIndices = lda_model.describeTopics(maxTermsPerTopic=wordNumbers).rdd.map(tuple) topics_final = topicIndices.map(lambda topic: topic_render(topic, wordNumbers, vocabArray)).collect() for topic in range(len(topics_final)): print("Topic " + str(topic) + ":") print("Topic " + str(topic) + ":", file=out_file) print(topics_final[topic]) print(topics_final[topic], file=out_file) print("Full runtime : {} min. ".format((time() - start) / 60)) print("LDA runtime : {} min. ".format((lda_end - lda_start) / 60)) print("Check" + out_file.name) cleaned_DataFrame.cache() lda_transformed.cache() # Data Visualization data = format_data_to_pyldavis(cleaned_DataFrame, cvmodel, lda_transformed, lda_model) print("Preparing data with pyLDAvis ...") filter_bad_docs(data) py_lda_prepared_data = pyLDAvis.prepare(**data) file_name = '../out/data-viz-' + timeStamp + '.html' print("Saving pyLDAvis html page ...") pyLDAvis.save_html(py_lda_prepared_data, file_name) pyLDAvis.show(py_lda_prepared_data) spark.stop()
nb_features = max(nb_features_train, nb_features_test)
print(nb_features)
nb_features = 5000

print("\nDone : Tokenization training and test sets")

###########################################################################
#########        TF IDF Training and Test Set                     #########

# training set
hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=nb_features)
featurizedData = hashingTF.transform(wordsData)
idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)
# rescaledData.select("label", "features").show()

# test_set
hashingTF_test = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=nb_features)
featurizedData_test = hashingTF_test.transform(wordsData_test)
# idf = IDF(inputCol="rawFeatures", outputCol="features")
# idfModel = idf.fit(featurizedData_test)
rescaledData_test = idfModel.transform(featurizedData_test)
rescaled_test_df = rescaledData_test.select("features")
# stemmer = LancasterStemmer()
# stemmer_udf = udf(
#     lambda tokens: [stemmer.stem(token) for token in tokens],
#     ArrayType(StringType())
# )
# data_df_stemmed = data_df_filtered.withColumn("wordsStemmed", stemmer_udf("words"))

# # hashing term frequency
# hashing_term_freq = \
#     HashingTF(inputCol="wordsStemmed", outputCol="featuresRaw", numFeatures=5000)
# data_df_tf = hashing_term_freq.transform(data_df_stemmed)

# hashing term frequency
hashing_term_freq = \
    HashingTF(inputCol="words", outputCol="featuresRaw", numFeatures=5000)
data_df_tf = hashing_term_freq.transform(data_df_filtered)

# inverse document frequency
inv_doc_freq = IDF(inputCol="featuresRaw", outputCol="features", minDocFreq=5)
inv_doc_freq_fitted = inv_doc_freq.fit(data_df_tf)
data_df_tfidf = inv_doc_freq_fitted.transform(data_df_tf)

# encode classes
indexer = StringIndexer(inputCol="category", outputCol="label")
indexer_fitted = indexer.fit(data_df_tfidf)
data_prepared_df = indexer_fitted.transform(data_df_tfidf)

# predict
log_reg_fitted = LogisticRegressionModel.load("output/reviews_model.model")
test_pred_df = log_reg_fitted.transform(data_prepared_df)
+-----+--------------------+--------------------+
"""

words_data.show(truncate=False)
"""
+-----+-----------------------------------+------------------------------------------+
|label|sentence                           |words                                     |
+-----+-----------------------------------+------------------------------------------+
|0.0  |Hi I heard about Spark             |[hi, i, heard, about, spark]              |
|0.0  |I wish java could use case classes |[i, wish, java, could, use, case, classes]|
|1.0  |Logistic regression models are neat|[logistic, regression, models, are, neat] |
+-----+-----------------------------------+------------------------------------------+
"""

hashing_tf = HashingTF(inputCol='words', outputCol='rawFeatures')
featurized_data = hashing_tf.transform(words_data)

idf = IDF(inputCol='rawFeatures', outputCol='features')
idf_model = idf.fit(featurized_data)
rescaled_data = idf_model.transform(featurized_data)

rescaled_data.select('label', 'features').show()
"""
+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(262144,[24417,49...|
|  0.0|(262144,[20719,24...|
|  1.0|(262144,[13671,91...|
+-----+--------------------+
"""

rescaled_data.select('label', 'features').show(truncate=False)
"""
+-----+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|label|features |
def text_clustering(dataFrame, k_value, w2v=False, w2v_value=None, seed=2137, normalize=True, plot=True): """ args: -dataFrame: spark Data Frame -k_value: number of clusters in k-means algorithm -w2v: if True word2Vec is used and w2v_value must be specified, otherwise tf-idf is used -w2v_value: number of parameters to be returned with Word2Vec -seed: seed -normalize: should normalization after Word2Vec be performed? -plot: if True, clusters are visualized with the use of PCA """ #Data preprocessing tokenizer = Tokenizer(inputCol="text", outputCol="words_raw") dataFrame = tokenizer.transform(dataFrame) remover = StopWordsRemover(inputCol="words_raw", outputCol="words") dataFrame = remover.transform(dataFrame) if w2v and w2v_value is None: raise ValueError('You have to give w2v_values parameter') if not w2v: #tf-idf hashingTF = HashingTF(inputCol="words_raw", outputCol="rawFeatures", numFeatures=20) featurizedData = hashingTF.transform(dataFrame) idf = IDF(inputCol="rawFeatures", outputCol="features") idfModel = idf.fit(featurizedData) memes_df = idfModel.transform(featurizedData) else: #word2vec word2Vec = Word2Vec(vectorSize=w2v_value, seed=seed, inputCol="words", outputCol="features_unnormalized") model_w2v = word2Vec.fit(dataFrame) memes_df = model_w2v.transform(dataFrame) model_w2v.write().overwrite().save("hdfs:///models/model_w2v") if normalize: scaler = StandardScaler(inputCol="features_unnormalized", outputCol="features", withStd=True, withMean=True) scalerModel = scaler.fit(memes_df) memes_df = scalerModel.transform(memes_df) #kmeans kmeans = KMeans(k=k_value, seed=seed) model_kmeans = kmeans.fit(memes_df) memes_df = model_kmeans.transform(memes_df) model_kmeans.write().overwrite().save("hdfs:///models/model_kmeans") #clustering evaluation evaluator = ClusteringEvaluator() silhouette = evaluator.evaluate(memes_df) centers = model_kmeans.clusterCenters() if plot: import matplotlib.pyplot as plt #virtual environment might have problems if imported "the classical" way #pca pca = PCA(k=2, inputCol="features", outputCol="pcaFeatures") model_pca = pca.fit(memes_df) memes_df = model_pca.transform(memes_df) #memes_df.show() centers_pca = [None] * len(centers) for i in range(len(centers)): centers_pca[i] = np.multiply(model_pca.pc.toArray().T, centers[i]).sum(axis=1) centers_pca = np.array(centers_pca) #plot section split_col = functions.split(memes_df["pcaFeatures"].cast(StringType()), ',') memes_df = memes_df.withColumn( 'x', translate(split_col.getItem(0), "[", "").cast(DoubleType())) memes_df = memes_df.withColumn( 'y', translate(split_col.getItem(1), "]", "").cast(DoubleType())) #memes_df.show(truncate = False) df = memes_df.toPandas() groups = df.groupby('prediction') fig, ax = plt.subplots() ax.margins(0.05) for name, group in groups: ax.plot(group.x, group.y, marker='o', linestyle='', ms=5, label=name) ax.text(centers_pca[name, 0], centers_pca[name, 1], s=name, fontsize=10) ax.legend() ax.title.set_text("k={0}, wn={1}, Silhouette={2}".format( k_value, w2v_value, silhouette)) plt.show() print("PCA, explained variance= {0}".format( model_pca.explainedVariance)) return memes_df
df_seg.show()

# Turn the segmented text into an ArrayType() column of words
tokenizer = Tokenizer(inputCol='seg', outputCol='words')
df_seg_arr = tokenizer.transform(df_seg).select('words', 'label')
df_seg_arr.show()

# Feature processing on the tokenized text
tf = HashingTF(numFeatures=1 << 18, binary=False, inputCol='words', outputCol='rawfeatures')
df_tf = tf.transform(df_seg_arr).select('rawfeatures', 'label')
df_tf.show()

idf = IDF(inputCol='rawfeatures', outputCol='features')
idfModel = idf.fit(df_tf)
df_tf_idf = idfModel.transform(df_tf)
df_tf_idf.show()

# Process the label column
stringIndexer = StringIndexer(inputCol='label', outputCol='indexed', handleInvalid='error')
indexer = stringIndexer.fit(df_tf_idf)
df_tf_idf_lab = indexer.transform(df_tf_idf).select('features', 'indexed')
df_tf_idf_lab.show()

# Split into training and test sets
splits = df_tf_idf_lab.randomSplit([0.7, 0.3], 123)
train = splits[0]
test = splits[1]
                        outputCol="bucketized_features")
bucketed_df = bucketizer.transform(b_df)
bucketed_df.show()

# Text tokenization
from pyspark.ml.feature import Tokenizer

sentence_df = spark.createDataFrame(
    [(1, "Introduction to Spark MLlib"),
     (2, "MLlib includes libraries for classification and regression"),
     (3, "Also supports pipelines")],
    ["id", "sentence"])

sent_tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
sent_tokenized_df = sent_tokenizer.transform(sentence_df)
print(sent_tokenized_df.show())

# TF-IDF
from pyspark.ml.feature import HashingTF, IDF

# TF
hashingTF = HashingTF(inputCol="words", outputCol="raw_features", numFeatures=20)
sent_hf_tf_df = hashingTF.transform(sent_tokenized_df)
print(sent_hf_tf_df.take(1))

# IDF
idf = IDF(inputCol="raw_features", outputCol="idf_features")
idf_model = idf.fit(sent_hf_tf_df)
tf_idf_df = idf_model.transform(sent_hf_tf_df)
"filtered").setOutputCol("features") model = word2vec.fit(train_set) train_set1 = model.transform(train_set) test_set1 = model.transform(test_set) # now use tf-idf hashingTF = HashingTF().setNumFeatures(1000).setInputCol( "filtered").setOutputCol("rawFeatures") idf = IDF().setInputCol("rawFeatures").setOutputCol( "features").setMinDocFreq(10) pipeline = Pipeline(stages=[hashingTF, idf]) train_set2 = pipeline.fit(train_set).transform(train_set) pipeline2 = Pipeline(stages=[hashingTF]) test_set2 = pipeline2.fit(train_set).transform( test_set) # use trainset idf to transform test set test_set2 = idf.fit(test_set2).transform(test_set2) # ====================================================================================== # Fit Model # ====================================================================================== def fit_nb(train): rf = RandomForestClassifier(numTrees=20, maxDepth=20, labelCol="label", seed=42) model = rf.fit(train) return model def get_predictions(model, test): result = model.transform( test.select('features')) # result is a DataFrame
# COMMAND ----------

from pyspark.ml.feature import HashingTF, IDF

tf = HashingTF()\
    .setInputCol("DescOut")\
    .setOutputCol("TFOut")\
    .setNumFeatures(10000)
idf = IDF()\
    .setInputCol("TFOut")\
    .setOutputCol("IDFOut")\
    .setMinDocFreq(2)


# COMMAND ----------

idf.fit(tf.transform(tfIdfIn)).transform(tf.transform(tfIdfIn)).show(10, False)


# COMMAND ----------

from pyspark.ml.feature import Word2Vec

# Input data: Each row is a bag of words from a sentence or document.
documentDF = spark.createDataFrame(
    [("Hi I heard about Spark".split(" "), ),
     ("I wish Java could use case classes".split(" "), ),
     ("Logistic regression models are neat".split(" "), )],
    ["text"])

# Learn a mapping from words to Vectors.
word2Vec = Word2Vec(vectorSize=3, minCount=0, inputCol="text", outputCol="result")
model = word2Vec.fit(documentDF)
                            minDF=10.0)
result_cv = count_vec.fit(refined_df).transform(refined_df)
# result_cv.select(['Clothing ID','refined_tokens','raw_features']).show(4,False)
vocabArray = count_vec.fit(refined_df).vocabulary

# Tf-idf
# from pyspark.ml.feature import HashingTF,IDF
# hashing_vec=HashingTF(inputCol='refined_tokens',outputCol='tf_features')
# hashing_df=hashing_vec.transform(refined_df)
# hashing_df.select(['refined_tokens','tf_features']).show(4,False)

# IDF is applied to the CountVectorizer output ('raw_features'),
# not the commented-out HashingTF column
tf_idf_vec = IDF(inputCol='raw_features', outputCol='features')
tf_idf_df = tf_idf_vec.fit(result_cv).transform(result_cv)
# tf_idf_df.select(['user_id','tf_idf_features']).show(4,False)
# tf_idf_df.cache()
tf_idf_df.persist(storageLevel=pyspark.StorageLevel.MEMORY_AND_DISK)

from pyspark.ml.clustering import LDA, LDAModel

num_topics = 10
max_iterations = 10

lda = LDA(k=num_topics, maxIter=max_iterations)
# lda_model = lda.fit(tf_idf_df[['index','features']].rdd.map(list))
model = lda.fit(tf_idf_df)
ll = model.logLikelihood(tf_idf_df)
Words_Rdd = tokenizer.transform(X_Rdd)

# Display the first ten rows of the DataFrame with
# three columns: label, message and the tokenized words
Words_Rdd.show(10)

# Apply CountVectorizer, which converts the word tokens
# into vectors of token counts.
token_counts = CountVectorizer(inputCol="words", outputCol="new_features")
model = token_counts.fit(Words_Rdd)
featurized_Rdd = model.transform(Words_Rdd)
featurized_Rdd.show(10)

# Apply Term Frequency-Inverse Document Frequency (TF-IDF)
idf = IDF(inputCol="new_features", outputCol="features")
idfModel = idf.fit(featurized_Rdd)
rescaled_Rdd = idfModel.transform(featurized_Rdd)
rescaled_Rdd.select("label", "features").show(10)

# Split the dataset into training data (80%) and testing data (20%)
# Set seed to zero for reproducibility
seed = 0
train_df, test_df = rescaled_Rdd.randomSplit([0.8, 0.2], seed)

# Number of records of each dataframe
train_df.count()
test_df.count()

# Fit the Naive Bayes classifier
nb = NaiveBayes()
paramGrid_nb = ParamGridBuilder().addGrid(nb.smoothing, np.linspace(0.3, 10, 5)).build()
# After the tokenizer, we added one column (Words) to the table
words_Data = tokenizer_word.transform(reviews)
# words_Data.show(5)

# Count the number of words
countvectorizer = CountVectorizer(inputCol="Words", outputCol="raw_features")
model = countvectorizer.fit(words_Data)

# Add the counts to our table
get_count_data = model.transform(words_Data)
# get_count_data.show(5)

# Calculate TF-IDF
idf_value = IDF(inputCol="raw_features", outputCol="idf_value")
idf_model = idf_value.fit(get_count_data)
final_rescaled_data = idf_model.transform(get_count_data)
# final_rescaled_data.show(5)
# final_rescaled_data.select("idf_value").show()

# Vocabulary list
vocabulary = model.vocabulary


# Block 3
def extract(value):
    return {
        vocabulary[i]: float(tfidf_value)
        for (i, tfidf_value) in zip(value.indices, value.values)
    }
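# A short usage sketch for extract() above: wrap it in a UDF to turn each
# sparse TF-IDF vector into a {term: weight} map column. The MapType schema
# and the "term_weights" column name are assumptions for illustration.
from pyspark.sql.functions import udf
from pyspark.sql.types import FloatType, MapType, StringType

extract_udf = udf(extract, MapType(StringType(), FloatType()))
final_rescaled_data.withColumn("term_weights", extract_udf("idf_value")) \
    .select("term_weights").show(5, truncate=False)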
# Merge product with words
fulldata = sqlContext.createDataFrame(fulldata.rdd.map(enlargeTokenAndClean))
print("words enlarge with desc and title")
print(fulldata.head())
print("################")

# Step 2: compute term frequencies
hashingTF = HashingTF(inputCol="wordsF", outputCol="tf")
fulldata = hashingTF.transform(fulldata)
print("TERM frequencies:")
print(fulldata.head())
print("################")

# Step 3: compute inverse document frequencies
idf = IDF(inputCol="tf", outputCol="tf_idf")
idfModel = idf.fit(fulldata)
fulldata = idfModel.transform(fulldata)
print("IDF :")
print(fulldata.head())
print("################")

# Step 4: new features column / rename old
fulldata = sqlContext.createDataFrame(fulldata.rdd.map(addFeatureLen))
fulldata = sqlContext.createDataFrame(fulldata.rdd.map(newFeatures))
print("NEW features column :")
print(fulldata.head())
print("################")

# Step 5: ALTERNATIVE -> ADD column with number of terms as another feature
# fulldata = sqlContext.createDataFrame(fulldata.rdd.map(
def main(): sc = SparkSession.builder.appName("SentencingAnalyzer")\ .config("spark.driver.memory", "10G")\ .getOrCreate() # main df cases = sc.read.json("../data/sentencingCases2.jsonl") df = cleanDf(cases) # read categorized csv categorizedCsv = sc.read.csv("../data/categorized.csv", header=True) categorizedCsv = categorizedCsv.select( 'caseName', f.split(f.col("type"), " - ").alias('offenseType'), 'duration1', 'sentenceType1') # create the search df df = extractOffenseKeywords(df) df.cache() dfSearch = sc.createDataFrame(searchData, ["term", "offenseKeywords"]) # CLASSIFICATION OF OFFENSE hashingTF = HashingTF(inputCol="offenseKeywords", outputCol="rawFeatures", numFeatures=1000) result = hashingTF.transform(df) resultSearch = hashingTF.transform(dfSearch) idf = IDF(inputCol="rawFeatures", outputCol="features") idfModel = idf.fit(result) rescaledData = idfModel.transform(result).filter( f.size('offenseKeywords') > 0) idfModelSearch = idf.fit(resultSearch) rescaledDataSearch = idfModelSearch.transform(resultSearch) mh = MinHashLSH(inputCol="features", outputCol="hashes", seed=12345, numHashTables=20) modelMH = mh.fit(rescaledData) transformedData = modelMH.transform(rescaledData) modelMHSearch = mh.fit(rescaledDataSearch) transformedDataSearch = modelMH.transform(rescaledDataSearch) categorizedDf = modelMHSearch.approxSimilarityJoin( transformedDataSearch, transformedData, 0.89, distCol="JaccardDistance") distanceDf = categorizedDf.select([f.col('datasetA.term')] + [f.col('datasetB.caseID')] + [f.col("JaccardDistance")]) \ .orderBy('caseID', 'JaccardDistance') distanceDf = distanceDf.groupBy('caseID').agg( f.collect_list('term').alias('predictedOffences'), f.collect_list('JaccardDistance').alias('JaccardDistances')) distanceDf.cache() distanceDf.show() # EVALUATE CATEGORIZATION AGAINST MANUAL CATEGORIZATION distanceDfEval = distanceDf.join( categorizedCsv, distanceDf.caseID == categorizedCsv.caseName) distanceDfEval = distanceDfEval.filter( distanceDfEval.offenseType[0] != "N/A").filter( distanceDfEval.offenseType[0] != "multiple party sentence") calcuateDifferenceInPredictedVsActualOffences_udf = f.udf( calcuateDifferenceInPredictedVsActualOffences, FloatType()) distanceDfEval = distanceDfEval.withColumn( "error", calcuateDifferenceInPredictedVsActualOffences_udf( distanceDfEval.predictedOffences, distanceDfEval.offenseType)) calcuateDifferenceInPredictedVsActualOffencesPercentage_udf = f.udf( calcuateDifferenceInPredictedVsActualOffencesPercentage, FloatType()) distanceDfEval = distanceDfEval.withColumn( "pctCorrect", calcuateDifferenceInPredictedVsActualOffencesPercentage_udf( distanceDfEval.predictedOffences, distanceDfEval.offenseType)) distanceDfEval.select('caseID', 'predictedOffences', 'offenseType', 'JaccardDistances', 'error', 'pctCorrect').show(200, truncate=False) rmse = (distanceDfEval.groupBy().agg(f.sum('error')).collect()[0][0] / distanceDfEval.count())**(1.0 / 2) print("Offense category RMSE:", rmse) pctCorrectOffense = (distanceDfEval.groupBy().agg( f.sum('pctCorrect')).collect()[0][0] / distanceDfEval.count()) * 100 print("Percentage of offenses correctly categorized: ", pctCorrectOffense)
def login(): message = '' e_result = '' s_result = '' t_result = '' j_result = '' if request.method == 'POST': post = request.form.get('text') # access the data inside if len(post) >= 100: test = pd.DataFrame([post], columns=['post']) newrows = [] def filter_text(post): """Decide whether or not we want to use the post.""" # should remove link only posts here return len(post) > 0 reg_punc = re.compile('[%s]' % re.escape(string.punctuation)) def preprocess_text(post): """Remove any junk we don't want to use in the post.""" # Remove links post = re.sub(r'http\S+', '', post, flags=re.MULTILINE) # All lowercase post = post.lower() # Remove puncutation post = reg_punc.sub('', post) return post def create_new_rows(row): posts = row['post'] rows = [] # for p in posts: p = preprocess_text(posts) rows.append({'post': p}) return rows for index, row in test.iterrows(): newrows += create_new_rows(row) test = pd.DataFrame(newrows) df = spark.createDataFrame(test) # Create a length column to be used as a future feature df = df.withColumn('length', length(df['post'])) types = [ 'INFJ', 'ENTP', 'INTP', 'INTJ', 'ENTJ', 'ENFJ', 'INFP', 'ENFP', 'ISFP', 'ISTP', 'ISFJ', 'ISTJ', 'ESTP', 'ESFP', 'ESTJ', 'ESFJ' ] types = [x.lower() for x in types] tokenizer = Tokenizer(inputCol="post", outputCol="words") tokenized = tokenizer.transform(df) # Remove stop words stopwordList = types stopwordList.extend(StopWordsRemover().getStopWords()) stopwordList = list(set(stopwordList)) #optionnal remover = StopWordsRemover(inputCol="words", outputCol="filtered", stopWords=stopwordList) newFrame = remover.transform(tokenized) # Run the hashing term frequency hashing = HashingTF(inputCol="filtered", outputCol="hashedValues") # Transform into a DF hashed_df = hashing.transform(newFrame) # Fit the IDF on the data set idf = IDF(inputCol="hashedValues", outputCol="idf_token") idfModel = idf.fit(hashed_df) rescaledData = idfModel.transform(hashed_df) # Create feature vectors #idf = IDF(inputCol='hash_token', outputCol='idf_token') clean_up = VectorAssembler(inputCols=['idf_token', 'length'], outputCol='features') output = clean_up.transform(rescaledData) ei_model = NaiveBayesModel.load("static/models/EI_Predictor.h5") sn_model = NaiveBayesModel.load("static/models/SN_Predictor.h5") tf_model = NaiveBayesModel.load("static/models/TF_Predictor.h5") jp_model = NaiveBayesModel.load("static/models/JP_Predictor.h5") test_e = ei_model.transform(output) e = test_e.toPandas()["prediction"].values[0] if e == 0: e_result = "I" else: e_result = "E" test_s = sn_model.transform(output) s = test_s.toPandas()["prediction"].values[0] if s == 0: s_result = "N" else: s_result = "S" test_t = tf_model.transform(output) t = test_t.toPandas()["prediction"].values[0] if t == 0: t_result = "F" else: t_result = "T" test_j = jp_model.transform(output) j = test_j.toPandas()["prediction"].values[0] if j == 0: j_result = "P" else: j_result = "J" else: message = "Please tell us more about yourself!" return render_template('index.html', message=message, test_e=e_result, test_s=s_result, test_t=t_result, test_j=j_result)
text = remover.transform(text)
text.show(5)

ngramer = NGram(n=2, inputCol='filtered_words', outputCol='ngrams')
text = ngramer.transform(text)
text.show(5)

count_vec = CountVectorizer(inputCol=ngramer.getOutputCol(), outputCol='ft_features')
count_vec_model = count_vec.fit(text)
vocab = count_vec_model.vocabulary
text = count_vec_model.transform(text)
text.show(5)

idf = IDF(inputCol=count_vec.getOutputCol(), outputCol='features')
text = idf.fit(text).transform(text)

lda = LDA(featuresCol=idf.getOutputCol(), k=5, maxIter=10)
lda_model = lda.fit(text)
topics = lda_model.describeTopics()
# topics_words = topics.rdd.map(lambda x: x['termIndices']).map(lambda x: [vocab[i] for i in x]).collect()
get_topics_words = F.udf(lambda x: [vocab[i] for i in x], ArrayType(StringType()))
topics = topics.withColumn('topic_words', get_topics_words(F.col('termIndices')))
topics.show()

text = lda_model.transform(text)
text.show(5)
'''
spark_df = spark_df.drop('count')
spark_df = spark_df.selectExpr("predicted_category as predicted_category_table",
                               "predict_score as predict_score")

# Tokenizing and Vectorizing
tok = Tokenizer(inputCol="cleaned_hm", outputCol="words")
review_tokenized = tok.transform(spark_train)

stopword_rm = StopWordsRemover(inputCol='words', outputCol='words_nsw')
review_tokenized = stopword_rm.transform(review_tokenized)

cv = CountVectorizer(inputCol='words_nsw', outputCol='tf')
cvModel = cv.fit(review_tokenized)
count_vectorized = cvModel.transform(review_tokenized)

idf_ngram = IDF().setInputCol('tf').setOutputCol('tfidf')
tfidfModel_ngram = idf_ngram.fit(count_vectorized)
tfidf_df = tfidfModel_ngram.transform(count_vectorized)

word_indexer_pc = StringIndexer(inputCol="predicted_category",
                                outputCol="predicted_category_new",
                                handleInvalid="error")

# Splitting the training data into training data and validation data
splits = tfidf_df.randomSplit([0.8, 0.2], seed=100)
train = splits[0]
val = splits[1]

# Building the pipeline for the model
hm_assembler = VectorAssembler(inputCols=["tfidf"], outputCol="features")
lr = LogisticRegression(maxIter=20, regParam=0.3, elasticNetParam=0,
                        labelCol="predicted_category_new", featuresCol="features")
hm_pipeline = Pipeline(stages=[hm_assembler, word_indexer_pc, lr])

# To get the best parameter values using CrossValidator
from pyspark.sql import SparkSession

spark = SparkSession \
    .builder \
    .appName("tf_idf_sample") \
    .master("local[*]") \
    .getOrCreate()

df1 = spark.createDataFrame([
    (0, "a a a b b c"),
    (0, "a b c"),
    (1, "a c a a d")]).toDF("label", "sentence")

# Split each sentence into words
tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
df2 = tokenizer.transform(df1)

hashingTF = HashingTF(inputCol="words", outputCol="TF-Features", numFeatures=20)
df3 = hashingTF.transform(df2)
df3.cache()

idf = IDF(inputCol="TF-Features", outputCol="Final-Features")
idfModel = idf.fit(df3)
rescaledData = idfModel.transform(df3)
rescaledData.select("words", "TF-Features", "Final-Features").show()

spark.stop()
def add_tfidf(df):
    idf = IDF(inputCol="tf_vector", outputCol="tfidf_vector")
    idf_model = idf.fit(df)
    df_tfidf = idf_model.transform(df)
    return df_tfidf
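# A minimal usage sketch for add_tfidf(): the input DataFrame is assumed to
# already carry a term-frequency column named "tf_vector". `docs_df`, the
# "text"/"words" column names, and the feature count are illustrative
# assumptions, not taken from the original code.
from pyspark.ml.feature import HashingTF, Tokenizer

tokenizer = Tokenizer(inputCol="text", outputCol="words")
hashing_tf = HashingTF(inputCol="words", outputCol="tf_vector", numFeatures=1 << 18)
tf_df = hashing_tf.transform(tokenizer.transform(docs_df))
tfidf_df = add_tfidf(tf_df)
tfidf_df.select("tfidf_vector").show(3, truncate=False)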
def lda_optimal(self, preprocess_file=DEFAULT_PREPROCESSING_OUTPUT, cluster_df=CLUSTER_DF, maxiter=MAXITER, output_file_name=DEFAULT_OUTPUT_FILE, max_term_tagging=m): filter_number_udf = udf( lambda row: [x for x in row if not self.is_digit(x)], ArrayType(StringType())) temp = sqlContext.read.parquet(preprocess_file) temp = temp.withColumn('no_number_vector_removed', filter_number_udf(col('vector_no_stopw'))) temp1 = temp.select(temp.paper_id, explode(temp.no_number_vector_removed)) temp2 = temp1.filter(temp1.col != "") temp3 = temp2.groupby("paper_id").agg( F.collect_list("col").alias("vector_removed")) inner_join = temp3.join(temp, ["paper_id"]) windowSpec = Window.orderBy(F.col("paper_id")) df_final = inner_join.withColumn("id", F.row_number().over(windowSpec)) df_txts = df_final.select("vector_removed", "id", "paper_id", "doi", "title", "authors", "abstract", "abstract_summary", "vector_no_stopw") df = sqlContext.read.format("com.databricks.spark.csv").option( "header", "true").option("inferschema", "true").option("mode", "DROPMALFORMED").load("CLUSTER_DF") df_txts = df.join(df_txts, "paper_id" == "index") # TF cv = CountVectorizer(inputCol="vector_removed", outputCol="raw_features", vocabSize=5000, minDF=5.0) cvmodel = cv.fit(df_txts) result_cv = cvmodel.transform(df_txts) # IDF idf = IDF(inputCol="raw_features", outputCol="features") idfModel = idf.fit(result_cv) result_tfidf = idfModel.transform(result_cv) from pyspark.sql import SparkSession from pyspark.sql.types import StructType, StructField, StringType spark = SparkSession.builder.appName( 'SparkByExamples.com').getOrCreate() schema = StructType([ StructField('cluster_id', StringType(), True), StructField('tagging', ArrayType(), True) ]) topic_modeling = spark.createDataFrame(spark.sparkContext.emptyRDD(), schema) distinct_clusters = result_tfidf.select( "cluster_id").distinct().sorted().collect_list() for i in distinct_clusters: subset = result_tfidf.filter(result_tfidf.cluster_id == i) lda = LDA(k=1, maxIter=100) ldaModel = lda.fit(result_subset) output = ldaModel.transform(result_tfidf) if (i == 0): full_df = output else: full_df = full_df.union(output) topics = ldaModel.describeTopics(maxTermsPerTopic=m) vocabArray = cvmodel.vocabulary ListOfIndexToWords = udf( lambda wl: list([vocabArray[w] for w in wl])) FormatNumbers = udf(lambda nl: ["{:1.4f}".format(x) for x in nl]) taggings = topics.select( ListOfIndexToWords(topics.termIndices).alias('words')) temp = spark.createDataFrame([(i, taggings)], ['cluster_id', 'taggings']) topic_modeling = topic_modeling.union(temp) # output the taggings of each topic topic_modeling.to_csv(output_file_name) return full_df
def makeWord2VecModel(): cursor = News.find({}) text = "" for news in cursor: text += news['text'] with open(os.path.join(os.getcwd(), 'word2Vec.txt'), 'w', encoding='utf-8') as inputFile: inputFile.writelines(text) spark = SparkSession.builder.appName("SimpleApplication").getOrCreate() # Построчная загрузка файла в RDD input_file = spark.sparkContext.textFile('word2Vec.txt') print(input_file.collect()) prepared = input_file.map(lambda x: ([x])) df = prepared.toDF() prepared_df = df.selectExpr('_1 as text') # Разбить на токены tokenizer = Tokenizer(inputCol='text', outputCol='words') words = tokenizer.transform(prepared_df) # Удалить стоп-слова stop_words = StopWordsRemover.loadDefaultStopWords('russian') remover = StopWordsRemover(inputCol='words', outputCol='filtered', stopWords=stop_words) filtered = remover.transform(words) # Вывести стоп-слова для русского языка print(stop_words) # Вывести таблицу filtered filtered.show() # Вывести столбец таблицы words с токенами до удаления стоп-слов words.select('words').show(truncate=False, vertical=True) # Вывести столбец "filtered" таблицы filtered с токенами после удаления стоп-слов filtered.select('filtered').show(truncate=False, vertical=True) # Посчитать значения TF vectorizer = CountVectorizer(inputCol='filtered', outputCol='raw_features').fit(filtered) featurized_data = vectorizer.transform(filtered) featurized_data.cache() vocabulary = vectorizer.vocabulary # Вывести таблицу со значениями частоты встречаемости термов. featurized_data.show() # Вывести столбец "raw_features" таблицы featurized_data featurized_data.select('raw_features').show(truncate=False, vertical=True) # Вывести список термов в словаре print(vocabulary) # Посчитать значения DF idf = IDF(inputCol='raw_features', outputCol='features') idf_model = idf.fit(featurized_data) rescaled_data = idf_model.transform(featurized_data) # Вывести таблицу rescaled_data rescaled_data.show() # Вывести столбец "features" таблицы featurized_data rescaled_data.select('features').show(truncate=False, vertical=True) # Построить модель Word2Vec word2Vec = Word2Vec(vectorSize=3, minCount=0, inputCol='words', outputCol='result') model = word2Vec.fit(words) w2v_df = model.transform(words) w2v_df.show() persons = [] cPersons = db.Persones.find({}) for secName in cPersons: persons.append(secName['sName']) synonyms = [] i = 0 synonyms.append(model.findSynonyms('погибла', 2)) for word, cosine_distance in synonyms: print(str(word)) spark.stop()
def trainModel(self): logger.info("Training the model...") query = '''select page_id, max(page_title) as page_title from cooladata where date_range(all) and page_id is not null group by page_id;''' def SQLtoURL(query): data = query.replace('\n', ' ').replace('\t',' ').replace(' ',' ').replace(' ',' ') return data def QueryXXXXX(query, file = None): session = Session() response = session.post(data = {'tq': query,}, url = 'https://app.XXXXXX.com/api/v2/projects/115659/cql/', headers = {'Authorization': 'Token dtQvPVejNcSebX1EkU0AqB2TJRXznIgZiDvDu3HR'},) return response.content table = json.loads(codecs.decode(QueryCoola(SQLtoURL(query)),'utf-8'))['table'] title_list = [x['c'] for x in table['rows']] table_cols = [d['label'] for d in table['cols']] def convert_row(row): rowlist = [d['v'] for d in row] return rowlist rd = self.sc.parallelize(title_list).map(convert_row) titleData = self.spark.createDataFrame(rd, table_cols) titleData = titleData.dropna() hebrew_stopwords = stop_words() def rmv(words): for punc in punctuation: words = words.replace(punc,"") for hword in hebrew_stopwords: words = words.replace(hword, " ") return words self.spark.udf.register("rmv", rmv, StringType()) titleData.registerTempTable("wordstable") cleanedSentenceData = self.spark.sql("select page_id, page_title, rmv(page_title) as cleanedSentence from wordstable") tokenizer = Tokenizer(inputCol="cleanedSentence", outputCol="words") wordsData = tokenizer.transform(cleanedSentenceData) cv = CountVectorizer(inputCol="words", outputCol="rawFeatures", minDF = 2.0) cvModel = cv.fit(wordsData) featurizedData = cvModel.transform(wordsData) idf = IDF(inputCol="rawFeatures", outputCol="features") idfModel = idf.fit(featurizedData) rescaledData = idfModel.transform(featurizedData) lda = LDA(k=100) ldaModel = lda.fit(rescaledData) postFactorizedData = ldaModel.transform(rescaledData) norm = Normalizer(inputCol = "topicDistribution", outputCol="normTopicDist") scaledFactorizedNormalizedData = norm.transform(postFactorizedData) self.model = scaledFactorizedNormalizedData logger.info("model is built!")
    review_text = BeautifulSoup(raw_review).text
    #
    # 2. Remove non-letters
    letters_only = re.sub("[^a-zA-Z]", " ", review_text)
    #
    # 3. Convert to lower case, split into individual words
    words = letters_only.lower().split()
    #
    # 4. Remove stop words
    meaningful_words = [w for w in words if w not in stops]
    #
    # 5. Join the words back into one string separated by space,
    # and return the result.
    return " ".join(meaningful_words)


stops = set(stopwords.words("english"))
lines = sc.textFile("s3://spark-project-data/labeledTrainData.tsv")
# skip the header row (index 0)
rows = lines.zipWithIndex().filter(lambda row_index: row_index[1] > 0).keys()
parts = rows.map(lambda l: l.split("\t"))
review = parts.map(lambda p: Row(id=p[0], label=float(p[1]), review=review_to_words(p[2])))
schemeReview = sqlContext.createDataFrame(review)

tokenizer = Tokenizer(inputCol="review", outputCol="words")
wordsData = tokenizer.transform(schemeReview)
hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=300)
featurizedData = hashingTF.transform(wordsData)
idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)
selectData = rescaledData.select("label", "features")
def main(sc, sqlContext):
    start = timer()

    stpwrds = stopwords.words('english')
    tbl_translate = dict.fromkeys(i for i in xrange(sys.maxunicode)
                                  if unicodedata.category(unichr(i)).startswith('S')
                                  or unicodedata.category(unichr(i)).startswith('P')
                                  or unicodedata.category(unichr(i)).startswith('N'))

    print '---Fetching products---'
    start_i = timer()
    productRDD = sc.parallelize(findProductsByCategory([]))
    print '####took %d seconds' % (timer()-start_i)

    print '---Building the corpus---'
    start_i = timer()
    corpusRDD = (productRDD.map(lambda s: (s[0], word_tokenize(s[1].translate(tbl_translate).lower()), s[2], s[3]))
                           .map(lambda s: (s[0], [PorterStemmer().stem(x) for x in s[1] if x not in stpwrds], s[2], s[3]))
                           .map(lambda s: (s[0], [x[0] for x in pos_tag(s[1]) if x[1] == 'NN' or x[1] == 'NNP'], s[2], s[3]))
                           .cache())
    print '####took %d seconds' % (timer()-start_i)

    print '---Fetching and persisting category and token data---'
    start_i = timer()
    tokens = corpusRDD.flatMap(lambda x: x[1]).distinct().collect()
    numTokens = len(tokens)
    category = productRDD.map(lambda x: x[2]).distinct().collect()
    categoryAndSubcategory = productRDD.map(lambda x: (x[2], x[3])).distinct().collect()
    insertTokensAndCategories(tokens, category, categoryAndSubcategory)
    print '####took %d seconds' % (timer()-start_i)

    print '---Computing TF-IDF for the products---'
    start_i = timer()
    wordsData = corpusRDD.map(lambda s: Row(label=s[0], words=s[1], category=s[2], subcategory=s[3]))
    # persist this so it does not have to be recomputed at prediction time
    wordsDataDF = sqlContext.createDataFrame(wordsData)
    # persisting for prediction
    wordsDataForPrediction = corpusRDD.map(lambda s: Row(label=s[0], words=s[1], type=s[2]))
    wordsDataForPredictionDF = sqlContext.createDataFrame(wordsDataForPrediction)

    if os.path.exists("/home/ubuntu/recsys-tcc-ml/parquet/wordsDataDF.parquet"):
        shutil.rmtree("/home/ubuntu/recsys-tcc-ml/parquet/wordsDataDF.parquet")

    wordsDataForPredictionDF.write.parquet("/home/ubuntu/recsys-tcc-ml/parquet/wordsDataDF.parquet")

    hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=numTokens)
    idf = IDF(inputCol="rawFeatures", outputCol="features")
    featurizedData = hashingTF.transform(wordsDataDF)
    idfModel = idf.fit(featurizedData)
    rescaledData = idfModel.transform(featurizedData)

    #VSM = rescaledData.map(lambda t: LabeledPoint(categoryAndSubcategory.index((t.category, t.subcategory)), t.features))
    VSM = rescaledData.map(lambda t: LabeledPoint(category.index(t.category), t.features))
    VSMTrain, VSMTest = VSM.randomSplit([8, 2], seed=0L)
    print '####took %d seconds' % (timer()-start_i)

    print '---Building the Naive Bayes model---'
    start_i = timer()
    model = NaiveBayes.train(VSMTrain)

    if os.path.exists("/home/ubuntu/recsys-tcc-ml/models/naivebayes/modelo_categoria"):
        shutil.rmtree("/home/ubuntu/recsys-tcc-ml/models/naivebayes/modelo_categoria")

    model.save(sc, '/home/ubuntu/recsys-tcc-ml/models/naivebayes/modelo_categoria')
    print '####took %d seconds' % (timer()-start_i)

    print '---Testing the Naive Bayes model---'
    start_i = timer()
    # note: labels above were built with category.index(...), so indexing categoryAndSubcategory
    # here only lines up with the commented-out VSM line, not the active one
    prediction = VSMTest.map(lambda p: (categoryAndSubcategory[int(model.predict(p.features))], categoryAndSubcategory[int(p.label)]))
    accuracy = float(prediction.filter(lambda (x, v): x[0] == v[0]).count()) / float(prediction.count())
    print 'accuracy of %f' % accuracy
    print '####took %d seconds' % (timer()-start_i)

    print '---Fetching the posts---'
    start_i = timer()
    posts = list()
    wb = load_workbook(filename='/home/ubuntu/recsys-tcc-ml/base_sentimentos.xlsx')
    sheet = wb['Menes']
    for row in sheet.iter_rows(row_offset=1):
        post = list()
        for cell in row:
            if cell.value is None:
                break
            post.append(1 if cell.value == 'Positive' or cell.value == 'Neutral'
                        else 0 if cell.value == 'Negative'
                        else removeAccents(cell.value))
        if len(post) > 0:
            posts.append(tuple(post))
    print '####took %d seconds' % (timer()-start_i)

    print '---Building the corpus---'
    start_i = timer()
    postsRDD = sc.parallelize(posts)
    postCorpusRDD = (postsRDD.map(lambda s: (s[1], word_tokenize(s[0].translate(tbl_translate).lower())))
                             .map(lambda s: (s[0], [PorterStemmer().stem(x) for x in s[1] if x not in stpwrds]))
                             .map(lambda s: (s[0], [x[0] for x in pos_tag(s[1]) if x[1] == 'NN' or x[1] == 'NNP']))
                             .cache())
    print '####took %d seconds' % (timer()-start_i)

    print '---Computing TF-IDF for the posts---'
    start_i = timer()
    wordsData = postCorpusRDD.map(lambda s: Row(label=s[0], words=s[1]))
    wordsDataDF = sqlContext.createDataFrame(wordsData)

    hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=numTokens)
    idf = IDF(inputCol="rawFeatures", outputCol="features")
    featurizedData = hashingTF.transform(wordsDataDF)
    idfModel = idf.fit(featurizedData)
    rescaledData = idfModel.transform(featurizedData)

    VSM = rescaledData.map(lambda t: LabeledPoint(t.label, t.features))
    VSMTrain, VSMTest = VSM.randomSplit([8, 2], seed=0L)
    print '####took %d seconds' % (timer()-start_i)

    print '---Building the SVM model---'
    start_i = timer()
    model = SVMWithSGD.train(VSMTrain, iterations=100)

    if os.path.exists("/home/ubuntu/recsys-tcc-ml/models/svm"):
        shutil.rmtree("/home/ubuntu/recsys-tcc-ml/models/svm")

    model.save(sc, "/home/ubuntu/recsys-tcc-ml/models/svm")

    print '---Testing the SVM model---'
    start_i = timer()
    prediction = VSMTest.map(lambda p: (p.label, model.predict(p.features)))
    # the original counted mismatches (v != p), i.e. the error rate, but printed it as accuracy;
    # fixed to count matches so the printed value really is accuracy
    accuracy = prediction.filter(lambda (v, p): v == p).count() / float(prediction.count())
    print 'accuracy of %f' % accuracy
    print '####took %d seconds' % (timer()-start_i)

    print 'The whole process took %d seconds' % (timer()-start)
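# removeAccents(...) used above is defined elsewhere in this project; a minimal sketch of what
# such a helper typically looks like (an assumption, not the original implementation):
import unicodedata

def removeAccents(text):
    # decompose accented characters and drop the combining marks
    normalized = unicodedata.normalize('NFKD', text)
    return u''.join(c for c in normalized if not unicodedata.combining(c))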
from pyspark.sql import SparkSession

spark = SparkSession \
    .builder \
    .appName("tf_idf_sample") \
    .master("local[*]") \
    .getOrCreate()

df1 = spark.createDataFrame([
    (0, "a a a b b c"),
    (0, "a b c"),
    (1, "a c a a d")]).toDF("label", "sentence")

tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
# split each sentence into words
df2 = tokenizer.transform(df1)

hashingTF = HashingTF(inputCol="words", outputCol="TF-Features", numFeatures=20)
df3 = hashingTF.transform(df2)
df3.cache()

idf = IDF(inputCol="TF-Features", outputCol="Final-Features")
idfModel = idf.fit(df3)

rescaledData = idfModel.transform(df3)
rescaledData.select("words", "TF-Features", "Final-Features").show()

spark.stop()
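# Not part of the original sample: the same toy data with CountVectorizer instead of HashingTF,
# which keeps an explicit vocabulary (no hash collisions) at the cost of an extra fit() pass.
# This sketch assumes it runs before spark.stop() above.
from pyspark.ml.feature import CountVectorizer

cv = CountVectorizer(inputCol="words", outputCol="TF-Features", vocabSize=20)
cvModel = cv.fit(df2)  # learn the vocabulary from the tokenized column
df3cv = cvModel.transform(df2)
idfModelCv = IDF(inputCol="TF-Features", outputCol="Final-Features").fit(df3cv)
idfModelCv.transform(df3cv).select("words", "Final-Features").show(truncate=False)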
)  # closes a print(...) banner that is truncated above this snippet

train.filter(F.col('toxic') == 1).show(5)
print('2============================================================================================')

tokenizer = Tokenizer(inputCol="comment_text", outputCol="words")
wordsData = tokenizer.transform(train)

hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures")
tf = hashingTF.transform(wordsData)
tf.select('rawFeatures').take(2)

idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(tf)
tfidf = idfModel.transform(tf)
tfidf.select("features").first()

REG = 0.1
lr = LogisticRegression(featuresCol="features", labelCol='toxic', regParam=REG)

print('5============================================================================================')
tfidf.show(5)
print('5============================================================================================')
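# Not part of the original snippet: lr is defined above but never fitted. A minimal hedged
# continuation that fits it on the TF-IDF frame and extracts the probability of the positive
# ('toxic') class; the UDF and output column names are assumptions for illustration.
from pyspark.sql.types import DoubleType

extract_prob = F.udf(lambda v: float(v[1]), DoubleType())  # probability of class 1

lrModel = lr.fit(tfidf)
scored = lrModel.transform(tfidf)
scored.select(extract_prob("probability").alias("p_toxic"), "prediction").show(5)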
def main(inputs):
    amazon_schema = types.StructType([
        types.StructField('marketplace', types.StringType()),
        types.StructField('customer_id', types.IntegerType()),
        types.StructField('review_id', types.StringType()),
        types.StructField('product_id', types.StringType()),
        types.StructField('product_parent', types.LongType()),
        types.StructField('product_title', types.StringType()),
        types.StructField('product_category', types.StringType()),
        types.StructField('star_rating', types.IntegerType()),
        types.StructField('helpful_votes', types.IntegerType()),
        types.StructField('total_votes', types.IntegerType()),
        types.StructField('vine', types.StringType()),
        types.StructField('verified_purchase', types.StringType()),
        types.StructField('review_headline', types.StringType()),
        types.StructField('review_body', types.StringType()),
        types.StructField('review_date', types.DateType())
    ])

    # note: the schema above is declared for reference but not passed to the reader;
    # parquet files carry their own schema
    input_df = spark.read.parquet(inputs)
    input_df = input_df.repartition(96)
    #input_df.show()
    #print("No of rows in input dataset:", inputs, " is:", input_df.count())

    StopWords = stopwords.words("english")

    start_time = time.time()
    tokens = input_df.rdd.map(lambda x: x['review_headline'])\
        .filter(lambda x: x is not None)\
        .map(lambda document: document.strip().lower())\
        .map(lambda document: re.split(" ", document))\
        .map(lambda word: [x for x in word if x.isalpha()])\
        .map(lambda word: [x for x in word if len(x) > 3])\
        .map(lambda word: [x for x in word if x not in StopWords])\
        .zipWithIndex()

    df_txts = spark.createDataFrame(tokens, ["list_of_words", 'index'])

    # TF
    cv = CountVectorizer(inputCol="list_of_words", outputCol="raw_features", vocabSize=5000, minDF=10.0)
    cvmodel = cv.fit(df_txts)
    result_cv = cvmodel.transform(df_txts)

    # IDF
    idf = IDF(inputCol="raw_features", outputCol="features")
    idfModel = idf.fit(result_cv)
    result_tfidf = idfModel.transform(result_cv)
    #result_tfidf.show()

    num_topics = 10
    max_iterations = 100

    lda = LDA(k=num_topics, maxIter=max_iterations)
    lda_model = lda.fit(result_tfidf.select('index', 'features'))

    wordNumbers = 5
    #topicIndices = sc.parallelize(lda_model.describeTopics(maxTermsPerTopic = wordNumbers))
    topics = lda_model.describeTopics(maxTermsPerTopic=wordNumbers)
    topics.show(truncate=False)
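    # Not in the original snippet: describeTopics() returns term *indices* into the
    # CountVectorizer vocabulary; a small hedged sketch to print the actual words per topic,
    # reusing cvmodel and topics from above.
    vocab = cvmodel.vocabulary
    for topic_row in topics.collect():
        terms = [vocab[i] for i in topic_row['termIndices']]
        print("topic %d: %s" % (topic_row['topic'], ", ".join(terms)))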
cv = CountVectorizer(inputCol='words_filtered', outputCol='BoW', minDF=2.0)
cv_model = cv.fit(train_filtered)
train_data = cv_model.transform(train_filtered)
dev_data = cv_model.transform(dev_filtered)
test_data = cv_model.transform(test_filtered)

# TODO: Print the vocabulary size (to STDOUT) after filtering out stopwords and very rare tokens
# Hint: Look at the parameters of CountVectorizer
# [FIX ME!] Write code below
print("Vocabulary size after filtering stopwords and rare tokens =", len(cv_model.vocabulary))

# Create a TF-IDF representation of the data
idf = IDF(inputCol='BoW', outputCol='TFIDF')
idf_model = idf.fit(train_data)
train_tfidf = idf_model.transform(train_data)
dev_tfidf = idf_model.transform(dev_data)
test_tfidf = idf_model.transform(test_data)

# ----- PART III: MODEL SELECTION -----

# Provide information about class labels: needed for model fitting
# Only needs to be defined once for all models (but included in all pipelines)
label_indexer = StringIndexer(inputCol='class_label', outputCol='label')

# Create an evaluator for binary classification
# Only needs to be created once, can be reused for all evaluation
evaluator = BinaryClassificationEvaluator()

# Train a decision tree with default parameters (including maxDepth=5) -- a sketch follows below
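# A minimal sketch of the decision-tree step announced above, reusing the names from this
# snippet (train_tfidf, dev_tfidf, 'TFIDF', label_indexer, evaluator); an illustration, not
# the original solution code.
from pyspark.ml import Pipeline
from pyspark.ml.classification import DecisionTreeClassifier

dt = DecisionTreeClassifier(featuresCol='TFIDF', labelCol='label')  # maxDepth defaults to 5
dt_pipeline = Pipeline(stages=[label_indexer, dt])
dt_model = dt_pipeline.fit(train_tfidf)
dev_pred = dt_model.transform(dev_tfidf)
print("Decision tree areaUnderROC on dev =", evaluator.evaluate(dev_pred))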
udf_cleansing_and_tokenizing = functions.udf(cleansing_and_tokenizing)
test_data = test_data.withColumn("tweet_cleansed", udf_cleansing_and_tokenizing(functions.col("tweet")))

from pyspark.ml.feature import Tokenizer
tokenizer = Tokenizer(inputCol="tweet_cleansed", outputCol="words")
test_data = tokenizer.transform(test_data)

from pyspark.ml.feature import HashingTF
hashingTF = HashingTF(inputCol="words", outputCol="term_freq")
test_data = hashingTF.transform(test_data)
test_data.show(5)

from pyspark.ml.feature import IDF
idf = IDF(inputCol="term_freq", outputCol="tfidf")
idfModel = idf.fit(test_data)
test_data = idfModel.transform(test_data)
test_data.show(5)

from pyspark.ml.feature import StringIndexer
stringIndexer = StringIndexer(inputCol="label", outputCol="labelIndex")
model = stringIndexer.fit(test_data)
test_data = model.transform(test_data)
test_data.show(5)

predicted = test_data.select("tfidf", "labelIndex")
predicted.show(5)

model_folder = os.path.join(os.getcwd(), 'saved_models')
model_full_path = os.path.join(model_folder, "twitter_sentiment_spark")

if not os.path.exists(model_folder):
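    # Hedged continuation -- the original snippet is truncated at the line above. Creating the
    # missing folder is the usual next step; the commented save call below is only an assumption.
    os.makedirs(model_folder)
# e.g. persist a fitted model for later scoring (assumed, not shown in the original):
# model.write().overwrite().save(model_full_path)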
def main(sc, sqlContext):
    #start = timer()

    #print '---Fetching user, posts, tokens and categories from MongoDB---'
    #start_i = timer()
    user = findUserById(iduser)
    posts = findPosts(user)

    tokens, category, categoryAndSubcategory = getTokensAndCategories()

    postsRDD = (sc.parallelize(posts).map(lambda s: (s[0], word_tokenize(s[1].lower()), s[2], s[3]))
                                     .map(lambda p: (p[0], [x for x in p[1] if x in tokens], p[2], p[3]))
                                     .cache())
    #print '####took %d seconds' % (timer() - start_i)

    #print '---Fetching products from MongoDB---'
    #start_i = timer()
    #print '####took %d seconds' % (timer() - start_i)

    #print '---Building corpusRDD---'
    #start_i = timer()
    stpwrds = stopwords.words('portuguese')
    corpusRDD = (postsRDD.map(lambda s: (s[0], [PorterStemmer().stem(x) for x in s[1] if x not in stpwrds], s[2], s[3]))
                         .filter(lambda x: len(x[1]) >= 20 or (x[2] == u'Post' and len(x[1]) > 0))
                         .cache())
    #print '####took %d seconds' % (timer() - start_i)

    #print '---Computing TF-IDF---'
    #start_i = timer()
    wordsData = corpusRDD.map(lambda s: Row(label=int(s[0]), words=s[1], type=s[2]))
    wordsDataDF = sqlContext.createDataFrame(wordsData).unionAll(sqlContext.read.parquet("/home/ubuntu/recsys-tcc-ml/parquet/wordsDataDF.parquet"))

    numTokens = len(tokens)

    hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=numTokens)
    idf = IDF(inputCol="rawFeatures", outputCol="features")
    featurizedData = hashingTF.transform(wordsDataDF)
    idfModel = idf.fit(featurizedData)
    tfIDF = idfModel.transform(featurizedData).cache()

    postTFIDF = (tfIDF
                 .filter(tfIDF.type == u'Post')
                 #.map(lambda s: Row(label=s[0], type=s[1], words=s[2], rawFeatures=s[3], features=s[4], sentiment=SVM.predict(s[4])))
                 .cache())

    #postTFIDF = postTFIDF.filter(lambda p: p.sentiment == 1)
    #print '####took %d seconds' % (timer() - start_i)

    #print '---Loading the models---'
    #start_i = timer()
    NB = NaiveBayesModel.load(sc, '/home/ubuntu/recsys-tcc-ml/models/naivebayes/modelo_categoria')
    SVM = SVMModel.load(sc, "/home/ubuntu/recsys-tcc-ml/models/svm")
    #print '####took %d seconds' % (timer() - start_i)

    #print '---Using the models---'
    #start_i = timer()
    predictions = (postTFIDF
                   .map(lambda p: (NB.predict(p.features), p[0], SVM.predict(p.features)))
                   .filter(lambda p: p[2] == 1)
                   .map(lambda p: (p[0], p[1]))
                   .groupByKey()
                   .mapValues(list)
                   .collect())
    #print '####took %d seconds' % (timer() - start_i)

    #print '---Computing similarities---'
    #start_i = timer()
    suggestions = []

    for prediction in predictions:
        category_to_use = category[int(prediction[0])]
        #print ' Computing similarities for category: {}'.format(category_to_use)
        tf = tfIDF.filter(tfIDF.type == category_to_use).cache()
        for post in prediction[1]:
            postVector = postTFIDF.filter(postTFIDF.label == post).map(lambda x: x.features).collect()[0]
            sim = (tf
                   .map(lambda x: (post, x.label, cossine(x.features, postVector)))
                   .filter(lambda x: x[2] >= threshold)
                   .collect())
            if len(sim) > 0:
                suggestions.append(sim)
    #print '####took %d seconds' % (timer() - start_i)

    if len(suggestions) > 0:
        #print '---Inserting recommendations into MongoDB---'
        #start_i = timer()
        insertSuggestions(suggestions, iduser, posts)
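# cossine(...), threshold and iduser are defined elsewhere in this project; a minimal sketch of
# a cosine-similarity helper with the same signature (an assumption, not the original code):
import numpy as np

def cossine(v1, v2):
    # v1 and v2 are Spark (sparse) feature vectors; convert to dense numpy arrays
    a, b = v1.toArray(), v2.toArray()
    denom = np.linalg.norm(a) * np.linalg.norm(b)
    return float(np.dot(a, b) / denom) if denom != 0 else 0.0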
dfNorm = normalizerUni.transform(dfVect2)
dfNorm2 = normalizerBi.transform(dfNorm)
print "DataFrame (bi-gram): normalised"
dfNorm2.select('words', 'normWords').show()
# The difference does not show up in the table, since there is only room to display the indices
# of the non-zero elements, not their values.

# On to TF-IDF.
# Obviously, by picking the right DataFrame among the ones above, these computations can be
# applied to any column (bigrams, with or without stop words, ...)
from pyspark.ml.feature import HashingTF
htf = HashingTF(inputCol='words', outputCol='wordsTF', numFeatures=10000)
dfTrainTF = htf.transform(dfTrainTokNoSw)

# Inverse document frequency
from pyspark.ml.feature import IDF
idf = IDF(inputCol=htf.getOutputCol(), outputCol="wordsTFIDF")
idfModel = idf.fit(dfTrainTF)
dfTrainTFIDF = idfModel.transform(dfTrainTF)

dfTrainTFIDF.select('review', 'wordsTF', 'wordsTFIDF').show()

# I know this step was useful to me once; here it does not look strictly necessary
# (it indexes the string labels as a numeric target column)
from pyspark.ml.feature import StringIndexer
string_indexer = StringIndexer(inputCol='label', outputCol='target_indexed')
string_indexer_model = string_indexer.fit(dfTrainTFIDF)
dfTrainFinal = string_indexer_model.transform(dfTrainTFIDF)
dfTrainFinal.select('review', 'label', 'target_indexed').show()

#**********************************************************************
#-----------Training the model for prediction--------------------------
#**********************************************************************
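# The banner above announces model training; one hedged way to continue is a DecisionTreeClassifier
# on the TF-IDF features and indexed labels created above (an illustration, not necessarily the
# original notebook's model choice).
from pyspark.ml.classification import DecisionTreeClassifier

dt = DecisionTreeClassifier(featuresCol='wordsTFIDF', labelCol='target_indexed', maxDepth=10)
dt_model = dt.fit(dfTrainFinal)
dt_model.transform(dfTrainFinal).select('review', 'target_indexed', 'prediction').show()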