def sample_tf_idf(self, dataRDD, nd_idf, agg_idf):
    dataDF = dataRDD.map(lambda i: Row(**{
        'salary': int(i.salary),
        'agg': [i.education] + [i.city] + [i.work_lable] + [i.work_exp],
        'name_and_desp': desp_text_division(i.name + ',' + i.work_desp)
    })).toDF()
    dataDF.show()
    ndtf = HashingTF(inputCol='name_and_desp', outputCol='ndFeatures', numFeatures=10240)
    aggtf = HashingTF(inputCol='agg', outputCol='Features_agg', numFeatures=256)
    data = ndtf.transform(dataDF)
    data = aggtf.transform(data)
    data.show()
    idfdata = nd_idf.transform(data)
    idfdata = agg_idf.transform(idfdata)
    # Keep only the columns needed downstream.
    idfdata = idfdata.select('salary', 'ndFeatures', 'Features_agg')
    featuresRDD = idfdata.rdd.map(lambda i: (
        i.salary,
        i.ndFeatures.toArray().tolist() + i.Features_agg.toArray().tolist()))
    return featuresRDD
def extract_features(self, train_rdd=None, test_rdd=None):
    """
    train_rdd: type rdd, the raw rdd of train data (text content, label)
    test_rdd: type rdd, the raw rdd of test data (text content, doc_id)
    return: a pair of data frames where each record contains the extracted features
    """
    print('****************************')
    print('Feature Extraction: TF-IDF\n')
    train_raw_df = train_rdd.map(lambda row: (self.convert(row[0]), row[1])).toDF(
        ['words', 'label'])
    test_raw_df = test_rdd.map(lambda row: (self.convert(row[0]), row[1])).toDF(
        ['words', 'doc_id'])
    ngram = NGram(n=2, inputCol="words", outputCol="ngrams")
    train_ngram_df = ngram.transform(train_raw_df).drop('words')
    test_ngram_df = ngram.transform(test_raw_df).drop('words')
    hashing_tf = HashingTF(inputCol='ngrams', outputCol='raw_features')
    train_raw_featured_data = hashing_tf.transform(train_ngram_df).drop('ngrams')
    test_raw_featured_data = hashing_tf.transform(test_ngram_df).drop('ngrams')
    idf = IDF(inputCol='raw_features', outputCol='features')
    # Fit the IDF model on the training data only, then apply it to both sets.
    idf_model = idf.fit(train_raw_featured_data)
    train_df = idf_model.transform(train_raw_featured_data).drop('raw_features')
    test_df = idf_model.transform(test_raw_featured_data).drop('raw_features')
    return (train_df, test_df)
def vectorize(preprocessed_df, incl_idf=False):
    """
    Generate Feature Vectors from the Pre-processed corpus using the
    HashingTF transformer on the filtered, stemmed and normalised list of Tokens
    """
    # Generate Term Frequency Feature Vectors by passing the sequence of tokens
    # to the HashingTF Transformer. Then fit an IDF Estimator to the Featurized
    # Dataset to generate the IDFModel. Finally pass the TF Feature Vectors to
    # the IDFModel to scale based on frequency across the corpus.
    if incl_idf:
        hashing_tf = HashingTF(inputCol="tokens", outputCol="raw_features", numFeatures=280)
        features_df = hashing_tf.transform(preprocessed_df)
        idf = IDF(inputCol="raw_features", outputCol="features")
        idf_model = idf.fit(features_df)
        scaled_features_df = idf_model.transform(features_df)
        return scaled_features_df
    else:
        hashing_tf = HashingTF(inputCol="tokens", outputCol="features", numFeatures=280)
        features_df = hashing_tf.transform(preprocessed_df)
        # Return the final vectorized DataFrame
        return features_df
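# A minimal usage sketch for vectorize() above, assuming an active SparkSession
# named `spark` and that preprocessing already produced the 'tokens' arrays
# (the sample data is illustrative, not from the original):
sample_df = spark.createDataFrame(
    [(["spark", "hash", "token"],), (["spark", "idf", "scale"],)], ["tokens"])
tf_only_df = vectorize(sample_df)                 # raw term frequencies in 'features'
tf_idf_df = vectorize(sample_df, incl_idf=True)   # IDF-rescaled vectors in 'features'
tf_idf_df.select("features").show(truncate=False)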
def get_top_N(sc, major, minor, inputtext, N=5):
    # load TF-IDF feature
    # idfmodel = IDFModel.load("file:///Users/nileshbhoyar/Documents/Docker/idfmodel")
    idfmodel = load_model(sc, major, minor)
    df = load_data(sc, major, minor)
    raw_df = df.select("appl_doc_number", "claim_text")
    tokenizer = Tokenizer(inputCol="claim_text", outputCol="words")
    wordsData = tokenizer.transform(raw_df.dropna(how="any", subset="claim_text"))
    hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures")
    featurizedData = hashingTF.transform(wordsData)
    rescaledData = idfmodel.transform(featurizedData)
    model = rescaledData.select("appl_doc_number", "features")

    # prepare candidate input text
    sqlContext = SQLContext(sc)
    candidate_raw = sqlContext.createDataFrame(
        [(9999999, inputtext)], ["appl_doc_number", "claim_text"])
    tokenizer = Tokenizer(inputCol="claim_text", outputCol="words")
    candidate = tokenizer.transform(candidate_raw)
    hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures")
    candidateTf = hashingTF.transform(candidate)
    candidateTfIdf = idfmodel.transform(candidateTf)

    # load child models for similarity calculations.
    # model = load_model(sc, major, minor)

    # find similarities
    result = find_similar(model, candidateTfIdf, N)
    top = sorted(result, key=lambda x: -x[1])[0:N]
    return get_claim_text(sc, [int(i[0]) for i in top], major, minor, df)
def processing1(request):
    if request.method != 'POST':
        return HttpResponseRedirect('/sp/processing')
    else:
        progress = request.POST.get('progress')
        sc = SparkContext('local', 'test')
        spark = SparkSession.builder.getOrCreate()
        if progress == '1':
            city = request.POST.get('city')
            edu = request.POST.get('education')
            introduce = request.POST.get('introduce')
            position = request.POST.get('job')
            exp = request.POST.get('exp')
            print(city, edu, introduce, position, exp)
            explain1 = ('Step 1: the submitted fields are collected into an RDD by category, '
                        'merged and converted through map operations, tokenized, and finally '
                        'turned into a DataFrame for the next step. The data now looks like this:')
            dataRDD = sc.parallelize([[edu, city, position, exp, introduce]])
            dataDF = dataRDD.map(lambda i: Row(**{
                'education': i[0],
                'work_area': i[1],
                'work_lable': i[2],
                'work_exp': i[3],
                'work_desp': i[4]
            })).map(lambda i: Row(**{
                'education': str(new_edu_trans(i.education)),
                'city': [i.work_area],
                'work_desp': i.work_desp,
                'work_lable': [i.work_lable],
                'work_exp': [i.work_exp]
            })).map(lambda i: Row(**{
                'agg': [i.education] + i.city + i.work_lable + i.work_exp,
                'name_and_desp': desp_text_division(i.work_desp)
            })).toDF()
            dct = {}
            for i in dataDF.collect():
                dct['agg1'] = i[0]
                dct['nd1'] = i[1]
            # spark.stop()
            # sc.stop()
            agg_pro1 = 'Education, city, position and experience merged into one row:'
            nd_pro1 = 'The personal introduction kept as its own row:'
            agg_pro2 = 'Vector built from education, experience, position and city:'
            nd_pro2 = 'Vector built from the personal introduction:'
            explain2 = ('Step 2: the resulting lists are converted into TF-IDF feature vectors '
                        'with the Spark ML package, ready for the machine-learning step.')
            nd_idf = IDFModel.load('hdfs://localhost:9000/nd_idf_test')
            agg_idf = IDFModel.load('hdfs://localhost:9000/agg_idf_test')
            ndtf = HashingTF(inputCol='name_and_desp', outputCol='ndFeatures', numFeatures=10240)
            aggtf = HashingTF(inputCol='agg', outputCol='Features_agg', numFeatures=256)
            data = ndtf.transform(dataDF)
            data = aggtf.transform(data)
            idfdata = nd_idf.transform(data)
            idfdata = agg_idf.transform(idfdata)
            for i in idfdata.collect():
                dct['agg2'] = i[3]
                dct['nd2'] = i[2]
            spark.stop()
            sc.stop()
            return render(request, 'processing1.html',
                          {'data': dct, 'explain1': explain1, 'agg_pro1': agg_pro1,
                           'nd_pro1': nd_pro1, 'agg_pro2': agg_pro2, 'nd_pro2': nd_pro2,
                           'explain2': explain2})
def textPredict(request):
    """6. Text clustering and popularity prediction"""
    label = request.POST['label']
    title = request.POST['title']

    conf = SparkConf().setAppName('textPredict').setMaster('spark://HP-Pavilion:7077')
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)

    """Process the dataset and build the feature vectors"""
    dfTitles = sqlContext.read.parquet('data/roll_news_sina_com_cn.parquet')
    print(dfTitles.dtypes)
    tokenizer = Tokenizer(inputCol="title", outputCol="words")
    wordsData = tokenizer.transform(dfTitles)
    hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=20)
    featurizedData = hashingTF.transform(wordsData)
    idf = IDF(inputCol="rawFeatures", outputCol="features")
    idfModel = idf.fit(featurizedData)
    rescaledData = idfModel.transform(featurizedData)
    rescaledData.show()
    for features_label in rescaledData.select("features", "rawFeatures").take(3):
        print(features_label)

    """Train the decision-tree model"""
    labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(rescaledData)
    featureIndexer = \
        VectorIndexer(inputCol="features", outputCol="indexedFeatures",
                      maxCategories=4).fit(rescaledData)
    (trainingData, testData) = rescaledData.randomSplit([0.7, 0.3])
    dt = DecisionTreeClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures")
    pipeline = Pipeline(stages=[labelIndexer, featureIndexer, dt])
    model = pipeline.fit(trainingData)

    """Test the model"""
    predictions = model.transform(testData)
    predictions.show()
    predictions.select("prediction", "indexedLabel", "features").show(5)

    """Score user-supplied data: a single news item"""
    sentenceData = sqlContext.createDataFrame([
        (label, title),
    ], ['label', "title"])
    tokenizer = Tokenizer(inputCol="title", outputCol="words")
    wordsData = tokenizer.transform(sentenceData)
    hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=20)
    featurizedData = hashingTF.transform(wordsData)
    rescaledData = idfModel.transform(featurizedData)
    myprediction = model.transform(rescaledData)
    print("==================================================")
    myprediction.show()
    resultList = convertDfToList(myprediction)

    """Evaluate the model"""
    # Note: on Spark 2.x the multiclass metric is named "accuracy";
    # "precision" was only accepted by older releases.
    evaluator = MulticlassClassificationEvaluator(
        labelCol="indexedLabel", predictionCol="prediction", metricName="accuracy")
    accuracy = evaluator.evaluate(predictions)
    print("Test Error = %g " % (1.0 - accuracy))
    treeModel = model.stages[2]
    print(treeModel)
    sc.stop()
    # Note: render() also needs a template name between the request and the context.
    return render(request, {'resultList': resultList})
def get_product_similarity(self):
    """
    Calculate the similarity between items/users
    """
    # __data_manipulation groups the taxonomy values per product and returns
    # the L2-normalised TF-IDF vectors in the 'norm' column, so the pairwise
    # dot products below are cosine similarities.
    norma_data = self.__data_manipulation(self.productCol)
    col1 = "i." + self.productCol
    col2 = "j." + self.productCol
    dot_udf = udf(lambda x, y: float(x.dot(y)), DoubleType())
    result = norma_data.alias("i").crossJoin(norma_data.alias("j"))\
        .select(
            col(col1).alias("i"),
            col(col2).alias("j"),
            dot_udf("i.norm", "j.norm").alias("dot"))\
        .sort("i", "j")
    # Parenthesise the comparisons: '&' binds tighter than '<' and '>'.
    result = result.filter((result.i < result.j) & (result.dot > 0.5))
    return result
def __data_manipulation(self, col):
    # `col` is the name of the grouping column (e.g. the product column).
    data = self.data.select(col, self.taxonomyCol).distinct()
    data = data.withColumn(self.taxonomyCol, data[self.taxonomyCol].cast(StringType()))
    concat_list = udf(lambda lst: ", ".join(lst), StringType())
    data = data.groupby(col).agg(
        collect_list(self.taxonomyCol).alias(self.taxonomyCol))
    data = data.withColumn(self.taxonomyCol, concat_list(self.taxonomyCol))
    data = data.withColumn(
        self.taxonomyCol,
        split(regexp_replace(self.taxonomyCol, " ", ""), ','))
    hashingTF = HashingTF(inputCol=self.taxonomyCol, outputCol="tf")
    tf = hashingTF.transform(data)
    idf = IDF(inputCol="tf", outputCol="feature").fit(tf)
    tfidf = idf.transform(tf)
    normalizer = Normalizer(inputCol="feature", outputCol="norm")
    norma_data = normalizer.transform(tfidf)
    return norma_data
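# Why the crossJoin's dot product above is a cosine similarity: Normalizer(p=2.0)
# scales each vector to unit length, so x.dot(y) equals cos(x, y). A small
# self-contained check (only the import is assumed):
from pyspark.ml.linalg import Vectors

a = Vectors.dense([3.0, 4.0])
b = Vectors.dense([4.0, 3.0])
unit_dot = (a / a.norm(2)).dot(b / b.norm(2))  # dot of the unit vectors
cosine = a.dot(b) / (a.norm(2) * b.norm(2))    # textbook cosine similarity
print(unit_dot, cosine)                        # both print 0.96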
def tf_idf_usecase():
    spark = getSparkSession()
    sentenceData = spark.createDataFrame(
        [(0.0, "Hi I heard about Spark"),
         (0.0, "I wish Java could use case classes"),
         (1.0, "Logistic regression models are neat")],
        ["label", "sentence"])
    # Tokenizer: splits each sentence into words.
    tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
    wordsData = tokenizer.transform(sentenceData)
    wordsData.show(truncate=False)
    # HashingTF: turns the word arrays in the `words` column into
    # bag-of-words (term-frequency) vectors.
    hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=20)
    featurizedData = hashingTF.transform(wordsData)
    featurizedData.select("words", "rawFeatures").show(truncate=False)
    # TF-IDF: TF is the frequency of a term within a single document.
    # IDF is log((numDocs + 1) / (docFreq + 1)); the corpus size is fixed,
    # so the more documents a term appears in, the less important it is.
    idf = IDF(inputCol="rawFeatures", outputCol="features")
    idfModel = idf.fit(featurizedData)
    rescaledData = idfModel.transform(featurizedData)
    rescaledData.select("label", "features").show(truncate=False)
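# A quick numeric check of the IDF formula in the comment above, no Spark needed:
# with the 3 sentences in this example, "spark" occurs in 1 document, while a
# term present in all 3 documents would be weighted away entirely.
import math
print(math.log((3 + 1) / (1 + 1)))  # ~0.693: rarer terms get larger weights
print(math.log((3 + 1) / (3 + 1)))  # 0.0: a term in every document carries no weight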
def run_minhash_lsh():
    df = util.read_all_json_from_bucket(sql_context, config.S3_BUCKET_BATCH_PREPROCESSED)
    mh = MinHashLSH(inputCol="text_body_vectorized", outputCol="min_hash",
                    numHashTables=config.LSH_NUM_BANDS)

    # Vectorize so we can fit to MinHashLSH model
    htf = HashingTF(inputCol="text_body_stemmed", outputCol="raw_features", numFeatures=1000)
    htf_df = htf.transform(df)
    vectorizer = VectorAssembler(inputCols=["raw_features"], outputCol="text_body_vectorized")
    vdf = vectorizer.transform(htf_df)

    if config.LOG_DEBUG:
        print(colored("[MLLIB BATCH]: Fitting MinHashLSH model...", "green"))
    model = mh.fit(vdf)

    # Compute pairwise LSH similarities for questions within tags
    if config.LOG_DEBUG:
        print(colored(
            "[BATCH]: Fetching questions in same tag, comparing LSH and MinHash, "
            "uploading duplicate candidates back to Redis...", "cyan"))
    find_dup_cands_within_tags(model)
def tfidf_lda(df):
    '''
    TF-IDF + LDA
    :param df:
    :return: DataFrame with the LDA topic distribution per document
    '''
    # hashingTF
    hashingTF = HashingTF(inputCol="content", outputCol="features")
    df_TF = hashingTF.transform(df)
    print('df_TF')
    df_TF.show(truncate=False)
    # IDF
    idf = IDF(inputCol="features", outputCol="idf")
    model_idf = idf.fit(df_TF)
    df_idf = model_idf.transform(df_TF)
    print('df_idf')
    df_idf.cache()
    df_idf.show(truncate=False)
    # LDA
    lda = LDA(k=20, seed=1, optimizer="em")
    model_lda = lda.fit(df_idf)
    model_lda.describeTopics(maxTermsPerTopic=20)
    df_lda = model_lda.transform(df_idf)
    df_lda.select("content", "topicDistribution").show(truncate=False)
    # Keep only the two columns named in the result schema and return them.
    return sparkEntrance.spark.createDataFrame(
        df_lda.select('content', 'topicDistribution').rdd,
        ['content', 'topicDistribution'])
def vectorize(self, df, n_features=16):
    '''
    Generates vectorized features from the self.traindf dataframe
    --------
    Parameters
    df: spark dataframe - object to be featurized
    n_features: int - max number of words to be used as features
    --------
    Returns
    Spark dataframe - the vectorized and rescaled data.
    '''
    self.spark.udf.register('listjoin', lambda x: ' '.join(x))
    remover = StopWordsRemover(inputCol="content", outputCol="filtered")
    df_lab_stopped = remover.transform(df)
    df_lab_stopped.registerTempTable('df_lab_stopped')
    stop_strings = self.spark.sql('''
        SELECT listjoin(filtered) as filtered, content, label
        FROM df_lab_stopped
    ''')
    tokenizer = Tokenizer(inputCol="filtered", outputCol="words")
    wordsData = tokenizer.transform(stop_strings)
    hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=n_features)
    featurizedData = hashingTF.transform(wordsData)
    featurizedData.cache()
    idf = IDF(inputCol="rawFeatures", outputCol="features")
    idfModel = idf.fit(featurizedData)
    rescaledData = idfModel.transform(featurizedData)
    return rescaledData
def test(spark):
    sc = spark.sparkContext
    tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
    hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=8000)
    idf = IDF(inputCol="rawFeatures", outputCol="features")
    srcdf = sc.textFile('predict.csv').map(parse_line)
    testing = srcdf.toDF()
    model = DecisionTreeClassificationModel.load('Bayes20000')
    testWordsData = tokenizer.transform(testing)
    testFeaturizedData = hashingTF.transform(testWordsData)
    # Caveat: this refits IDF on the prediction data; ideally the IDFModel
    # fitted at training time would be persisted and reused here.
    testIDFModel = idf.fit(testFeaturizedData)
    testRescaledData = testIDFModel.transform(testFeaturizedData)
    testRescaledData.persist()
    testDF = testRescaledData.select("features", "label").rdd.map(
        lambda x: Row(label=float(x['label']), features=Vectors.dense(x['features']))).toDF()
    predictions = model.transform(testDF)
    predictions.select('prediction').write.csv(path='submit', header=True, sep=',', mode='overwrite')
    evaluator = MulticlassClassificationEvaluator(labelCol="label",
                                                  predictionCol="prediction",
                                                  metricName="accuracy")
    accuracy = evaluator.evaluate(predictions)
    print("The accuracy on test-set is " + str(accuracy))
def pipeline(df):
    print(df.head())
    df = df.withColumn("length", length(df['Speech']))
    # Create the data processing pipeline functions here (note: StringIndexer
    # will be used to encode your target variable column. This column should be
    # named 'label' so our model will recognize it later.)
    review_data = Tokenizer(inputCol="Speech", outputCol="Words")
    reviewed = review_data.transform(df)
    # reviewed.show()
    remover = StopWordsRemover(inputCol="Words", outputCol="filtered")
    newFrame = remover.transform(reviewed)
    # newFrame.show()
    hashing = HashingTF(inputCol="filtered", outputCol="hashedValues", numFeatures=pow(2, 10))
    # Transform into a DF
    hashed_df = hashing.transform(newFrame)
    hashed_df.show(truncate=False)
    idf = IDF(inputCol="hashedValues", outputCol="feature")
    idfModel = idf.fit(hashed_df)
    rescaledData = idfModel.transform(hashed_df)
    rescaledData.select("words", "feature").show(truncate=False)
    # indexer = StringIndexer(inputCol="Party_Affliation", outputCol="label")
    # indexed = indexer.fit(rescaledData).transform(rescaledData)
    assembler = VectorAssembler(inputCols=["feature", "length"], outputCol="features")
    return assembler.transform(rescaledData)
def create_TFIDF_v0(trainData, applyData, inputCol="text", outputCol="features",
                    minDocFreq=3, numFeatures=20):
    tokenizer = RegexTokenizer(pattern="[.:\s]+", inputCol=inputCol, outputCol="z_words")
    wordsData1 = tokenizer.transform(trainData)
    wordsData2 = tokenizer.transform(applyData)
    remover = StopWordsRemover(inputCol="z_words", outputCol="z_filtered", stopWords=STOPWORDS_v0)
    wordsDataFiltered1 = remover.transform(wordsData1)
    wordsDataFiltered2 = remover.transform(wordsData2)
    hashingTF = HashingTF(inputCol="z_filtered", outputCol="z_rawFeatures", numFeatures=numFeatures)
    featurizedData1 = hashingTF.transform(wordsDataFiltered1)
    featurizedData2 = hashingTF.transform(wordsDataFiltered2)
    # alternatively, CountVectorizer can also be used to get term frequency vectors
    idf = IDF(inputCol="z_rawFeatures", outputCol=outputCol, minDocFreq=minDocFreq)
    idfModel = idf.fit(featurizedData1)
    rescaledData = idfModel.transform(featurizedData2)
    return rescaledData.drop("z_words", "z_filtered", "z_rawFeatures", inputCol)
def get_feature(dataframe=df_train_x, nFeature=200):
    # convert the input string to lowercase and then split it by regex pattern
    regexTokenizer = RegexTokenizer(inputCol="text", outputCol="words", pattern="\\W")
    words_data = regexTokenizer.transform(dataframe)
    # count_tokens = udf(lambda words: len(words), IntegerType())
    # count the number of words in each review
    # words_data.select("words").withColumn("tokens", count_tokens(col("words"))).show(5, truncate=True)

    # remove stop words (e.g. the, who, which, at, on, I)
    stopWordsRemover = StopWordsRemover(inputCol="words", outputCol="words_removed")
    words_removed_data = stopWordsRemover.transform(words_data)
    # count_tokens_new = udf(lambda words_removed: len(words_removed), IntegerType())
    # words_removed_data.select("words_removed").withColumn("tokens_new", count_tokens_new(col("words_removed"))).show(5, truncate=True)

    # transform input features into n-grams
    # nGram = NGram(n=2, inputCol="words_removed", outputCol="ngrams")
    # ngrams_data = nGram.transform(words_removed_data)

    # transform list of words to word-frequency vectors
    hashingTF = HashingTF(inputCol="words_removed", outputCol="words_freq", numFeatures=nFeature)
    words_freq_data = hashingTF.transform(words_removed_data)
    # words_freq_data.select("words_freq").show(5, truncate=True)

    # compute the IDF vector and scale word frequencies by IDF
    idf = IDF(inputCol="words_freq", outputCol="features")
    idf_model = idf.fit(words_freq_data)
    feature_data = idf_model.transform(words_freq_data).select("features")
    return feature_data
def get_word(text):
    # Split each sentence into words.
    tokenizer = Tokenizer(inputCol="text", outputCol="words")
    wordsData = tokenizer.transform(text)
    # wordsData.show()

    # # Remove stop words
    # remover = StopWordsRemover() \
    #     .setStopWords(mystopwords) \
    #     .setCaseSensitive(False) \
    #     .setInputCol("words") \
    #     .setOutputCol("filtered")
    # remover.transform(wordsData).show()

    # TF vectorization: HashingTF hashes each sentence into a feature vector.
    hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=50)
    featurizedData = hashingTF.transform(wordsData)
    featurizedData.show()

    # IDF vectorization: IDF rescales the feature vectors.
    # idf = IDF(inputCol="rawFeatures", outputCol="features")
    # idfModel = idf.fit(featurizedData)  # fit() learns the IDF weights from the data
    # Final TF-IDF result:
    # rescaledData = idfModel.transform(featurizedData)
    # rescaledData.show()

    result = featurizedData.select('words', 'rawFeatures').rdd.map(lambda x: x)
    for i in result.collect():
        print(i)
    return True
def calculate_hashingtf_idf(self, files_df):
    hashing_tf = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=262144)
    featurized_data = hashing_tf.transform(files_df)
    idf = IDF(inputCol="rawFeatures", outputCol="features")
    idf_model = idf.fit(featurized_data)
    rescaled_data = idf_model.transform(featurized_data)
    return rescaled_data
def feature_engineering(class_balancedDf):
    # N-Gram
    ngram = NGram(n=2, inputCol="lemmatized", outputCol="ngrams")
    ngramDataFrame = ngram.transform(class_balancedDf)

    # Hashing TF
    hashingTF = HashingTF(inputCol="ngrams", outputCol="rawFeatures", numFeatures=20)
    featurizedData = hashingTF.transform(ngramDataFrame)

    # IDF
    idf = IDF(inputCol="rawFeatures", outputCol="features")
    idfModel = idf.fit(featurizedData)
    rescaledData = idfModel.transform(featurizedData)

    # K-Means
    kmeans = KMeans().setK(6).setSeed(1)
    kmodel = kmeans.fit(rescaledData).transform(rescaledData)

    # LDA
    lda = LDA(k=10, maxIter=10)
    ldamodel = lda.fit(kmodel).transform(kmodel)

    # changing label column to int
    data = ldamodel.withColumn("label", ldamodel.label.cast("Integer")).drop("prediction")
    return data
def tf_idf(words):
    hashing_tf = HashingTF(numFeatures=1000, inputCol="words", outputCol="tf")
    tf = hashing_tf.transform(words)
    tf.cache()
    idf = IDF(minDocFreq=3, inputCol="tf", outputCol="features")
    model = idf.fit(tf)
    idf_res = model.transform(tf)
    return idf_res
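# Usage sketch for tf_idf() above, assuming an active SparkSession named
# `spark` (illustrative); the input only needs a 'words' array column, e.g.
# produced by a Tokenizer. Note minDocFreq=3 zeroes out terms that appear in
# fewer than 3 documents, so a tiny corpus like this yields all-zero features.
docs = spark.createDataFrame(
    [(["spark", "tf", "idf"],), (["spark", "hashing"],)], ["words"])
tf_idf(docs).select("tf", "features").show(truncate=False)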
def __preprocess_tdfidf(self, df: DataFrame):
    hashingTF = HashingTF().setInputCol("preprocessedData").setOutputCol("tf").setNumFeatures(1500000)
    idf = IDF().setInputCol("tf").setOutputCol("features")
    df = hashingTF.transform(df)
    df_model = idf.fit(df)
    df = df_model.transform(df)
    return df
def tf_idf_feature(wordsData):
    hashingTF = HashingTF(inputCol="filtered", outputCol="rawFeatures", numFeatures=20)
    featurizedData = hashingTF.transform(wordsData)
    idf = IDF(inputCol="rawFeatures", outputCol="features")
    idfModel = idf.fit(featurizedData)
    rescaledData = idfModel.transform(featurizedData)
    for features_label in rescaledData.select("features", "id").take(3):
        print(features_label)
def ece_idf(self, mergeRDD):
    dataDF = mergeRDD.map(lambda p: Row(**{'edu_city_exp': p[1]})).toDF()
    ece_hashingTF = HashingTF(inputCol='edu_city_exp', outputCol='eceFeatures', numFeatures=64)
    featuresData = ece_hashingTF.transform(dataDF)
    # Caveat: the IDF output column differs from its input only by case, which
    # is ambiguous under Spark's default case-insensitive column resolution.
    ece_idf = IDF(inputCol='eceFeatures', outputCol='ecefeatures')
    ece_idfModel = ece_idf.fit(featuresData)
    return ece_idfModel
def term_frequency(df, column):
    """
    Compute the term frequency of the tokens contained in a column.

    Transformation: array<string> --> vector
    """
    tf = HashingTF(inputCol=column, outputCol='_' + column)
    df = tf.transform(df)
    df = replace(df, column, '_' + column)
    return df
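# term_frequency() calls a replace() helper that is not defined in this
# snippet. A plausible sketch of it, assuming its job is to swap the temporary
# '_'+column vector in under the original column name:
def replace(df, column, new_column):
    """Drop `column` and rename `new_column` to take its place."""
    return df.drop(column).withColumnRenamed(new_column, column)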
def nl_idf(self, mergeRDD):
    dataDF = mergeRDD.map(lambda p: Row(**{'leibie and name': p[2]})).toDF()
    nl_hashingTF = HashingTF(inputCol='leibie and name', outputCol='nlFeatures', numFeatures=256)
    featuresData = nl_hashingTF.transform(dataDF)
    nl_idf = IDF(inputCol='nlFeatures', outputCol='nlfeatures')
    nl_idfModel = nl_idf.fit(featuresData)
    return nl_idfModel
def extract_tf_features(p_df, input_col, output_col):
    """
    Extracts TF features.
    :param p_df: A DataFrame.
    :param input_col: Name of the input column.
    :param output_col: Name of the output column.
    :return: A DataFrame.
    """
    hashingTF = HashingTF(inputCol=input_col, outputCol=output_col, numFeatures=3000)
    return hashingTF.transform(p_df)
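# Usage sketch, assuming an active SparkSession named `spark` and an existing
# tokenized column (the names here are illustrative):
tokens_df = spark.createDataFrame([(["a", "b", "a"],)], ["words"])
extract_tf_features(tokens_df, "words", "tf").show(truncate=False)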
def tokenize(df):
    tokenizer = Tokenizer(inputCol="itemdesc", outputCol="tokenizedText")
    tokenizedData = tokenizer.transform(df)
    numFeatures = 1000
    hashingScheme = HashingTF(inputCol="tokenizedText", outputCol="features",
                              numFeatures=numFeatures)
    featurizedData = hashingScheme.transform(tokenizedData)
    processedData = featurizedData.withColumn("label", featurizedData["label"]) \
        .select(["features", "label"])
    return processedData
def run_tf_idf_spark_ml(df, numFeatures=1 << 20):
    tokenizer = Tokenizer(inputCol="body", outputCol="words")
    wordsData = tokenizer.transform(df)
    hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=numFeatures)
    featurizedData = hashingTF.transform(wordsData)
    idf = IDF(inputCol="rawFeatures", outputCol="features")
    idfModel = idf.fit(featurizedData)
    return idfModel.transform(featurizedData)
def tfidf(dataframe, in_col1, out_col1, in_col2, out_col2, n):
    global idfModel
    hashingTF = HashingTF(inputCol=in_col1, outputCol=out_col1, numFeatures=n)
    featurizedData = hashingTF.transform(dataframe)
    idf = IDF(inputCol=in_col2, outputCol=out_col2)
    idfModel = idf.fit(featurizedData)
    dataframe = idfModel.transform(featurizedData)
    return dataframe
def vectorizer_pipeline(preprocessed_df):
    """
    Generate Feature Vectors from the Pre-processed corpus using the
    HashingTF transformer on the filtered, stemmed and normalised list of Tokens
    """
    hashingTF = HashingTF(inputCol="tokens", outputCol="features", numFeatures=280)
    features_df = hashingTF.transform(preprocessed_df)
    # Return the final vectorized DataFrame
    return features_df
def train(spark):
    sc = spark.sparkContext
    tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
    hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=8000)
    idf = IDF(inputCol="rawFeatures", outputCol="features")
    srcdf = sc.textFile('part.csv').map(parse_line)
    srcdf = srcdf.toDF()
    training, testing = srcdf.randomSplit([0.9, 0.1])

    wordsData = tokenizer.transform(training)
    featurizedData = hashingTF.transform(wordsData)
    idfModel = idf.fit(featurizedData)
    rescaledData = idfModel.transform(featurizedData)
    rescaledData.persist()
    trainDF = rescaledData.select("features", "label").rdd.map(
        lambda x: Row(label=float(x['label']), features=Vectors.dense(x['features']))).toDF()
    naivebayes = NaiveBayes()
    model = naivebayes.fit(trainDF)

    testWordsData = tokenizer.transform(testing)
    testFeaturizedData = hashingTF.transform(testWordsData)
    # Reuse the IDF model fitted on the training split rather than
    # refitting it on the test split.
    testRescaledData = idfModel.transform(testFeaturizedData)
    testRescaledData.persist()
    testDF = testRescaledData.select("features", "label").rdd.map(
        lambda x: Row(label=float(x['label']), features=Vectors.dense(x['features']))).toDF()
    predictions = model.transform(testDF)
    predictions.show()
    evaluator = MulticlassClassificationEvaluator(labelCol="label",
                                                  predictionCol="prediction",
                                                  metricName="accuracy")
    accuracy = evaluator.evaluate(predictions)
    print("The accuracy on test-set is " + str(accuracy))
    model.save('Bayes20000')
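# train() persists only the classifier. To let a separate test/predict job
# reuse the training-time IDF weights (instead of refitting, as the companion
# test() above does), the fitted IDFModel can be saved and reloaded as well.
# A sketch; the path is illustrative:
from pyspark.ml.feature import IDFModel

idfModel.save('Bayes20000_idf')                 # alongside model.save('Bayes20000')
reloaded_idf = IDFModel.load('Bayes20000_idf')  # then: reloaded_idf.transform(testFeaturizedData)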
def nlpTransform(data):
    tokenizer = Tokenizer(inputCol="combi_text", outputCol="words")
    wordsData = tokenizer.transform(data)
    hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures")
    featurizedData = hashingTF.transform(wordsData)
    scaler = StandardScaler(inputCol="rawFeatures", outputCol="features",
                            withStd=True, withMean=False)
    featureData = scaler.fit(featurizedData)
    featureD = featureData.transform(featurizedData)
    return featureD
def test_apply_binary_term_freqs(self):
    df = self.spark.createDataFrame([(0, ["a", "a", "b", "c", "c", "c"])], ["id", "words"])
    n = 10
    hashingTF = HashingTF()
    hashingTF.setInputCol("words").setOutputCol("features").setNumFeatures(n).setBinary(True)
    output = hashingTF.transform(df)
    features = output.select("features").first().features.toArray()
    expected = Vectors.dense([1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]).toArray()
    for i in range(0, n):
        self.assertAlmostEqual(features[i], expected[i], 14,
                               "Error at " + str(i) + ": expected " + str(expected[i]) +
                               ", got " + str(features[i]))
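# A quick comparison of binary vs. count mode, under the same assumptions as
# the test above plus an active SparkSession named `spark`: with
# setBinary(True), repeated terms saturate at 1.0 instead of being counted.
df = spark.createDataFrame([(0, ["a", "a", "b", "c", "c", "c"])], ["id", "words"])
counting = HashingTF(inputCol="words", outputCol="features", numFeatures=10)
print(counting.transform(df).first().features)  # counts 2.0/1.0/3.0 (bucket positions depend on the hash)
binary = counting.copy().setBinary(True)
print(binary.transform(df).first().features)    # the same buckets, each clamped to 1.0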
def classify_tweets(inbound_dataset):
    # Run the cleansing UDF on the tweet column
    udf_cleansing = functions.udf(cleansing)
    inbound_dataset = inbound_dataset.withColumn(
        "tweet_cleansed", udf_cleansing(functions.col("tweet")))

    # Tokenizing
    from pyspark.ml.feature import Tokenizer
    tokenizer = Tokenizer(inputCol="tweet_cleansed", outputCol="words")
    inbound_dataset = tokenizer.transform(inbound_dataset)

    # Generating features
    from pyspark.ml.feature import HashingTF
    features_generator = HashingTF(inputCol="words", outputCol="features")
    inbound_dataset = features_generator.transform(inbound_dataset)

    model_folder = os.path.join(os.getcwd(), "saved_models")
    model_full_path = os.path.join(model_folder, "twitter_sentiment_spark")
    if not os.path.exists(model_folder):
        print("model does not exist")

    from pyspark.ml.classification import NaiveBayesModel
    loaded_model = NaiveBayesModel.load(model_full_path)

    # Classifying using the saved model
    classified = loaded_model.transform(inbound_dataset)

    spark = getSparkSessionInstance(inbound_dataset.rdd.context.getConf())
    if files_source == "hdfs":
        labels = spark.read.load(os.path.join("file://" + model_folder, "labels.csv"),
                                 format="csv", header=True)
    else:
        labels = spark.read.load(os.path.join(model_folder, "labels.csv"),
                                 format="csv", header=True)
    classified = classified.join(labels, classified["NB_pred"] == labels["label_id"])

    udf_get_probability = functions.udf(get_probability)
    classified = classified.withColumn(
        "probability",
        udf_get_probability(functions.col("NB_prob"), functions.col("NB_pred")))
    classified = classified.withColumn(
        "label_predicted",
        functions.when(classified.probability < probability_threshold, "2")
                 .otherwise(classified.label_predicted))
    return classified
def termFrequency(table):
    # calculates the term frequency of attributes
    hashingTF = HashingTF(inputCol='key_words', outputCol='hashing')
    tf = hashingTF.transform(table)
    tf.cache()
    # normalises the term-frequency data
    normalizer = Normalizer(inputCol='hashing', outputCol='norm')
    term = normalizer.transform(tf)
    return term
def predictLabel(label, title, model):
    """Predict the label of a news item"""
    sentenceData = sqlContext.createDataFrame([
        (label, title),
    ], ['label', "title"])
    tokenizer = Tokenizer(inputCol="title", outputCol="words")
    wordsData = tokenizer.transform(sentenceData)
    hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=20)
    featurizedData = hashingTF.transform(wordsData)
    rescaledData = idfModel.transform(featurizedData)
    myprediction = model.transform(rescaledData)
    return myprediction
def create_features(raw_data):
    # Create DataFrame
    data_df = sqlContext.createDataFrame(
        raw_data.map(lambda r: Row(appid=r[0], price=r[1], sentence=r[2])))
    # Transform sentences into words
    tokenizer = Tokenizer(inputCol='sentence', outputCol='words')
    words_df = tokenizer.transform(data_df)
    # Calculate term frequency
    hashingTF = HashingTF(inputCol='words', outputCol='rawFeatures', numFeatures=5)
    featurized_df = hashingTF.transform(words_df)
    # Calculate inverse document frequency
    idf = IDF(inputCol='rawFeatures', outputCol='features')
    idfModel = idf.fit(featurized_df)
    return idfModel.transform(featurized_df)
def tf_feature_vectorizer(df, no_of_features, ip_col):
    # from pyspark.sql.functions import udf
    # from pyspark.sql.types import *
    output_raw_col = ip_col + "raw_features"
    output_col = ip_col + "features"
    hashingTF = HashingTF(inputCol=ip_col, outputCol=output_raw_col, numFeatures=no_of_features)
    featurizedData = hashingTF.transform(df)
    idf = IDF(inputCol=output_raw_col, outputCol=output_col)
    idfModel = idf.fit(featurizedData)
    rescaled_data = idfModel.transform(featurizedData)
    rescaled_data.show(5)
    print(rescaled_data.count())
    return rescaled_data
def makeTFIDF(sc, spark, reviews):
    # count vectorizer and tfidf
    # cv = CountVectorizer(inputCol='words_clean', outputCol='tf')
    # cvModel = cv.fit(reviews)
    # reviews = cvModel.transform(reviews)

    # HashingTF for fewer dimensions:
    hashingtf = HashingTF(inputCol='words_clean', outputCol='tf', numFeatures=1000)
    reviews = hashingtf.transform(reviews)

    # create TF-IDF matrix
    idf = IDF().setInputCol('tf').setOutputCol('tfidf')
    tfidfModel = idf.fit(reviews)
    reviews = tfidfModel.transform(reviews)
    # return the TF-IDF-transformed reviews
    return reviews
def append_tf_idf(self, df):
    """
    Calculate term frequency and inverse document frequency
    based on at least 1 visit hourly in this case. Compares how often the
    tokens appeared at least once per hour compared to other tokens.
    Not used for the main purpose of the project.

    Args:
        :param df: Dataframe parameter.

    Returns:
        :return: Dataframe with term frequency and inverse document frequency
        added in the columns 'rawFeatures' and 'features' respectively.
    """
    # Create TF column.
    hashingTF = HashingTF(inputCol="tokens", outputCol="rawFeatures", numFeatures=100000)
    tf = hashingTF.transform(df)
    tf.persist(StorageLevel.MEMORY_AND_DISK)
    # Create IDF column.
    idf = IDF(inputCol="rawFeatures", outputCol="features")
    idfModel = idf.fit(tf)
    tfidf = idfModel.transform(tf)
    return tfidf
from pyspark.sql import SparkSession
from pyspark.ml.feature import HashingTF, IDF, Tokenizer

spark = SparkSession \
    .builder \
    .appName("tf_idf_sample") \
    .master("local[*]") \
    .getOrCreate()

df1 = spark.createDataFrame([
    (0, "a a a b b c"),
    (0, "a b c"),
    (1, "a c a a d")]).toDF("label", "sentence")

# Split each sentence into words
tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
df2 = tokenizer.transform(df1)

hashingTF = HashingTF(inputCol="words", outputCol="TF-Features", numFeatures=20)
df3 = hashingTF.transform(df2)
df3.cache()

idf = IDF(inputCol="TF-Features", outputCol="Final-Features")
idfModel = idf.fit(df3)
rescaledData = idfModel.transform(df3)
rescaledData.select("words", "TF-Features", "Final-Features").show()

spark.stop()
data = pd.read_csv("sms_spam.csv") #print(data.head(5)) ##creating rdd file sc = SparkContext("local", "app") sqc = SQLContext(sc) df = sqc.createDataFrame(data, ['type', 'text']) #NEW VARIABLE GENERATION dataCleaned = df.map(lambda x: (1 if x['type'] == 'spam' else 0, tokenize(x['text']))) dataClean = dataCleaned.map(lambda x: (float(x[0]), x[1])) dfClean = sqc.createDataFrame(dataClean, ['label', 'words']) dfClean.show(5) hashingTF = HashingTF(inputCol="words", outputCol="rawtf-idf", numFeatures=1000) tf = hashingTF.transform(dfClean) idf = IDF(inputCol="rawtf-idf", outputCol="features").fit(tf) dfFinal = idf.transform(tf) # Fit on whole dataset to include all labels in index. labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(dfFinal) # Automatically identify categorical features, and index them. # Set maxCategories so features with > 4 distinct values are treated as continuous. featureIndexer = VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4).fit(dfFinal) # Split the data into training and test sets (20% held out for testing) (trainingData, testData) = dfFinal.randomSplit([0.8, 0.2]) # Train the model. #rf = RandomForestClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures")
def review_to_words(raw_review):
    # 1. Strip HTML markup
    review_text = BeautifulSoup(raw_review).text
    # 2. Remove non-letters
    letters_only = re.sub("[^a-zA-Z]", " ", review_text)
    # 3. Convert to lower case, split into individual words
    words = letters_only.lower().split()
    # 4. Remove stop words
    meaningful_words = [w for w in words if not w in stops]
    # 5. Join the words back into one string separated by space,
    #    and return the result.
    return " ".join(meaningful_words)

stops = set(stopwords.words("english"))
lines = sc.textFile("s3://spark-project-data/labeledTrainData.tsv")
rows = lines.zipWithIndex().filter(lambda (row, index): index > 0).keys()
parts = rows.map(lambda l: l.split("\t"))
review = parts.map(lambda p: Row(id=p[0], label=float(p[1]), review=review_to_words(p[2])))
schemeReview = sqlContext.createDataFrame(review)
tokenizer = Tokenizer(inputCol="review", outputCol="words")
wordsData = tokenizer.transform(schemeReview)
hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=300)
featurizedData = hashingTF.transform(wordsData)
idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)
selectData = rescaledData.select("label", "features")
def main(sc, sqlContext):
    start = timer()

    stpwrds = stopwords.words('english')
    tbl_translate = dict.fromkeys(i for i in xrange(sys.maxunicode)
                                  if unicodedata.category(unichr(i)).startswith('S')
                                  or unicodedata.category(unichr(i)).startswith('P')
                                  or unicodedata.category(unichr(i)).startswith('N'))

    print '---Fetching products---'
    start_i = timer()
    productRDD = sc.parallelize(findProductsByCategory([]))
    print '####took %d seconds' % (timer()-start_i)

    print '---Building the corpus---'
    start_i = timer()
    corpusRDD = (productRDD.map(lambda s: (s[0], word_tokenize(s[1].translate(tbl_translate).lower()), s[2], s[3]))
                           .map(lambda s: (s[0], [PorterStemmer().stem(x) for x in s[1] if x not in stpwrds], s[2], s[3]))
                           .map(lambda s: (s[0], [x[0] for x in pos_tag(s[1]) if x[1] == 'NN' or x[1] == 'NNP'], s[2], s[3]))
                           .cache())
    print '####took %d seconds' % (timer()-start_i)

    print '---Fetching and persisting category and token data---'
    start_i = timer()
    tokens = corpusRDD.flatMap(lambda x: x[1]).distinct().collect()
    numTokens = len(tokens)
    category = productRDD.map(lambda x: x[2]).distinct().collect()
    categoryAndSubcategory = productRDD.map(lambda x: (x[2], x[3])).distinct().collect()
    insertTokensAndCategories(tokens, category, categoryAndSubcategory)
    print '####took %d seconds' % (timer()-start_i)

    print '---Computing TF-IDF for the products---'
    start_i = timer()
    # persist this so it does not have to be recomputed at prediction time
    wordsData = corpusRDD.map(lambda s: Row(label=s[0], words=s[1], category=s[2], subcategory=s[3]))
    wordsDataDF = sqlContext.createDataFrame(wordsData)
    # persisting for prediction
    wordsDataForPrediction = corpusRDD.map(lambda s: Row(label=s[0], words=s[1], type=s[2]))
    wordsDataForPredictionDF = sqlContext.createDataFrame(wordsDataForPrediction)

    if os.path.exists("/home/ubuntu/recsys-tcc-ml/parquet/wordsDataDF.parquet"):
        shutil.rmtree("/home/ubuntu/recsys-tcc-ml/parquet/wordsDataDF.parquet")
    wordsDataForPredictionDF.write.parquet("/home/ubuntu/recsys-tcc-ml/parquet/wordsDataDF.parquet")

    hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=numTokens)
    idf = IDF(inputCol="rawFeatures", outputCol="features")
    featurizedData = hashingTF.transform(wordsDataDF)
    idfModel = idf.fit(featurizedData)
    rescaledData = idfModel.transform(featurizedData)

    # VSM = rescaledData.map(lambda t: LabeledPoint(categoryAndSubcategory.index((t.category, t.subcategory)), t.features))
    VSM = rescaledData.map(lambda t: LabeledPoint(category.index(t.category), t.features))
    VSMTrain, VSMTest = VSM.randomSplit([8, 2], seed=0L)
    print '####took %d seconds' % (timer()-start_i)

    print '---Building the Naive Bayes model---'
    start_i = timer()
    model = NaiveBayes.train(VSMTrain)

    if os.path.exists("/home/ubuntu/recsys-tcc-ml/models/naivebayes/modelo_categoria"):
        shutil.rmtree("/home/ubuntu/recsys-tcc-ml/models/naivebayes/modelo_categoria")
    model.save(sc, '/home/ubuntu/recsys-tcc-ml/models/naivebayes/modelo_categoria')
    print '####took %d seconds' % (timer()-start_i)

    print '---Testing the Naive Bayes model---'
    start_i = timer()
    # The model was trained on `category` indices, so map predictions back
    # through `category` as well.
    prediction = VSMTest.map(lambda p: (category[int(model.predict(p.features))], category[int(p.label)]))
    accuracy = float(prediction.filter(lambda (x, v): x == v).count()) / float(prediction.count())
    print 'accuracy of %f' % accuracy
    print '####took %d seconds' % (timer()-start_i)

    print '---Fetching the posts---'
    start_i = timer()
    posts = list()
    wb = load_workbook(filename='/home/ubuntu/recsys-tcc-ml/base_sentimentos.xlsx')
    sheet = wb['Menes']
    for row in sheet.iter_rows(row_offset=1):
        post = list()
        for cell in row:
            if cell.value is None:
                break
            post.append(1 if cell.value == 'Positive' or cell.value == 'Neutral'
                        else 0 if cell.value == 'Negative'
                        else removeAccents(cell.value))
        if len(post) > 0:
            posts.append(tuple(post))
    print '####took %d seconds' % (timer()-start_i)

    print '---Building the corpus---'
    start_i = timer()
    postsRDD = sc.parallelize(posts)
    postCorpusRDD = (postsRDD.map(lambda s: (s[1], word_tokenize(s[0].translate(tbl_translate).lower())))
                             .map(lambda s: (s[0], [PorterStemmer().stem(x) for x in s[1] if x not in stpwrds]))
                             .map(lambda s: (s[0], [x[0] for x in pos_tag(s[1]) if x[1] == 'NN' or x[1] == 'NNP']))
                             .cache())
    print '####took %d seconds' % (timer()-start_i)

    print '---Computing TF-IDF for the posts---'
    start_i = timer()
    wordsData = postCorpusRDD.map(lambda s: Row(label=s[0], words=s[1]))
    wordsDataDF = sqlContext.createDataFrame(wordsData)

    hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=numTokens)
    idf = IDF(inputCol="rawFeatures", outputCol="features")
    featurizedData = hashingTF.transform(wordsDataDF)
    idfModel = idf.fit(featurizedData)
    rescaledData = idfModel.transform(featurizedData)

    VSM = rescaledData.map(lambda t: LabeledPoint(t.label, t.features))
    VSMTrain, VSMTest = VSM.randomSplit([8, 2], seed=0L)
    print '####took %d seconds' % (timer()-start_i)

    print '---Building the SVM model---'
    start_i = timer()
    model = SVMWithSGD.train(VSMTrain, iterations=100)

    if os.path.exists("/home/ubuntu/recsys-tcc-ml/models/svm"):
        shutil.rmtree("/home/ubuntu/recsys-tcc-ml/models/svm")
    model.save(sc, "/home/ubuntu/recsys-tcc-ml/models/svm")

    print '---Testing the SVM model---'
    start_i = timer()
    prediction = VSMTest.map(lambda p: (p.label, model.predict(p.features)))
    # The filter counts mismatches, so this is an error rate, not an accuracy.
    error_rate = prediction.filter(lambda (v, p): v != p).count() / float(prediction.count())
    print 'error rate of %f' % error_rate
    print '####took %d seconds' % (timer()-start_i)

    print 'The whole process took %d seconds' % (timer()-start)
from pyspark.sql import Row
from pyspark.ml.feature import HashingTF, IDF, Tokenizer

df = spark.read.load('/home/manh/Documents/data/result_pre.parquet')
df = df.select('id', 'stemmed')
rdd = df.select('stemmed').rdd

# Document frequency per word: count one occurrence per document.
pre_idf = rdd.map(lambda x: set(x[0])) \
             .flatMap(lambda x: x) \
             .map(lambda x: (x, 1)) \
             .reduceByKey(lambda x, y: x + y)
pre_idf_collect = pre_idf.collect()

rdd_words = pre_idf.map(lambda x: Row(word=[x[0]]))
df_words = spark.createDataFrame(rdd_words)

hashingTF = HashingTF(inputCol="word", outputCol="rawFeatures", numFeatures=100000)
featurizedData = hashingTF.transform(df_words)
# Map each word to its hash bucket index.
featurizedData.rdd.map(lambda x: (x.word[0], x['rawFeatures'].indices[0])) \
                  .map(lambda x: '%s %s' % (x)) \
                  .collect()
(20,"Apple iPhone Apple iPhone 6 16GB 412 2 cell 2895"), (20,"iPhone 6 T Mobile 16 GB"), (20,"Apple 6 16gb T Mobile") ], ["label","text"]) # Learn a mapping from words to Vectors. #word2Vec = Word2Vec(vectorSize=3, minCount=0, inputCol="text", outputCol="textVec") #model = word2Vec.fit(documentDF) #result = model.transform(documentDF) #print result.take(2) tokenizer = Tokenizer(inputCol="text", outputCol="tokenizedText") tokenizedTextData = tokenizer.transform(documentDF) hashingTF = HashingTF(inputCol="tokenizedText", outputCol="rawFeatures") featurizedData = hashingTF.transform(tokenizedTextData) idf = IDF(inputCol="rawFeatures", outputCol="features") idfModel = idf.fit(featurizedData) result1 = idfModel.transform(featurizedData) for features_label in result.select("label","pcaFeatures").take(10): print(features_label) wordsvectors = result["label","features"].map(lambda row: LabeledPoint(row[0], row[1]))
# `f` (the training Rows RDD) is assumed to have been built earlier in this script.
linest = sc.textFile("/Users/admin/Desktop/KBSApp/KBSApp/permissionsData/dataSets/SVMDataGroundTruth.txt")
partst = linest.map(lambda l: l.split(","))
ft = partst.map(lambda p: Row(tindex=int(p[0]), packageName=p[1], packagePermissions=p[2],
                              label=int(float(p[3])), training=0))
alldata = f.union(ft)
schemaApp = sqlContext.createDataFrame(alldata)
schemaApp.registerTempTable("data")

tokenizer = Tokenizer(inputCol="packagePermissions", outputCol="perms")
permsData = tokenizer.transform(schemaApp)
hashingTF = HashingTF(inputCol="perms", outputCol="rawFeatures")
featurizedData = hashingTF.transform(permsData)
idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)

wordsvectors = rescaledData["label", "features"].map(lambda row: LabeledPoint(row[0], row[1]))
model = LogisticRegressionWithLBFGS.train(wordsvectors, iterations=100)
labelsAndPreds = wordsvectors.map(lambda p: (p.label, model.predict(p.features)))
trainErr = labelsAndPreds.filter(lambda (v, p): v != p).count() / float(wordsvectors.count())
print("Training Error = " + str(trainErr))
from pyspark.ml.feature import RegexTokenizer, HashingTF
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes
from pyspark.mllib.tree import RandomForest

## Load Dataset
df_pandas = pd.read_csv('sample.csv')

## Convert to Spark Dataframe
sqlContext = SQLContext(sc)
df = sqlContext.createDataFrame(df_pandas)

## Tokenizer and Hashing
tokenizer = RegexTokenizer(inputCol="text", outputCol="words")
hashingTF = HashingTF(numFeatures=10000, inputCol="words", outputCol="features")
df_feat = hashingTF.transform(tokenizer.transform(df))

## Create LabeledPoint and Features for Prediction (predict the 1s observations)
lp = df_feat.map(lambda x: LabeledPoint(x.label, x.features))
predict_feat = df_feat.where(df_feat.label == 1).map(lambda x: x.features)

## Compare predictions from Different Models
## Logistic Regression
lrm = LogisticRegressionWithSGD.train(lp, iterations=10)
logit_predict = lrm.predict(predict_feat)
logit_predict.sum()  # 9112
# COMMAND ----------

from pyspark.ml.feature import HashingTF, IDF

tf = HashingTF()\
    .setInputCol("DescOut")\
    .setOutputCol("TFOut")\
    .setNumFeatures(10000)
idf = IDF()\
    .setInputCol("TFOut")\
    .setOutputCol("IDFOut")\
    .setMinDocFreq(2)

# COMMAND ----------

idf.fit(tf.transform(tfIdfIn)).transform(tf.transform(tfIdfIn)).show(10, False)

# COMMAND ----------

from pyspark.ml.feature import Word2Vec

# Input data: Each row is a bag of words from a sentence or document.
documentDF = spark.createDataFrame([
    ("Hi I heard about Spark".split(" "), ),
    ("I wish Java could use case classes".split(" "), ),
    ("Logistic regression models are neat".split(" "), )
], ["text"])

# Learn a mapping from words to Vectors.
word2Vec = Word2Vec(vectorSize=3, minCount=0, inputCol="text", outputCol="result")
model = word2Vec.fit(documentDF)
def main(sc, sqlContext):
    # start = timer()

    # print '---Fetching user, posts, tokens and categories from MongoDB---'
    # start_i = timer()
    user = findUserById(iduser)
    posts = findPosts(user)

    tokens, category, categoryAndSubcategory = getTokensAndCategories()
    postsRDD = (sc.parallelize(posts)
                  .map(lambda s: (s[0], word_tokenize(s[1].lower()), s[2], s[3]))
                  .map(lambda p: (p[0], [x for x in p[1] if x in tokens], p[2], p[3]))
                  .cache())
    # print '####took %d seconds' % (timer() - start_i)

    # print '---Fetching products from MongoDB---'
    # start_i = timer()
    # print '####took %d seconds' % (timer() - start_i)

    # print '---Building the corpusRDD---'
    # start_i = timer()
    stpwrds = stopwords.words('portuguese')
    corpusRDD = (postsRDD.map(lambda s: (s[0], [PorterStemmer().stem(x) for x in s[1] if x not in stpwrds], s[2], s[3]))
                         .filter(lambda x: len(x[1]) >= 20 or (x[2] == u'Post' and len(x[1]) > 0))
                         .cache())
    # print '####took %d seconds' % (timer() - start_i)

    # print '---Computing TF-IDF---'
    # start_i = timer()
    wordsData = corpusRDD.map(lambda s: Row(label=int(s[0]), words=s[1], type=s[2]))
    wordsDataDF = sqlContext.createDataFrame(wordsData).unionAll(
        sqlContext.read.parquet("/home/ubuntu/recsys-tcc-ml/parquet/wordsDataDF.parquet"))
    numTokens = len(tokens)
    hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=numTokens)
    idf = IDF(inputCol="rawFeatures", outputCol="features")
    featurizedData = hashingTF.transform(wordsDataDF)
    idfModel = idf.fit(featurizedData)
    tfIDF = idfModel.transform(featurizedData).cache()

    postTFIDF = (tfIDF
                 .filter(tfIDF.type == u'Post')
                 # .map(lambda s: Row(label=s[0], type=s[1], words=s[2], rawFeatures=s[3], features=s[4], sentiment=SVM.predict(s[4])))
                 .cache())
    # postTFIDF = postTFIDF.filter(lambda p: p.sentiment == 1)
    # print '####took %d seconds' % (timer() - start_i)

    # print '---Loading the models---'
    # start_i = timer()
    NB = NaiveBayesModel.load(sc, '/home/ubuntu/recsys-tcc-ml/models/naivebayes/modelo_categoria')
    SVM = SVMModel.load(sc, "/home/ubuntu/recsys-tcc-ml/models/svm")
    # print '####took %d seconds' % (timer() - start_i)

    # print '---Applying the models---'
    # start_i = timer()
    predictions = (postTFIDF
                   .map(lambda p: (NB.predict(p.features), p[0], SVM.predict(p.features)))
                   .filter(lambda p: p[2] == 1)
                   .map(lambda p: (p[0], p[1]))
                   .groupByKey()
                   .mapValues(list)
                   .collect())
    # print '####took %d seconds' % (timer() - start_i)

    # print '---Computing similarities---'
    # start_i = timer()
    suggestions = []

    for prediction in predictions:
        category_to_use = category[int(prediction[0])]
        # print ' Computing similarities for category: {}'.format(category_to_use)
        tf = tfIDF.filter(tfIDF.type == category_to_use).cache()
        for post in prediction[1]:
            postVector = postTFIDF.filter(postTFIDF.label == post).map(lambda x: x.features).collect()[0]
            sim = (tf
                   .map(lambda x: (post, x.label, cossine(x.features, postVector)))
                   .filter(lambda x: x[2] >= threshold)
                   .collect())
            if len(sim) > 0:
                suggestions.append(sim)
    # print '####took %d seconds' % (timer() - start_i)

    if len(suggestions) > 0:
        # print '---Inserting recommendations into MongoDB---'
        # start_i = timer()
        insertSuggestions(suggestions, iduser, posts)
# Feature vectors: apparently this is what works best.
from pyspark.ml.feature import Normalizer
normalizerUni = Normalizer(inputCol='words', outputCol='normWords', p=2.0)
normalizerBi = Normalizer(inputCol="bigrams", outputCol='normBigrams', p=2.0)
dfNorm = normalizerUni.transform(dfVect2)
dfNorm2 = normalizerBi.transform(dfNorm)
print "DataFrame (bi-grams): normalized"
dfNorm2.select('words', 'normWords').show()
# The difference does not show up in the table, since there is only room to
# display the indices of the non-zero elements, not their values.

# On to TF-IDF. Of course, by picking the right DataFrame among those above,
# these computations can be applied to any column (bigrams, with or without
# stop words, ...).
from pyspark.ml.feature import HashingTF
htf = HashingTF(inputCol='words', outputCol='wordsTF', numFeatures=10000)
dfTrainTF = htf.transform(dfTrainTokNoSw)

# Inverse document frequency
from pyspark.ml.feature import IDF
idf = IDF(inputCol=htf.getOutputCol(), outputCol="wordsTFIDF")
idfModel = idf.fit(dfTrainTF)
dfTrainTFIDF = idfModel.transform(dfTrainTF)
dfTrainTFIDF.select('review', 'wordsTF', 'wordsTFIDF').show()

# I know this step was useful to me once; here it does not seem to matter much.
from pyspark.ml.feature import StringIndexer
string_indexer = StringIndexer(inputCol='label', outputCol='target_indexed')
string_indexer_model = string_indexer.fit(dfTrainTFIDF)
dfTrainFinal = string_indexer_model.transform(dfTrainTFIDF)
dfTrainFinal.select('review', 'label', 'target_indexed').show()