def vectorizeCV(DF):
    # Alternative setting: cv = CountVectorizer(minDF=.0001, inputCol="raw", outputCol="features", binary=True)
    cv = CountVectorizer(minDF=1, inputCol="raw", outputCol="features", binary=True)
    model = cv.fit(DF)
    result = model.transform(DF)
    return result, model

def pre_processing(df):
    # fit a CountVectorizerModel from the corpus.
    cv = CountVectorizer(inputCol="words", outputCol="features", vocabSize=3, minDF=2.0)
    model = cv.fit(df)
    result = model.transform(df)
    result.show(truncate=False)

def functions_for_deal_with_texts_3(spark, resources_folder):
    df = spark.createDataFrame([(0, "a b c".split(" ")),
                                (1, "a b b c a".split(" "))],
                               ["id", "words"])
    df.show()
    cv = CountVectorizer(inputCol='words', outputCol='features', vocabSize=3, minDF=2.0)
    model = cv.fit(df)
    result = model.transform(df)
    result.show(truncate=False)

def convertToVec(df, sc, ss, outputName, inputCol='tokens'):
    cv = CountVectorizer(inputCol=inputCol, outputCol='vectors', minTF=1.0)
    vecModel = cv.fit(df)
    print('\n\n\n Get Vocab... \n\n\n')
    inv_voc = vecModel.vocabulary
    # write the learned vocabulary to a text file, one term per line
    f = codecs.open(outputName + '_vocab.txt', encoding='utf-8', mode='w')
    for item in inv_voc:
        f.write(u'{0}\n'.format(item))
    f.close()
    vectors = vecModel.transform(df).select('id', 'subreddit', 'vectors')
    return vectors

def sparsify(ngrams_df, model):
    if model is None:
        # TASK 6a: Binary CountVectorizer
        cv = CountVectorizer(minDF=10, binary=True, inputCol="split_ngrams", outputCol="sparse_vector")
        model = cv.fit(ngrams_df)
    sparsified = model.transform(ngrams_df)
    return model, sparsified

def main():
    for tn in tablenames:
        data = spark.read.format("org.apache.spark.sql.cassandra") \
            .options(table=tn, keyspace=keyspace).load().limit(1000)
        data = data.sort('imdb_score', ascending=False)
        desc = data.rdd.map(lambda x: x['description']).filter(lambda x: x is not None)
        StopWords = nltk.corpus.stopwords.words('english')
        StopWords.extend([" ... See full summary"])
        tokenized = desc.map(lambda y: y.strip().lower()) \
            .map(lambda x: re.split(" ", x)) \
            .map(lambda word: [x for x in word if x.isalpha()]) \
            .map(lambda word: [x for x in word if len(x) > 3]) \
            .map(lambda word: [x for x in word if x not in StopWords]) \
            .zipWithIndex()
        df_txts = spark.createDataFrame(tokenized, ["words", 'index'])
        countVec = CountVectorizer(inputCol="words", outputCol="raw_features", vocabSize=5000, minDF=10.0)
        CountVectMod = countVec.fit(df_txts)
        result = CountVectMod.transform(df_txts)
        idf = IDF(inputCol="raw_features", outputCol="features")
        idfModel = idf.fit(result)
        resultTFIdf = idfModel.transform(result)
        totalTopics = 10
        totalItr = 100
        LDAModel = MLlibLDA.train(
            resultTFIdf.select('index', 'features').rdd.mapValues(MLlibVectors.fromML).map(list),
            k=totalTopics, maxIterations=totalItr)
        maxwordsTopic = 5
        topicIndices = sc.parallelize(LDAModel.describeTopics(maxTermsPerTopic=5))
        VCarr = CountVectMod.vocabulary

        def finalTopic(topic):
            terms = topic[0]
            result = []
            for i in range(maxwordsTopic):
                term = VCarr[terms[i]]
                result.append(term)
            return result

        topics_final = topicIndices.map(lambda topic: finalTopic(topic)).collect()
        print(topics_final)
        for topic in range(len(topics_final)):
            print("Topic" + str(topic) + ":")
            for term in topics_final[topic]:
                print(term)
            print('\n')

def calTFIDF(self, dataset, colName):
    cv = CountVectorizer(inputCol=colName, outputCol="rawFeatures")
    cvmodel = cv.fit(dataset)
    featurizedData = cvmodel.transform(dataset)
    vocab = cvmodel.vocabulary
    vocab_broadcast = sparkTest.sparkContext.broadcast(vocab)
    idf = IDF(inputCol="rawFeatures", outputCol="features")
    idfModel = idf.fit(featurizedData)
    rescaledData = idfModel.transform(featurizedData)  # TFIDF
    return self.clusteredData(rescaledData, cvmodel)

def task6a_create_feature_vector(labeled_comments, countVecModel=None):
    # TASK 6A
    # Code for task 6A...
    if countVecModel is None:
        cv = CountVectorizer(inputCol="ngrams", outputCol="features", minDF=10.0, binary=True)
        my_countVecModel = cv.fit(labeled_comments)
    else:
        my_countVecModel = countVecModel
    result = my_countVecModel.transform(labeled_comments)
    return result, my_countVecModel

def create_dictionary(rdd, dict_length):
    from pyspark.ml.feature import CountVectorizer
    df = rdd.toDF(['text', 'rating'])
    filled_df = df.na.fill(0)
    cv = CountVectorizer(inputCol="text", outputCol="vectors", vocabSize=dict_length)
    model_cv = cv.fit(filled_df)
    dictionary = {k: v + DICTIONARY_OFFSET for v, k in enumerate(model_cv.vocabulary)}
    return dictionary

def test_count_vectorizer_with_binary(self):
    dataset = self.spark.createDataFrame([
        (0, "a a a b b c".split(' '), SparseVector(3, {0: 1.0, 1: 1.0, 2: 1.0}),),
        (1, "a a".split(' '), SparseVector(3, {0: 1.0}),),
        (2, "a b".split(' '), SparseVector(3, {0: 1.0, 1: 1.0}),),
        (3, "c".split(' '), SparseVector(3, {2: 1.0}),)], ["id", "words", "expected"])
    cv = CountVectorizer(binary=True, inputCol="words", outputCol="features")
    model = cv.fit(dataset)
    transformedList = model.transform(dataset).select("features", "expected").collect()
    for r in transformedList:
        feature, expected = r
        self.assertEqual(feature, expected)

def train(labeled_df):
    '''train to get pos and neg models'''
    cv = CountVectorizer(inputCol="ngrams_combined", binary=True, outputCol="features", minDF=10.0)
    cvModel = cv.fit(labeled_df)
    labeled_df = cvModel.transform(labeled_df)
    cvModel.save("cvModel")
    poslr = LogisticRegression(labelCol="poslabel", featuresCol="features", maxIter=10)
    neglr = LogisticRegression(labelCol="neglabel", featuresCol="features", maxIter=10)
    # This is a binary classifier, so we need an evaluator that knows how to deal with binary classifiers.
    posEvaluator = BinaryClassificationEvaluator(labelCol="poslabel")
    negEvaluator = BinaryClassificationEvaluator(labelCol="neglabel")
    # There are a few parameters associated with logistic regression. We do not know what they are a priori.
    # We do a grid search to find the best parameters. We can replace [1.0] with a list of values to try.
    # We will assume the parameter is 1.0. Grid search takes forever.
    posParamGrid = ParamGridBuilder().addGrid(poslr.regParam, [1.0]).build()
    negParamGrid = ParamGridBuilder().addGrid(neglr.regParam, [1.0]).build()
    # We initialize a 5-fold cross-validation pipeline.
    posCrossval = CrossValidator(estimator=poslr,
                                 evaluator=posEvaluator,
                                 estimatorParamMaps=posParamGrid,
                                 numFolds=5)
    negCrossval = CrossValidator(estimator=neglr,
                                 evaluator=negEvaluator,
                                 estimatorParamMaps=negParamGrid,
                                 numFolds=5)
    # Although cross-validation creates its own train/test sets for tuning, we still
    # need a labeled test set, because it is not accessible from the cross-validator (argh!).
    # Split the data 50/50.
    pos = labeled_df
    neg = labeled_df
    posTrain, posTest = pos.randomSplit([0.5, 0.5])
    negTrain, negTest = neg.randomSplit([0.5, 0.5])
    # Train the models
    print("Training positive classifier...")
    posModel = posCrossval.fit(posTrain)
    print("Training negative classifier...")
    negModel = negCrossval.fit(negTrain)
    # Once we train the models, we don't want to do it again. We can save the models and load them again later.
    posModel.save("pos.model")
    negModel.save("neg.model")
    return cvModel, posModel, negModel

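# Note on the grid search described in the comments above: the grid there fixes
# regParam at 1.0. A minimal sketch (hypothetical values, not part of the original
# code) of widening the search; every combination is cross-validated, so training
# is correspondingly slower:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.tuning import ParamGridBuilder

lr = LogisticRegression(labelCol="poslabel", featuresCol="features", maxIter=10)
param_grid = ParamGridBuilder() \
    .addGrid(lr.regParam, [0.01, 0.1, 1.0]) \
    .addGrid(lr.elasticNetParam, [0.0, 0.5]) \
    .build()  # 3 x 2 = 6 candidate models evaluated per fold
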
def count_vectorizer_usecase():
    spark = getSparkSession()
    df = spark.createDataFrame([(0, "a b".split(" ")),
                                (1, "a b b c a".split(" "))],
                               ["id", "words"])
    """
    vocabSize => maximum size of the vocabulary
    minDF => minimum number of documents a term must appear in
    """
    cv = CountVectorizer(inputCol="words", outputCol="features")
    model = cv.fit(df)
    result = model.transform(df)
    result.show(truncate=False)

def fit(self):
    sqlContext = SparkSession.builder.getOrCreate()
    if self.test:
        df = sqlContext.sql("select * from cmp_tmp_user_identification where dt='2014-01'")
    else:
        df = sqlContext.sql("select * from cmp_tmp_user_identification")
    if self.tweet and self.retweet:
        df = df.withColumn('content', F.concat('text', 'retweeted'))
    elif self.tweet:
        df = df.filter("retweeted==' '")
        df = df.withColumn('content', F.col('text'))
    elif self.retweet:
        df = df.filter('length(retweeted)>1')
        df = df.withColumn('content', F.col('retweeted'))
    df = df.withColumn('content', textCut(clean_text('content')))
    # remove stopwords
    remover = StopWordsRemover(inputCol="content", outputCol="words", stopWords=self.stopwords)
    df = remover.transform(df)
    # drop rows whose token list is empty
    df = df.filter('size(words)>0')
    self.sentence_length_distribution = df.selectExpr('size(words) as wz') \
        .groupBy('wz').count().toPandas().set_index('wz').sort_index()
    # vectorize
    cv = CountVectorizer(inputCol='words', outputCol='vertors', minDF=self.minDF, minTF=self.minTF)
    model_cv = cv.fit(df)
    word2bag = model_cv.vocabulary
    self.baglen = len(word2bag)
    self.dictionary = dict(zip(word2bag, ['W' + str(i) for i in range(1, self.baglen)]))
    sc = SparkContext.getOrCreate()
    diction = sc.broadcast(self.dictionary)
    # convert tokens to an English-like, space-separated format for GCN input
    df = df.withColumn('words_space', toSpaceSplit('words'))
    result_df = df.selectExpr('uid,label,identity,words_space'.split(','))
    # aggregate to user level
    result_df = result_df.groupBy('uid', 'label', 'identity').agg(
        F.collect_list('words_space').alias('uid_words'))
    result_df = result_df.withColumn('uid_words', concat_uid('uid_words'))
    return result_df

def vectorize_data(self):
    """
    Convert each list of tokens into vectors of token counts
    :return: vectors of token counts
    """
    columns = self.df.schema.names
    for column_name in columns:
        if "_words" in column_name:
            count = CountVectorizer(inputCol=column_name, outputCol=column_name + "_raw_features")
            model = count.fit(self.df)
            self.df = model.transform(self.df)
            self.df.drop(column_name).collect()

def vectorizeCV(fullDF, sampleDF, minDocFrec):
    cv = CountVectorizer(minDF=minDocFrec, inputCol="raw", outputCol="features", binary=True)
    # fit on the sample if one is provided, otherwise on the full DataFrame
    if sampleDF is None:
        model = cv.fit(fullDF)
    else:
        model = cv.fit(sampleDF)
    result = model.transform(fullDF)
    return result, model

def preprocess(self, df):
    # convert input dataframe to document.
    document_assembler = DocumentAssembler() \
        .setInputCol("headline_text") \
        .setOutputCol("document") \
        .setCleanupMode("shrink")
    # Split sentence to tokens (array)
    tokenizer = Tokenizer() \
        .setInputCols(["document"]) \
        .setOutputCol("token")
    # clean
    normalizer = Normalizer() \
        .setInputCols(["token"]) \
        .setOutputCol("normalized")
    # remove stopwords
    stopwords_cleaner = StopWordsCleaner() \
        .setInputCols("normalized") \
        .setOutputCol("cleanTokens") \
        .setCaseSensitive(False)
    # stem the words to bring them to the root form.
    stemmer = Stemmer() \
        .setInputCols(["cleanTokens"]) \
        .setOutputCol("stem")
    # bring back the expected structure, viz. array of tokens.
    finisher = Finisher() \
        .setInputCols(["stem"]) \
        .setOutputCols(["tokens"]) \
        .setOutputAsArray(True) \
        .setCleanAnnotations(False)
    # build preprocess pipeline
    preprocess_pipeline = Pipeline(
        stages=[document_assembler, tokenizer, normalizer, stopwords_cleaner, stemmer, finisher])
    # train the pipeline
    preprocess = preprocess_pipeline.fit(df)
    # apply the pipeline to transform dataframe.
    processed_df = preprocess.transform(df)
    # select the columns that we need
    tokens_df = processed_df.select('publish_date', 'tokens').limit(10000)
    cv = CountVectorizer(inputCol="tokens", outputCol="features", vocabSize=500, minDF=3.0)
    # train the model
    cv_model = cv.fit(tokens_df)
    # transform the data. Output column name will be features.
    vectorized_tokens = cv_model.transform(tokens_df)

def main(context):
    """Main function takes a Spark SQL context."""
    """Task1"""
    # commentsDF = sqlContext.read.json("comments-minimal.json.bz2")
    # submissionsDF = sqlContext.read.json("submissions.json.bz2")
    # commentsDF.write.parquet("comments.parquet")
    # submissionsDF.write.parquet("submissions.parquet")
    labeledDF = sqlContext.read.format("csv").options(
        header='true', inferschema='true').load("labeled_data.csv")
    commentsDF = sqlContext.read.parquet("comments.parquet")
    submissionsDF = sqlContext.read.parquet("submissions.parquet")

    """Task2"""
    # data = labeled_data.join(comments, comments("id")===labeled_data("Input_id"), "inner").select("id","body","labeldem","labelgop","labeldjt")
    commentsDF.createOrReplaceTempView("comments")
    labeledDF.createOrReplaceTempView("labeled_data")
    dataDF = sqlContext.sql(
        "SELECT id, body, labeldem, labelgop, labeldjt FROM comments INNER JOIN labeled_data ON comments.id = labeled_data.Input_id"
    )
    '''drop the temp view to save memory (RAM)'''

    """Task4"""
    dataDF.createOrReplaceTempView("data")
    sqlContext.udf.register("sanitize_udf", sanitize)
    dataDF = sqlContext.sql("SELECT *, sanitize_udf(body) AS ngrams FROM data")

    """Task5"""
    dataDF.createOrReplaceTempView("data")
    sqlContext.udf.register("select_udf", select)
    data = sqlContext.sql(
        "SELECT id, body, labeldem, labelgop, labeldjt, select_udf(ngrams) AS selected_ngrams FROM data"
    )
    # data = sqlContext.sql("SELECT id, body, labeldem, labelgop, labeldjt, vectorize_udf(ngrams) AS vectorized_ngrams FROM data")

    """Task6A"""
    vectorized_data = data.withColumn(
        "selected_ngrams",
        split(col("selected_ngrams"), " ").cast(ArrayType(StringType())))
    cv = CountVectorizer(inputCol="selected_ngrams", outputCol="vector", minDF=5.0)
    cv_model = cv.fit(vectorized_data)
    vectorized = cv_model.transform(vectorized_data)
    # vectorized.show(1, truncate=False)

    """Task6B"""
    vectorized.createOrReplaceTempView("Vectorized")
    sqlContext.udf.register("check_pos", check_pos)
    sqlContext.udf.register("check_neg", check_neg)
    new_vectorized = sqlContext.sql(
        "SELECT id, body, labeldem, labelgop, labeldjt, selected_ngrams, vector, check_pos(labeldjt) AS pos_label, check_neg(labeldjt) AS neg_label FROM Vectorized"
    )
    new_vectorized.show(3, False)

def main(*args):
    if len(args) != 2:
        print("Please provide one input and one output directory!")
        sys.exit(1)
    input_fn, output_fn = args[0], args[1]
    conf = SparkConf()
    conf.setAppName("grant")
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)
    # Load the abstract content in the test folder into Spark,
    # clean text, tokenize the corpus, and stem the words
    abstract = sc.textFile(input_fn)
    df_abs = (abstract.map(lambda doc: text_cleaning(doc))
              .filter(lambda doc: len(doc) > 0)
              .filter(lambda line: not line.startswith('app'))
              .map(lambda doc: doc.split(' '))
              .map(lambda word: [x for x in word if len(x) > 0])
              .map(lambda word: stem(word))
              .map(lambda doc: (int(doc[0]), doc[1:]))
              .filter(lambda doc: len(doc[1]) > 0)
              .toDF(['Id', 'words']))
    # build the pipeline and LDA model with the online optimizer
    stop_words = StopWordsRemover(inputCol='words', outputCol='clean')
    stop_words.setStopWords(stop_words.loadDefaultStopWords('english'))
    countv = CountVectorizer(inputCol=stop_words.getOutputCol(), outputCol="tokens")
    idf = IDF(inputCol=countv.getOutputCol(), outputCol="features")
    lda = LDA(maxIter=10, k=10, optimizer='online')
    pipeline = Pipeline(stages=[stop_words, countv, idf, lda])
    lda_model = pipeline.fit(df_abs)
    labels = lda_model.transform(df_abs)
    # identify the label as the topic with the max probability
    # and save the label to file
    topic_labels = (labels.select('Id', 'topicDistribution')
                    .rdd
                    .map(lambda x: (x[0], np.argmax(x[1])))
                    .saveAsTextFile(os.path.join(output_fn, 'labels')))
    # Get the topics
    wordnum = 5  # choose the number of topic words
    vocabulary = lda_model.stages[1].vocabulary
    voc_bv = sc.broadcast(vocabulary)
    topic_df = (lda_model.stages[3].describeTopics(wordnum)
                .rdd
                .map(lambda x: (x[0], [voc_bv.value[Id] for Id in x[1]], x[2]))
                .saveAsTextFile(os.path.join(output_fn, 'words')))

def LDAThis(sc, RDD, minFreq, numTopics, maxIter, wordsPerTopic):
    '''
    Arguments:
        sc: A SparkContext object
        RDD: An RDD with rows as tokenized sentences
        minFreq: Minimum document frequency for CountVectorizer
        numTopics: Number of topics
        maxIter: Max number of iterations for LDA training
        wordsPerTopic: Number of words to show per topic
    Requirements:
        sqlContext = SQLContext(sc) <- must be defined outside this function
    '''
    StopWords = stopwords.words("english")
    sqlContext = SQLContext(sc)
    # Structure data
    idRDD = RDD.map(
        lambda words: [x for x in words if x.isalpha() and x not in StopWords]) \
        .filter(lambda x: len(x) > 2).zipWithIndex()
    idDF = sqlContext.createDataFrame(idRDD, ["tokens", 'index'])
    # Term frequency
    CVecModel = CountVectorizer(inputCol="tokens", outputCol="rawFeatures",
                                vocabSize=5000, minDF=minFreq).fit(idDF)
    resultCVec = CVecModel.transform(idDF)
    vocabArray = CVecModel.vocabulary
    # IDF
    idf = IDF(inputCol="rawFeatures", outputCol="features")
    idfModel = idf.fit(resultCVec)
    resultTFIDF = idfModel.transform(resultCVec)
    # LDA
    resultLDA = LDA.train(
        resultTFIDF.select('index', 'features').rdd.mapValues(Vectors.fromML).map(list),
        k=numTopics, maxIterations=maxIter)
    topicIndices = sc.parallelize(
        resultLDA.describeTopics(maxTermsPerTopic=wordsPerTopic))
    topicsFinal = topicIndices.map(
        lambda topic: render_topics(topic, wordsPerTopic, vocabArray)).collect()
    # Show topics
    for topic in range(len(topicsFinal)):
        print("Topic" + str(topic) + ":")
        for term in topicsFinal[topic]:
            print(term)
        print('\n')
    return resultLDA

def preprocess(inputCol=["text", "label"], n=4):
    tokenizer = [Tokenizer(inputCol="text", outputCol="words")]
    remover = [StopWordsRemover(inputCol="words", outputCol="filtered")]
    ngrams = [
        NGram(n=i, inputCol="filtered", outputCol="{0}_grams".format(i))
        for i in range(1, n + 1)
    ]
    cv = [
        CountVectorizer(vocabSize=2**14, inputCol="{0}_grams".format(i),
                        outputCol="{0}_tf".format(i))
        for i in range(1, n + 1)
    ]
    idf = [
        IDF(inputCol="{0}_tf".format(i), outputCol="{0}_tfidf".format(i), minDocFreq=2)
        for i in range(1, n + 1)
    ]
    assembler = [
        VectorAssembler(
            inputCols=["{0}_tfidf".format(i) for i in range(1, n + 1)],
            outputCol="rawFeatures")
    ]
    label_stringIdx = [StringIndexer(inputCol="label", outputCol="labels")]
    selector = [
        ChiSqSelector(numTopFeatures=2**14, featuresCol='rawFeatures',
                      outputCol="features")
    ]
    lr = [LogisticRegression(maxIter=1000)]
    return Pipeline(stages=tokenizer + remover + ngrams + cv + idf +
                    assembler + label_stringIdx + selector + lr)

def base_features_gen_pipeline(input_descript_col="descript", input_category_col="category", output_feature_col="features", output_label_col="label"): ''' Token -> Vectors -> Label -> selector/transformer -> pipeline ''' #tokenizing the reviews in the input word_tokenizer = Tokenizer(inputCol="descript", outputCol="words") #Count Vectorizing using Bag of Words model count_vectors = CountVectorizer(inputCol="words", outputCol="features") #Labelling data for supervised learning label_maker = StringIndexer(inputCol="category", outputCol="label") #Transformer selector = Selector(outputCols=['id', 'features', 'label']) #constructing the data pipeline = Pipeline( stages=[word_tokenizer, count_vectors, label_maker, selector]) return pipeline
def run_ml_pipeline(nlpPipelineDF, num_topics, max_iterations, vocabSize, minDF, maxDF):
    """Define a Spark LDA topic modelling pipeline"""
    cv = CountVectorizer(
        inputCol="allTokens",
        outputCol="features",
        vocabSize=vocabSize,
        minDF=minDF,
        maxDF=maxDF,
        minTF=1.0,
    )
    idf = IDF(inputCol="features", outputCol="idf")
    lda = LDA(
        k=num_topics,
        maxIter=max_iterations,
        optimizer="online",
        seed=1,
        learningOffset=100.0,  # If high, early iterations are downweighted during training
        learningDecay=0.51,    # Set between [0.5, 1) to guarantee asymptotic convergence
    )
    mlPipeline = Pipeline(stages=[cv, idf, lda])
    mlModel = mlPipeline.fit(nlpPipelineDF)
    ldaModel = mlModel.stages[2]
    return mlModel, ldaModel

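# A minimal sketch (assuming an nlpPipelineDF with an "allTokens" column exists;
# the hyperparameter values here are illustrative) of reading the learned topics
# back out of the pipeline returned above: describeTopics() yields term indices,
# which map into the CountVectorizerModel vocabulary at stage 0.
mlModel, ldaModel = run_ml_pipeline(nlpPipelineDF, num_topics=10, max_iterations=50,
                                    vocabSize=5000, minDF=2.0, maxDF=0.8)
vocab = mlModel.stages[0].vocabulary
for row in ldaModel.describeTopics(5).collect():
    print(row.topic, [vocab[i] for i in row.termIndices])
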
def fit_tfidf_pipeline(content_df):
    tokenizer = RegexTokenizer(). \
        setGaps(False). \
        setPattern('\\p{L}+'). \
        setInputCol('content'). \
        setOutputCol('words')
    sw = StopWordsRemover() \
        .setStopWords(stop_words) \
        .setCaseSensitive(False) \
        .setInputCol("words") \
        .setOutputCol("filtered")
    cv = CountVectorizer(). \
        setInputCol('filtered'). \
        setOutputCol('tf'). \
        setMinTF(1). \
        setMinDF(10). \
        setVocabSize(2 ** 17)
    # fit dataframe_df
    cv_transformer = Pipeline(stages=[tokenizer, sw, cv]).fit(content_df)
    idf = IDF(minDocFreq=10). \
        setInputCol('tf'). \
        setOutputCol('tfidf')
    tfidf_transformer = Pipeline(stages=[cv_transformer, idf]).fit(content_df)
    return tfidf_transformer

def UsefulnessPredictionLDA(trainingdata, model):
    # Data preprocessing
    tokenizer = Tokenizer(inputCol="review_text", outputCol="tokens_word")
    remover = StopWordsRemover(inputCol="tokens_word", outputCol="filtered_tokens_word")
    cv = CountVectorizer(inputCol="filtered_tokens_word", outputCol="raw_features", minDF=2.0)
    idf = IDF(inputCol="raw_features", outputCol="features")
    # Extract LDA topic feature
    lda = LDA(k=30, maxIter=10)
    if model == 'RandomForest':
        model = RandomForestRegressor(featuresCol="topicDistribution")
    pipeline = Pipeline(stages=[tokenizer, remover, cv, idf, lda, model])
    evaluator_rmse = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="rmse")
    paramGrid = ParamGridBuilder() \
        .addGrid(cv.vocabSize, [150, 200, 250]) \
        .build()
    crossval = CrossValidator(estimator=pipeline,
                              estimatorParamMaps=paramGrid,
                              evaluator=evaluator_rmse,
                              numFolds=4)  # use 3+ folds in practice
    cvModel = crossval.fit(trainingdata)
    # Explain params for the selected model
    print(cvModel.explainParams())
    return cvModel

def main():
    subreddit_group = spark.read.parquet(input_file).repartition(2000)
    # subreddit_group.show()
    # hashing = HashingTF(inputCol="comments", outputCol="features")
    count_vectorizer = CountVectorizer(inputCol="comments", outputCol="features")
    lda = LDA(k=10, maxIter=10, optimizer='online')
    pipeline = Pipeline(stages=[count_vectorizer, lda])
    model = pipeline.fit(subreddit_group)
    predictions = model.transform(subreddit_group).selectExpr('id', 'topicDistribution')
    change_to_str = F.udf(to_text)
    topics_df = predictions.select(
        predictions['id'],
        change_to_str(predictions['topicDistribution']).alias('topicDistribution'))
    # topics_df.show(20, False)
    topics_df.write.option('sep', ',').save(output, format='csv', mode='overwrite')

def main():
    spark = SparkSession.builder.appName('nlp').getOrCreate()
    data = spark.read.csv("./data/smsspamcollection/SMSSpamCollection", inferSchema=True, sep='\t')
    data = data.withColumnRenamed('_c0', 'class').withColumnRenamed('_c1', 'text')
    data.show()
    data = data.withColumn('length', length(data['text']))
    data.show()
    data.groupby('class').mean().show()
    tokenizer = Tokenizer(inputCol="text", outputCol="token_text")
    stopremove = StopWordsRemover(inputCol='token_text', outputCol='stop_tokens')
    count_vec = CountVectorizer(inputCol='stop_tokens', outputCol='c_vec')
    idf = IDF(inputCol="c_vec", outputCol="tf_idf")
    ham_spam_to_num = StringIndexer(inputCol='class', outputCol='label')
    clean_up = VectorAssembler(inputCols=['tf_idf', 'length'], outputCol='features')
    nb = NaiveBayes()
    data_prep_pipe = Pipeline(stages=[ham_spam_to_num, tokenizer, stopremove, count_vec, idf, clean_up])
    cleaner = data_prep_pipe.fit(data)
    clean_data = cleaner.transform(data)
    clean_data = clean_data.select(['label', 'features'])
    clean_data.show()
    (training, testing) = clean_data.randomSplit([0.7, 0.3])
    spam_predictor = nb.fit(training)
    data.printSchema()
    test_results = spam_predictor.transform(testing)
    test_results.show()
    acc_eval = MulticlassClassificationEvaluator()
    acc = acc_eval.evaluate(test_results)
    print("Accuracy of model at predicting spam was: {}".format(acc))

def build_pipeline():
    tokenizer = [Tokenizer(inputCol='tweet', outputCol='words')]
    ngrams = [
        NGram(n=i, inputCol='words', outputCol='{0}_grams'.format(i))
        for i in range(1, 4)
    ]
    cv = [
        CountVectorizer(vocabSize=5460, inputCol='{0}_grams'.format(i),
                        outputCol='{0}_tf'.format(i))
        for i in range(1, 4)
    ]
    idf = [
        IDF(inputCol='{0}_tf'.format(i), outputCol='{0}_tfidf'.format(i), minDocFreq=5)
        for i in range(1, 4)
    ]
    assembler = [
        VectorAssembler(inputCols=['{0}_tfidf'.format(i) for i in range(1, 4)],
                        outputCol='features')
    ]
    label_stringIdx = [StringIndexer(inputCol='sentiment', outputCol='label')]
    lr = [LogisticRegression(maxIter=100)]
    pipeline = Pipeline(stages=tokenizer + ngrams + cv + idf +
                        assembler + label_stringIdx + lr)
    return pipeline

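# A minimal usage sketch for build_pipeline() above, on hypothetical toy data.
# With only two rows, every IDF weight is zero (minDocFreq=5), so the output is
# purely illustrative; real data is needed for meaningful features.
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
toy = spark.createDataFrame(
    [("spark is great", "pos"), ("this is bad", "neg")],
    ["tweet", "sentiment"])
fitted = build_pipeline().fit(toy)
fitted.transform(toy).select("label", "prediction").show()
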
def base_features_gen_pipeline(input_descript_col="descript", input_category_col="category",
                               output_feature_col="features", output_label_col="label"):
    # Build the pipeline
    # whitespace tokenizer
    word_tokenizer = Tokenizer(inputCol="descript", outputCol="words")
    # bag-of-words counts
    count_vectors = CountVectorizer(inputCol="words", outputCol="features")
    # label indexer
    label_maker = StringIndexer(inputCol="category", outputCol="label")

    class Selector(Transformer):
        def __init__(self, outputCols=['id', 'features', 'label']):
            self.outputCols = outputCols

        def _transform(self, df: DataFrame) -> DataFrame:
            return df.select(*self.outputCols)

    selector = Selector(outputCols=['id', 'features', 'label'])
    # build the pipeline
    pipeline = Pipeline(
        stages=[word_tokenizer, count_vectors, label_maker, selector])
    return pipeline

def build_ngrams(n=3):
    tokenizer = [Tokenizer(inputCol="text", outputCol="tokens")]
    stopwordsRemover = [
        StopWordsRemover(inputCol='tokens', outputCol='tokens_filtered')
    ]
    ngrams = [
        NGram(n=i, inputCol="tokens", outputCol="{0}_grams".format(i))
        for i in range(1, n + 1)
    ]
    cv = [
        CountVectorizer(vocabSize=5460, inputCol="{0}_grams".format(i),
                        outputCol="{0}_cv".format(i))
        for i in range(1, n + 1)
    ]
    idf = [
        IDF(inputCol="{0}_cv".format(i), outputCol="{0}_idf".format(i), minDocFreq=5)
        for i in range(1, n + 1)
    ]
    assembler = [
        VectorAssembler(
            inputCols=["{0}_idf".format(i) for i in range(1, n + 1)],
            outputCol="features")
    ]
    stringIndexer = [StringIndexer(inputCol="class", outputCol="label")]
    lr = [LogisticRegression(maxIter=100)]
    return Pipeline(stages=tokenizer + ngrams + cv + idf +
                    assembler + stringIndexer + lr)

def train_idf_model():
    rests = biz.filter(if_rest_udf(biz.categories))
    rest_rev = rev.join(
        rests.select('business_id', 'stars').withColumnRenamed('stars', 'rating'),
        'business_id')
    bad_reviews = rest_rev.filter('stars < 3')
    # sample for training
    bad_sample = bad_reviews.sample(False, 0.127, seed=91)
    sample_token = data_tokenizer(bad_sample)
    splits = sample_token.randomSplit([0.8, 0.1, 0.1], seed=91)
    train = splits[0]
    add_cl = splits[1]
    test = splits[2]
    cv = CountVectorizer(minDF=5, vocabSize=5000, inputCol='token', outputCol='vectors')
    idf = IDF(minDocFreq=7, inputCol="vectors", outputCol="features")
    km2 = KMeans(k=18, featuresCol='features', maxIter=30)
    pipe_idf = Pipeline(stages=[cv, idf, km2])
    pipe_idf_model = pipe_idf.fit(train)
    return pipe_idf_model

def ngramFeatureExtractors(n, inputCol=["text", "target"]):
    tokenizer = [Tokenizer(inputCol="text", outputCol="words")]
    ngrams = [
        NGram(n=i, inputCol="words", outputCol="{0}_grams".format(i))
        for i in range(1, n + 1)
    ]
    count_vectorizer = [
        CountVectorizer(vocabSize=5460, inputCol="{0}_grams".format(i),
                        outputCol="{0}_tf".format(i))
        for i in range(1, n + 1)
    ]
    idf = [
        IDF(inputCol="{0}_tf".format(i), outputCol="{0}_tfidf".format(i), minDocFreq=5)
        for i in range(1, n + 1)
    ]
    assembler = [
        VectorAssembler(
            inputCols=["{0}_tfidf".format(i) for i in range(1, n + 1)],
            outputCol="features")
    ]
    label_stringIdx = [StringIndexer(inputCol="target", outputCol="label")]
    lr = [LogisticRegression(maxIter=100)]
    return Pipeline(stages=tokenizer + ngrams + count_vectorizer + idf +
                    assembler + label_stringIdx + lr)

def count(df, column):
    """
    Count the number of occurrences of terms in documents.
    """
    # fit a CountVectorizerModel from the corpus.
    # vocabSize: top N words ordered by term frequency across the corpus
    # minDF: minimum number of documents a term must appear in to be
    #        included in the vocabulary
    # e.g. vocabSize=10, minDF=2.0
    cv = CountVectorizer(inputCol=column, outputCol='_' + column)
    model = cv.fit(df)
    voc = model.vocabulary
    df = model.transform(df)
    df = replace(df, column, '_' + column)
    return (df, voc)

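# A small demonstration (toy data, not from the original code) of the vocabSize
# and minDF parameters described in the comments above: minDF=2.0 drops terms
# that appear in fewer than two documents, and vocabSize caps the vocabulary at
# the most frequent terms.
from pyspark.sql import SparkSession
from pyspark.ml.feature import CountVectorizer

spark = SparkSession.builder.getOrCreate()
toy = spark.createDataFrame([(0, ["a", "b", "c"]), (1, ["a", "b", "b"])], ["id", "tokens"])
cv = CountVectorizer(inputCol="tokens", outputCol="counts", vocabSize=10, minDF=2.0)
print(cv.fit(toy).vocabulary)  # 'c' appears in only one document, so it is dropped
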
def featurizeData(raw, gap, vocabFile, featFile):
    feats = raw.dropDuplicates(['cluster', 'series', 'date']) \
        .withColumn('day', datediff(col('date'), lit('1970-01-01'))) \
        .na.drop(subset=['day']) \
        .rdd.groupBy(lambda r: r.cluster) \
        .flatMap(lambda c: clusterFeatures(c, gap)) \
        .toDF()
    feats.cache()
    cv = CountVectorizer(inputCol='raw', outputCol='features', minDF=4.0)
    interner = cv.fit(feats)  # alternate possibility: grab features only from label==1 edges
    full = interner.transform(feats)
    # combiner = VectorAssembler(inputCols=realCols + ['categorial'], outputCol='features')
    # # I don't think a Pipeline will work here since we need to get the interner.vocabulary
    # full = combiner.transform(interner.transform(feats)).drop('categorial')
    full.write.parquet(featFile)
    np.savetxt(vocabFile, np.array(interner.vocabulary), fmt='%s')
    feats.unpersist()

def get_top_words(dataset, signatures):
    # TODO: Use stemmers for the languages supported by
    # http://www.nltk.org/api/nltk.stem.html#nltk.stem.snowball.SnowballStemmer
    # Or translate comments in other languages using the free Microsoft Translate API.
    sentenceData = dataset.filter(
        dataset['user_comments'].isNotNull() &
        (dataset['useragent_locale'].isNull() |
         (functions.instr(dataset['useragent_locale'], 'en') == 1)))
    if sentenceData.rdd.isEmpty():
        return dict()
    # Tokenize comments.
    tokenizer = Tokenizer(inputCol='user_comments', outputCol='words')
    wordsData = tokenizer.transform(sentenceData)
    # Remove duplicate words from comments.
    wordsData = wordsData.rdd.map(lambda p: (p['signature'], list(set(p['words'])))) \
        .reduceByKey(lambda x, y: x + y).toDF(['signature', 'words'])
    if wordsData.rdd.isEmpty():
        print("[WARNING]: wordsData is empty, sentenceData wasn't.")
        return dict()

    # Clean comment words by removing punctuation and stemming.
    def clean_word(w):
        return re.sub('\,|\.|\;|\:|\;|\?|\!|\[|\]|\}|\{|\/|\\\\', '', stem(w.lower()))

    wordsData = wordsData.rdd.map(
        lambda p: (p['signature'], [clean_word(w) for w in p['words']])).toDF(['signature', 'words'])
    # XXX: Useless with TF-IDF?
    remover = StopWordsRemover(inputCol='words', outputCol='filtered')
    cleanWordsData = remover.transform(wordsData)
    cv = CountVectorizer(inputCol='filtered', outputCol='features')
    model = cv.fit(cleanWordsData)
    featurizedData = model.transform(cleanWordsData)
    idf = IDF(inputCol='features', outputCol='tfidf_features')
    idfModel = idf.fit(featurizedData)
    rescaledData = idfModel.transform(featurizedData)
    bests_per_doc = rescaledData.filter(rescaledData.signature.isin(signatures)).rdd \
        .map(lambda p: (p['signature'],
                        sorted(zip(p['tfidf_features'].indices, p['tfidf_features'].values),
                               key=lambda i: i[1], reverse=True)[:10])) \
        .collect()
    return dict([(signature, [model.vocabulary[best] for best, val in bests])
                 for signature, bests in bests_per_doc])

def main():
    p = sys.argv[1]
    logFile = "data/" + p + "_cleaned.txt"
    sc = SparkContext("local", "simpleApp")
    sqlContext = SQLContext(sc)
    data = sc.textFile(logFile).zipWithIndex() \
        .map(lambda (words, idd): Row(idd=idd, words=words.split(" "))).cache()
    docDF = sqlContext.createDataFrame(data)
    Vector = CountVectorizer(inputCol="words", outputCol="vectors")
    model = Vector.fit(docDF)
    result = model.transform(docDF)
    corpus_size = result.count()
    corpus = result.select("idd", "vectors").map(lambda (x, y): [x, y]).cache()
    # Cluster the documents into three topics using LDA
    ldaModel = LDA.train(corpus, k=3, maxIterations=100, optimizer='online')
    topics = ldaModel.topicsMatrix()
    wordNumbers = 10
    topicIndices = sc.parallelize(ldaModel.describeTopics(maxTermsPerTopic=wordNumbers))
    vocabArray = model.vocabulary
    topics_final = topicIndices.map(
        lambda topic: topic_render(topic, wordNumbers, vocabArray)).collect()
    path = "data/" + p + "_results.txt"
    json = open(path, 'wb')
    json.close()
    for topic in topics_final:
        for term in topic:
            line = term[0] + " "
            try:
                string_for_output = line.encode('utf8', 'replace')
                if string_for_output != " ":
                    os.system("python3 basic/codes/p3p.py " + string_for_output + " >> " + path)
            except:
                pass
    os.system("python3 basic/codes/p3p.py " + "delmch" + " >> " + path)

def test_count_vectorizer_with_maxDF(self):
    dataset = self.spark.createDataFrame([
        (0, "a b c d".split(' '), SparseVector(3, {0: 1.0, 1: 1.0, 2: 1.0}),),
        (1, "a b c".split(' '), SparseVector(3, {0: 1.0, 1: 1.0}),),
        (2, "a b".split(' '), SparseVector(3, {0: 1.0}),),
        (3, "a".split(' '), SparseVector(3, {}),)], ["id", "words", "expected"])
    cv = CountVectorizer(inputCol="words", outputCol="features")

    model1 = cv.setMaxDF(3).fit(dataset)
    self.assertEqual(model1.vocabulary, ['b', 'c', 'd'])

    transformedList1 = model1.transform(dataset).select("features", "expected").collect()
    for r in transformedList1:
        feature, expected = r
        self.assertEqual(feature, expected)

    model2 = cv.setMaxDF(0.75).fit(dataset)
    self.assertEqual(model2.vocabulary, ['b', 'c', 'd'])

    transformedList2 = model2.transform(dataset).select("features", "expected").collect()
    for r in transformedList2:
        feature, expected = r
        self.assertEqual(feature, expected)

from pyspark.sql import SQLContext, Row
from pyspark.ml.feature import CountVectorizer
from pyspark.mllib.clustering import LDA, LDAModel

sqlContext = SQLContext(sc)
path = ...  # path of the txt file
data = sc.textFile(path).zipWithIndex() \
    .map(lambda (words, idd): Row(idd=idd, words=words.split(" ")))
docDF = sqlContext.createDataFrame(data)
Vector = CountVectorizer(inputCol="words", outputCol="vectors")
model = Vector.fit(docDF)
result = model.transform(docDF)

corpus_size = result.count()  # total number of documents
corpus = result.select("idd", "vectors").map(lambda (x, y): [x, y]).cache()

# Cluster the documents into three topics using LDA
ldaModel = LDA.train(corpus, k=3, maxIterations=100, optimizer='online')
topics = ldaModel.topicsMatrix()
vocabArray = model.vocabulary

wordNumbers = 10  # number of words per topic
topicIndices = sc.parallelize(ldaModel.describeTopics(maxTermsPerTopic=wordNumbers))

def topic_render(topic):  # map vector ids of words back to the actual words
    terms = topic[0]
    result = []
    for i in range(wordNumbers):
        term = vocabArray[terms[i]]
        result.append(term)
    return result

def train_cv_model(modelDataframe):
    cv = CountVectorizer(inputCol="udf_results", outputCol="features", binary=True, minDF=5.0)
    model = cv.fit(modelDataframe)
    model.write().overwrite().save("models/cvModel")

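# A minimal sketch (assuming the save above succeeded) of loading the persisted
# CountVectorizerModel back in a later job and reusing it:
from pyspark.ml.feature import CountVectorizerModel

cvModel = CountVectorizerModel.load("models/cvModel")
# scored = cvModel.transform(newDataframe)  # newDataframe is a hypothetical input
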
argparser.add_argument('-s', '--clusterSize', type=int, default=1)
argparser.add_argument('indir', help='Input directory')
argparser.add_argument('outdir', help='Output directory')
args = argparser.parse_args()

spark = SparkSession.builder.appName('Cluster Features').getOrCreate()

df = spark.read.load(args.indir)
raw = df.filter(col('size') >= args.clusterSize) \
    .select('cluster', 'size', regexp_replace('text', u'\xad\s*', '').alias('text'))
raw.cache()

tok = RegexTokenizer(inputCol='text', outputCol='terms', gaps=False, pattern='\w+') \
    .transform(raw)

counts = CountVectorizer(inputCol='terms', outputCol='counts', minDF=2.0) \
    .fit(tok).transform(tok)

mergeCounts = udf(
    lambda va, size: threshold_sparse(scale_sparse(reduce(add_sparse, va), 1.0 / size), args.minCount),
    VectorUDT())

res = counts.groupBy('cluster', 'size') \
    .agg(mergeCounts(collect_list('counts'), 'size').alias('counts'))

# lda = LDA(k=2, featuresCol='counts', seed=1, optimizer='em')
# model = lda.fit(res)
# model.describeTopics().write.json(args.outdir)

res.write.json(args.outdir)
spark.stop()

def trainModel(self):
    logger.info("Training the model...")
    query = '''select page_id, max(page_title) as page_title from cooladata where date_range(all) and page_id is not null group by page_id;'''

    def SQLtoURL(query):
        data = query.replace('\n', ' ').replace('\t', ' ').replace('  ', ' ').replace('  ', ' ')
        return data

    def QueryXXXXX(query, file=None):
        session = Session()
        response = session.post(
            data={'tq': query, },
            url='https://app.XXXXXX.com/api/v2/projects/115659/cql/',
            headers={'Authorization': 'Token dtQvPVejNcSebX1EkU0AqB2TJRXznIgZiDvDu3HR'},)
        return response.content

    table = json.loads(codecs.decode(QueryXXXXX(SQLtoURL(query)), 'utf-8'))['table']
    title_list = [x['c'] for x in table['rows']]
    table_cols = [d['label'] for d in table['cols']]

    def convert_row(row):
        rowlist = [d['v'] for d in row]
        return rowlist

    rd = self.sc.parallelize(title_list).map(convert_row)
    titleData = self.spark.createDataFrame(rd, table_cols)
    titleData = titleData.dropna()

    hebrew_stopwords = stop_words()

    def rmv(words):
        for punc in punctuation:
            words = words.replace(punc, "")
        for hword in hebrew_stopwords:
            words = words.replace(hword, " ")
        return words

    self.spark.udf.register("rmv", rmv, StringType())
    titleData.registerTempTable("wordstable")
    cleanedSentenceData = self.spark.sql(
        "select page_id, page_title, rmv(page_title) as cleanedSentence from wordstable")
    tokenizer = Tokenizer(inputCol="cleanedSentence", outputCol="words")
    wordsData = tokenizer.transform(cleanedSentenceData)

    cv = CountVectorizer(inputCol="words", outputCol="rawFeatures", minDF=2.0)
    cvModel = cv.fit(wordsData)
    featurizedData = cvModel.transform(wordsData)

    idf = IDF(inputCol="rawFeatures", outputCol="features")
    idfModel = idf.fit(featurizedData)
    rescaledData = idfModel.transform(featurizedData)

    lda = LDA(k=100)
    ldaModel = lda.fit(rescaledData)
    postFactorizedData = ldaModel.transform(rescaledData)

    norm = Normalizer(inputCol="topicDistribution", outputCol="normTopicDist")
    scaledFactorizedNormalizedData = norm.transform(postFactorizedData)

    self.model = scaledFactorizedNormalizedData
    logger.info("model is built!")

# alltags = tags_users.map(lambda x: Counter(x.tags)).reduce(lambda a, b: a + b)
# print(alltags.most_common(10))
# .filter(lambda x: len(x.tags) > 100)  # filtering to get smaller dataset
# print(tags_users.count())
# print(tags_users.first())

## Filtered for testing
tags_users_df = sqlContext.createDataFrame(tags_users)
print(tags_users_df.take(2))

# print('Indexing strings')
cVec = CountVectorizer(inputCol='tags', outputCol="tag_features", minDF=10.)
model = cVec.fit(tags_users_df)
td = model.transform(tags_users_df)

with open('/home/erlenda/data/konsum/countvec_vocabulary.pkl', mode='wb') as ff:
    pkl.dump(model.vocabulary, ff)

normalizer = Normalizer(p=1., inputCol='tag_features', outputCol='tags_normalized')
tdNorm = normalizer.transform(td)
print(tdNorm.take(5))

tdNorm.write.save('/home/erlenda/data/konsum/tag_profiler_parquet')

samples = tdNorm.filter(tdNorm.posts_with_tags > 10).take(10)

print(model.weights)
model.gaussiansDF.show()
summary.cluster.show()
summary.clusterSizes
summary.probability.show()

# COMMAND ----------

from pyspark.ml.feature import Tokenizer, CountVectorizer
tkn = Tokenizer().setInputCol("Description").setOutputCol("DescOut")
tokenized = tkn.transform(sales.drop("features"))
cv = CountVectorizer()\
  .setInputCol("DescOut")\
  .setOutputCol("features")\
  .setVocabSize(500)\
  .setMinTF(0)\
  .setMinDF(0)\
  .setBinary(True)
cvFitted = cv.fit(tokenized)
prepped = cvFitted.transform(tokenized)

# COMMAND ----------

from pyspark.ml.clustering import LDA
lda = LDA().setK(10).setMaxIter(5)
print(lda.explainParams())
model = lda.fit(prepped)

# COMMAND ----------

from pyspark.ml.feature import NGram
unigram = NGram().setInputCol("DescOut").setN(1)
bigram = NGram().setInputCol("DescOut").setN(2)
unigram.transform(tokenized.select("DescOut")).show(False)
bigram.transform(tokenized.select("DescOut")).show(False)

# COMMAND ----------

from pyspark.ml.feature import CountVectorizer
cv = CountVectorizer()\
  .setInputCol("DescOut")\
  .setOutputCol("countVec")\
  .setVocabSize(500)\
  .setMinTF(1)\
  .setMinDF(2)
fittedCV = cv.fit(tokenized)
fittedCV.transform(tokenized).show(False)

# COMMAND ----------

tfIdfIn = tokenized\
  .where("array_contains(DescOut, 'red')")\
  .select("DescOut")\
  .limit(10)
tfIdfIn.show(10, False)

from __future__ import print_function

from pyspark.sql import SparkSession
# $example on$
from pyspark.ml.feature import CountVectorizer
# $example off$

if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("CountVectorizerExample")\
        .getOrCreate()

    # $example on$
    # Input data: Each row is a bag of words with an ID.
    df = spark.createDataFrame([
        (0, "a b c".split(" ")),
        (1, "a b b c a".split(" "))
    ], ["id", "words"])

    # fit a CountVectorizerModel from the corpus.
    cv = CountVectorizer(inputCol="words", outputCol="features", vocabSize=3, minDF=2.0)
    model = cv.fit(df)

    result = model.transform(df)
    result.show(truncate=False)
    # $example off$

    spark.stop()

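# For reference, the example above prints roughly the following (the relative
# order of the equally frequent terms 'a' and 'b' in the vocabulary may vary):
#
# +---+---------------+-------------------------+
# |id |words          |features                 |
# +---+---------------+-------------------------+
# |0  |[a, b, c]      |(3,[0,1,2],[1.0,1.0,1.0])|
# |1  |[a, b, b, c, a]|(3,[0,1,2],[2.0,2.0,1.0])|
# +---+---------------+-------------------------+
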
#tokenizer = Tokenizer(inputCol="description", outputCol="words")
#wordsData = tokenizer.transform(text)

################################################################################################
#
#   Generate TFIDF
#
################################################################################################

# Term Frequency Vectorization - Option 1 (Using hashingTF):
#hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=20)
#featurizedData = hashingTF.transform(clean_text)

# Term Frequency Vectorization - Option 2 (CountVectorizer):
cv = CountVectorizer(inputCol="words", outputCol="rawFeatures", vocabSize=1000)
cvmodel = cv.fit(clean_text)
featurizedData = cvmodel.transform(clean_text)

vocab = cvmodel.vocabulary
vocab_broadcast = sc.broadcast(vocab)

idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)

################################################################################################
#
#   LDA Clustering - Find Data-driven Topics
#
################################################################################################