def main(sc):
    sql_context = SQLContext(sc)
    all_data = get_all_data()
    # Input data: each row is a bag of words from a sentence or document.
    training_data = [(id_gen.next(), text.split(" ")) for text in all_data]
    documentdf = sql_context.createDataFrame(training_data, ["id", "text"])
    remover = StopWordsRemover(inputCol="text", outputCol="text_filtered")
    cleaned_document = remover.transform(documentdf)
    # Learn a mapping from words to vectors.
    word2vec = Word2Vec(vectorSize=len(training_data),
                        inputCol="text_filtered", outputCol="result")
    model = word2vec.fit(cleaned_document)
    matrix = column_similarities(model.transform(cleaned_document))
    # We use the size of the target data to keep only products between the
    # target data and the rest, and avoid products of the target data with itself.
    values = matrix.entries.filter(
        lambda x: x.j >= TARGET_DATA_SIZE and x.i < TARGET_DATA_SIZE).sortBy(
        keyfunc=lambda x: x.value, ascending=False).map(
        lambda x: x.j).distinct().take(100)
    training_data_index = dict(training_data)
    for position, item in enumerate(values):
        line = " ".join(training_data_index[int(item)])
        print('%d -> %s' % (position, line.encode('utf-8')))
def fit_kmeans(spark, products_df):
    step = 0

    step += 1
    tokenizer = Tokenizer(inputCol="title", outputCol=str(step) + "_tokenizer")

    step += 1
    stopwords = StopWordsRemover(inputCol=tokenizer.getOutputCol(),
                                 outputCol=str(step) + "_stopwords")

    step += 1
    tf = HashingTF(inputCol=stopwords.getOutputCol(),
                   outputCol=str(step) + "_tf", numFeatures=16)

    step += 1
    idf = IDF(inputCol=tf.getOutputCol(), outputCol=str(step) + "_idf")

    step += 1
    normalizer = Normalizer(inputCol=idf.getOutputCol(),
                            outputCol=str(step) + "_normalizer")

    step += 1
    kmeans = KMeans(featuresCol=normalizer.getOutputCol(),
                    predictionCol=str(step) + "_kmeans", k=2, seed=20)

    kmeans_pipeline = Pipeline(stages=[tokenizer, stopwords, tf, idf,
                                       normalizer, kmeans])
    model = kmeans_pipeline.fit(products_df)
    words_prediction = model.transform(products_df)
    model.save("./kmeans")  # the whole machine learning instance is saved in a folder
    return model, words_prediction
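# The save() call above persists the entire fitted pipeline to a folder. A
# minimal reload sketch, assuming the same "./kmeans" path used above:
from pyspark.ml import PipelineModel

reloaded = PipelineModel.load("./kmeans")
# The reloaded model transforms new data exactly like the freshly fitted one.
predictions = reloaded.transform(products_df)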
def remove_stop_words(p_df, in_column, out_column):
    """
    Removes stop words from a column in a DataFrame.
    The column must be a list of words.
    :param p_df: A DataFrame.
    :param in_column: Name of the input column.
    :param out_column: Name of the output column.
    :return: A DataFrame.
    """
    remover = StopWordsRemover(inputCol=in_column, outputCol=out_column)
    return remover.transform(p_df)
def removeStopWords(df, column):
    """
    Remove stop words (like "the", "a", "I", etc.) from the given column.
    The column must contain an array of strings.

    Transformation: array<string> --> array<string>
    """
    # create a remover to filter out common stop words
    remover = StopWordsRemover(inputCol=column, outputCol='_' + column)
    # transform: array<string> --> array<string>
    df = remover.transform(df)
    df = replace(df, column, '_' + column)
    return df
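# For reference, a minimal sketch of the `replace` helper called above (an
# assumption -- the original project may define it differently): it overwrites
# the source column with the transformed one and restores the original name.
def replace(df, column, new_column):
    # drop the original column, then rename the transformed column back to it
    return df.drop(column).withColumnRenamed(new_column, column)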
def append_tokens(self, df):
    """
    Creates tokens from the pagename column in the dataframe, then removes
    stop words from the tokens. Adds the tokens under the columns rawTokens
    and tokens.

    Args:
        :param df: Dataframe to add token columns to.

    Returns:
        :return: Dataframe with new columns rawTokens and tokens.
    """
    # Tokenize pagename and convert tokens to their stem words.
    tokenize_udf = udf(tokenize_porter, returnType=ArrayType(StringType()))
    df = df.withColumn('rawTokens', tokenize_udf(df['pagename']))
    # Remove stop words.
    stop_words_remover = StopWordsRemover(inputCol="rawTokens",
                                          outputCol="tokens")
    df = stop_words_remover.transform(df)
    return df
def preprocessing_titles(path, name):
    query = preprocessData(path)
    tokenizer = Tokenizer(inputCol="title", outputCol="tokenized_title")
    wordsData = tokenizer.transform(query)
    # after stop-word removal
    remover = StopWordsRemover(inputCol="tokenized_title", outputCol="filtered")
    wordsData = remover.transform(wordsData)
    df = wordsData.map(lambda x: x['id']).zipWithUniqueId().toDF(["id", "index"])
    df.registerTempTable("indices")
    wordsData.registerTempTable("words")
    qr = sqlContext.sql("SELECT index, words.id, filtered FROM indices "
                        "JOIN words ON words.id = indices.id")
    if name != '':
        exportOnS3(qr, "s3a://redit-preprocessed/", name)
    qr = qr.map(lambda Row: (Row['index'], Row['id'], Row['filtered']))
class SentimentalPipelineEngine(PipelineEngine):
    def __init__(self, cv):
        super(SentimentalPipelineEngine, self).__init__(cv)
        self.tokenizer_map = [TweetTokenizer()]
        self.ngram_map = [1]
        self.hashing_tf_map = [pow(2, 20)]
        self.clf_map = [0.1]
        self.stages = self._build_stages()
        self.pipeline = Pipeline(stages=self.stages)
        self.param_grid = self._build_param_grid()

    def _build_stages(self):
        self.bs_parser = BeautifulSoupParser(inputCol="review", outputCol="parsed")
        self.tokenizer = Tokenizzzer(inputCol=self.bs_parser.getOutputCol(),
                                     outputCol="words")
        self.stopwords_remover = StopWordsRemover(inputCol="words",
                                                  outputCol="filtered")
        self.porter = PorterStemmerTransformer(
            inputCol=self.stopwords_remover.getOutputCol(), outputCol="stemmed")
        self.ngram = NGram(inputCol=self.porter.getOutputCol(), outputCol="ngrams")
        self.hashing_tf = HashingTF(inputCol=self.ngram.getOutputCol(),
                                    outputCol="features")
        self.idf = IDF(inputCol="features", outputCol="idf_features")
        self.normalizer = Normalizer(inputCol="idf_features",
                                     outputCol="norm_features", p=1.0)
        self.clf = LogisticRegression(featuresCol='norm_features', regParam=0.1)
        # self.clf = MultilayerPerceptronClassifier(
        #     featuresCol="norm_features", maxIter=1000,
        #     layers=[self.hashing_tf.getNumFeatures(), 200, 100, 2])
        return [self.bs_parser, self.tokenizer, self.stopwords_remover,
                self.porter, self.ngram, self.hashing_tf, self.idf,
                self.normalizer, self.clf]

    def _build_param_grid(self):
        param_grid_builder = ParamGridBuilder()
        param_grid_builder.addGrid(self.tokenizer.tokenizer, self.tokenizer_map)
        param_grid_builder.addGrid(self.ngram.n, self.ngram_map)
        param_grid_builder.addGrid(self.hashing_tf.numFeatures, self.hashing_tf_map)
        param_grid_builder.addGrid(self.clf.regParam, self.clf_map)
        return param_grid_builder.build()
def create_pipeline(model_type, num_features=10000):
    """ Defines a pipeline from bag-of-words to prediction. """
    remover = StopWordsRemover(inputCol="bow", outputCol="words")
    hashingTF = HashingTF(inputCol=remover.getOutputCol(),
                          outputCol="word_counts", numFeatures=num_features)
    tfidf = IDF(inputCol=hashingTF.getOutputCol(), outputCol="features")
    if model_type == 'log_reg':
        model = LogisticRegression()
    elif model_type == 'gbt':
        model = GBTClassifier()
    elif model_type == 'naive_bayes':
        model = NaiveBayes()
    elif model_type == 'rf':
        model = RandomForestClassifier()
    else:
        # fail fast instead of leaving `model` undefined
        raise ValueError("unknown model_type: %s" % model_type)
    return Pipeline(stages=[remover, hashingTF, tfidf, model])
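# Hypothetical usage of create_pipeline, assuming a DataFrame `train_df` with a
# "bow" column (array of tokens) and a "label" column:
pipeline = create_pipeline('log_reg', num_features=5000)
fitted = pipeline.fit(train_df)
predictions = fitted.transform(train_df)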
def get_top_words(dataset, signatures):
    # TODO: Use stemmers for the languages supported by
    # http://www.nltk.org/api/nltk.stem.html#nltk.stem.snowball.SnowballStemmer
    # Or translate comments in other languages using the free Microsoft
    # Translate API.
    sentenceData = dataset.filter(
        dataset['user_comments'].isNotNull() &
        (dataset['useragent_locale'].isNull() |
         (functions.instr(dataset['useragent_locale'], 'en') == 1)))

    if sentenceData.rdd.isEmpty():
        return dict()

    # Tokenize comments.
    tokenizer = Tokenizer(inputCol='user_comments', outputCol='words')
    wordsData = tokenizer.transform(sentenceData)

    # Remove duplicate words from comments.
    wordsData = wordsData.rdd.map(
        lambda p: (p['signature'], list(set(p['words'])))
    ).reduceByKey(lambda x, y: x + y).toDF(['signature', 'words'])

    if wordsData.rdd.isEmpty():
        print("[WARNING]: wordsData is empty, sentenceData wasn't.")
        return dict()

    # Clean comment words by removing punctuation and stemming.
    def clean_word(w):
        return re.sub(r'\,|\.|\;|\:|\;|\?|\!|\[|\]|\}|\{|\/|\\\\', '',
                      stem(w.lower()))

    wordsData = wordsData.rdd.map(
        lambda p: (p['signature'], [clean_word(w) for w in p['words']])
    ).toDF(['signature', 'words'])

    # XXX: Useless with TF-IDF?
    remover = StopWordsRemover(inputCol='words', outputCol='filtered')
    cleanWordsData = remover.transform(wordsData)

    cv = CountVectorizer(inputCol='filtered', outputCol='features')
    model = cv.fit(cleanWordsData)
    featurizedData = model.transform(cleanWordsData)

    idf = IDF(inputCol='features', outputCol='tfidf_features')
    idfModel = idf.fit(featurizedData)
    rescaledData = idfModel.transform(featurizedData)

    bests_per_doc = rescaledData.filter(
        rescaledData.signature.isin(signatures)).rdd.map(
        lambda p: (p['signature'],
                   sorted(zip(p['tfidf_features'].indices,
                              p['tfidf_features'].values),
                          key=lambda i: i[1], reverse=True)[:10])).collect()

    return dict([(signature, [model.vocabulary[best] for best, val in bests])
                 for signature, bests in bests_per_doc])
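# A sketch of the SnowballStemmer idea mentioned in the TODO above (assumes
# nltk is installed; the language would come from the useragent_locale):
from nltk.stem.snowball import SnowballStemmer

stemmer = SnowballStemmer("english")
print(stemmer.stem("running"))  # -> "run"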
sc = SparkContext(appName="Tweet")
spark = SparkSession(sc)
sc.setLogLevel("WARN")

# read the dataset
training_set = spark.read.csv(
    '../tap/spark/dataset/training_set_sentipolc16.csv',
    schema=schema, header=True, sep=',')
training_set

# define stage 1: tokenize the tweet text
stage_1 = RegexTokenizer(inputCol='tweet', outputCol='tokens', pattern='\\W')
# define stage 2: remove the stop words
stage_2 = StopWordsRemover(inputCol='tokens', outputCol='filtered_words')
# define stage 3: create word vectors of size 100
stage_3 = Word2Vec(inputCol='filtered_words', outputCol='vector', vectorSize=100)
# define stage 4: logistic regression model
model = LogisticRegression(featuresCol='vector', labelCol='positive')

# set up the pipeline
pipeline = Pipeline(stages=[stage_1, stage_2, stage_3, model])
# fit the pipeline model with the training data
pipelineFit = pipeline.fit(training_set)
modelSummary = pipelineFit.stages[-1].summary
modelSummary.accuracy
def test_stopwordsremover(self):
    dataset = self.spark.createDataFrame([Row(input=["a", "panda"])])
    stopWordRemover = StopWordsRemover(inputCol="input", outputCol="output")
    # Default
    self.assertEqual(stopWordRemover.getInputCol(), "input")
    transformedDF = stopWordRemover.transform(dataset)
    self.assertEqual(transformedDF.head().output, ["panda"])
    self.assertEqual(type(stopWordRemover.getStopWords()), list)
    self.assertTrue(isinstance(stopWordRemover.getStopWords()[0], basestring))
    # Custom
    stopwords = ["panda"]
    stopWordRemover.setStopWords(stopwords)
    self.assertEqual(stopWordRemover.getInputCol(), "input")
    self.assertEqual(stopWordRemover.getStopWords(), stopwords)
    transformedDF = stopWordRemover.transform(dataset)
    self.assertEqual(transformedDF.head().output, ["a"])
    # with language selection
    stopwords = StopWordsRemover.loadDefaultStopWords("turkish")
    dataset = self.spark.createDataFrame([Row(input=["acaba", "ama", "biri"])])
    stopWordRemover.setStopWords(stopwords)
    self.assertEqual(stopWordRemover.getStopWords(), stopwords)
    transformedDF = stopWordRemover.transform(dataset)
    self.assertEqual(transformedDF.head().output, [])
    # with locale
    stopwords = ["BELKİ"]
    dataset = self.spark.createDataFrame([Row(input=["belki"])])
    stopWordRemover.setStopWords(stopwords).setLocale("tr")
    self.assertEqual(stopWordRemover.getStopWords(), stopwords)
    transformedDF = stopWordRemover.transform(dataset)
    self.assertEqual(transformedDF.head().output, [])
# split the text into tokens
# removed stop words
# applied the hashing trick
# converted the data from counts to IDF and
# trained a logistic regression model.
# Each of these steps was done independently. This seems like a great
# application for a pipeline!

# Instructions
# 100 XP
# Create an object for splitting text into tokens.
# Create an object to remove stop words. Rather than explicitly giving the
# input column name, use the getOutputCol() method on the previous object.
# Create objects for applying the hashing trick and transforming the data into
# a TF-IDF. Use the getOutputCol() method again.
# Create a pipeline which wraps all of the above steps as well as an object to
# create a Logistic Regression model.

from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF

# Break text into tokens at non-word characters
tokenizer = Tokenizer(inputCol='text', outputCol='words')

# Remove stop words
remover = StopWordsRemover(inputCol=tokenizer.getOutputCol(), outputCol='terms')

# Apply the hashing trick and transform to TF-IDF
hasher = HashingTF(inputCol=remover.getOutputCol(), outputCol="hash")
idf = IDF(inputCol=hasher.getOutputCol(), outputCol="features")

# Create a logistic regression object and add everything to a pipeline
logistic = LogisticRegression()
pipeline = Pipeline(stages=[tokenizer, remover, hasher, idf, logistic])
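# A hypothetical follow-up, assuming DataFrames sms_train/sms_test with "text"
# and "label" columns: fit() runs every stage in order on the training data,
# and the fitted model replays the same transformations on unseen data.
pipeline_model = pipeline.fit(sms_train)
predictions = pipeline_model.transform(sms_test)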
appName = "News label prediction"
master = 'local'
spark = SparkConf().setAppName(appName).setMaster(master) \
    .set('spark.executor.memory', '4G') \
    .set('spark.driver.memory', '45G') \
    .set('spark.driver.maxResultSize', '10G')
sc = SparkContext(conf=spark)
sqlContext = SQLContext(sc)
df = sqlContext.createDataFrame(indexNewsList, ["label", "text"])

tokenizer = Tokenizer(inputCol="text", outputCol="words")
remover = StopWordsRemover(inputCol="words", outputCol="filtered")
hashingTF = HashingTF(inputCol="filtered", outputCol="rawFeatures",
                      numFeatures=10000)
idf = IDF(inputCol="rawFeatures", outputCol="features", minDocFreq=5)
nb = NaiveBayes(smoothing=1.0, modelType="multinomial")
rf = RandomForestClassifier(labelCol="label", featuresCol="features",
                            numTrees=100, maxDepth=15, maxBins=32)
    .orderBy(col("count").desc()) \
    .show()

# set seed for reproducibility
(training_data, test_data) = df.randomSplit([0.7, 0.3], seed=100)
print("Training data count: " + str(training_data.count()))
print("Test data count: " + str(test_data.count()))

# regular expression tokenizer
regexTokenizer = RegexTokenizer(inputCol="text", outputCol="words",
                                pattern="\\W")

# stop words
add_stopwords = ["http", "https", "amp", "rt", "t", "c", "the", "RT", "@"]
stopwordsRemover = StopWordsRemover(
    inputCol="words", outputCol="filtered").setStopWords(add_stopwords)

# bag of words count
countVectors = CountVectorizer(inputCol="filtered", outputCol="features",
                               vocabSize=10000, minDF=5)

# convert string labels to indexes
indexer = StringIndexer(inputCol="polarity", outputCol="label")

# feature selector
selector = ChiSqSelector(numTopFeatures=10, featuresCol="features",
                         outputCol="selectedFeatures", labelCol="label")
names_df = df.select('name')
names_df.show()
# listings.filter(listings["name"].isNotNull())

#%%
names_df = names_df.dropna(subset='name')
names_df.show()

#%%
tokenizer = Tokenizer(inputCol="name", outputCol="words")
wordsData = tokenizer.transform(names_df)
wordsData.show()

#%%
stopwords = []
stopwords.extend(StopWordsRemover.loadDefaultStopWords('english'))
remover = StopWordsRemover(inputCol="words", outputCol="cleanedWords",
                           stopWords=stopwords)
cleanedWordsData = remover.transform(wordsData)
cleanedWordsData.show()

#%%
hashingTF = HashingTF(numFeatures=4096, inputCol="cleanedWords",
                      outputCol="tfFeatures")
tfWordsData = hashingTF.transform(cleanedWordsData)
tfWordsData.show()

#%%
idf = IDF(inputCol="tfFeatures", outputCol="tfIdfFeatures")
if __name__ == '__main__':
    conn = S3Connection()
    sc = set_spark_context()
    sqc = SQLContext(sc)
    sm = SparkModel(sc, conn, rdd_path='meta_rdd.pkl')

    logging.basicConfig(format='%(asctime)s %(message)s')
    grid_search = logging.getLogger('main')
    grid_search.setLevel(logging.DEBUG)
    handler = logging.FileHandler('../logs/grid_search.txt')
    grid_search.addHandler(handler)

    bow_rdd = sm.RDD.map(lambda (key, (bow, meta)): (key, bow))
    bow_rdd = sm.RDD.join(sm.target).map(lambda (key, (bow, label)): (label, bow))

    remover = StopWordsRemover(inputCol="raw", outputCol="words")
    hashingTF = HashingTF(inputCol=remover.getOutputCol(),
                          outputCol="word_counts", numFeatures=10000)
    tfidf = IDF(inputCol=hashingTF.getOutputCol(), outputCol="features",
                minDocFreq=20)
    indexer = StringIndexer(inputCol="string_label", outputCol="label")

    for model in [GBTClassifier(), RandomForestClassifier(),
                  MultilayerPerceptronClassifier()]:
        if type(model) == MultilayerPerceptronClassifier:
            layers = [10000, 100, 2]
            model = MultilayerPerceptronClassifier(maxIter=100, layers=layers,
                                                   blockSize=128)
        pipeline = Pipeline(stages=[remover, hashingTF, tfidf,  # scaler,
                                    indexer, model])
        scores = cross_val_score(pipeline, bow_rdd)
def main(root_path):
    timeStamp = str(int(time()))
    # TODO: change this for a full run
    num = 1000  # 128915 is the total
    out_file_name = '../out/output-' + timeStamp + "-" + str(num) + '.txt'
    out_file = open(out_file_name, 'w')

    start = time()
    spark = init_spark()
    json_files = read_json_files(root_path, spark, num)
    data = get_body_text(spark, json_files)
    print("data reading done")

    # clean the data
    word_clean_up_F = F.udf(lambda x: clean_up(x), StringType())
    data = data.withColumn("body_text_cleaned", word_clean_up_F("body_text"))
    data = data.select("body_text_cleaned")
    print("data processing done")

    tokenizer = Tokenizer(inputCol="body_text_cleaned", outputCol="words")
    token_DataFrame = tokenizer.transform(data)
    token_DataFrame = token_DataFrame.select("words")

    # Remove stopwords
    remover = StopWordsRemover(inputCol="words", outputCol="filtered")
    cleaned_DataFrame = remover.transform(token_DataFrame)
    cleaned_DataFrame = cleaned_DataFrame.select('filtered')

    # Count vectorizer
    cv_tmp = CountVectorizer(inputCol="filtered", outputCol="count_features")
    cvmodel = cv_tmp.fit(cleaned_DataFrame)
    count_dataframe = cvmodel.transform(cleaned_DataFrame)
    count_dataframe = count_dataframe.select('count_features')

    # TF-IDF vectorizer
    tfidf = IDF(inputCol="count_features", outputCol="features")
    tfidfmodel = tfidf.fit(count_dataframe)
    tfidf_dataframe = tfidfmodel.transform(count_dataframe).select("features")
    print("Ready to fit with the LDA model")

    # Fit the LDA model
    num_topics = 5
    max_iterations = 20
    lda_start = time()
    lda = LDA(seed=1, optimizer="em", k=num_topics, maxIter=max_iterations)
    lda_model = lda.fit(tfidf_dataframe)
    lda_transformed = lda_model.transform(tfidf_dataframe)
    lda_end = time()
    print("LDA complete")
    # joblib.dump(lda_model, 'lda.csv')

    # Get terms per topic
    topics = lda_model.topicsMatrix()
    vocabArray = cvmodel.vocabulary
    wordNumbers = 15  # number of words per topic
    topicIndices = lda_model.describeTopics(
        maxTermsPerTopic=wordNumbers).rdd.map(tuple)
    topics_final = topicIndices.map(
        lambda topic: topic_render(topic, wordNumbers, vocabArray)).collect()

    for topic in range(len(topics_final)):
        print("Topic " + str(topic) + ":")
        print("Topic " + str(topic) + ":", file=out_file)
        print(topics_final[topic])
        print(topics_final[topic], file=out_file)

    print("Full runtime : {} min. ".format((time() - start) / 60))
    print("LDA runtime : {} min. ".format((lda_end - lda_start) / 60))
    print("Check " + out_file.name)

    cleaned_DataFrame.cache()
    lda_transformed.cache()

    # Data visualization
    data = format_data_to_pyldavis(cleaned_DataFrame, cvmodel,
                                   lda_transformed, lda_model)
    print("Preparing data with pyLDAvis ...")
    filter_bad_docs(data)
    py_lda_prepared_data = pyLDAvis.prepare(**data)
    file_name = '../out/data-viz-' + timeStamp + '.html'
    print("Saving pyLDAvis html page ...")
    pyLDAvis.save_html(py_lda_prepared_data, file_name)
    pyLDAvis.show(py_lda_prepared_data)
    spark.stop()
tokenized = tokenizer.transform(sentenceDataFrame)
tokenized.select("sentence", "words")\
    .withColumn("tokens", countTokens(col("words"))).show(truncate=False)

regexTokenized = regexTokenizer.transform(sentenceDataFrame)
regexTokenized.select("sentence", "words") \
    .withColumn("tokens", countTokens(col("words"))).show(truncate=False)

from pyspark.ml.feature import StopWordsRemover

sentenceData = spark.createDataFrame(
    [(0, ["I", "saw", "the", "red", "balloon"]),
     (1, ["Mary", "had", "a", "little", "lamb"])],
    ["id", "raw"])
remover = StopWordsRemover(inputCol="raw", outputCol="filtered")
remover.transform(sentenceData).show(truncate=False)

from pyspark.ml.feature import NGram

wordDataFrame = spark.createDataFrame(
    [(0, ["Hi", "I", "heard", "about", "Spark"]),
     (1, ["I", "wish", "Java", "could", "use", "case", "classes"]),
     (2, ["Logistic", "regression", "models", "are", "neat"])],
    ["id", "words"])
remover = StopWordsRemover(inputCol="words", outputCol="filtered")
# transform() returns a DataFrame; call show() separately so the result
# can be reused (show() itself returns None)
word = remover.transform(wordDataFrame)
word.show(truncate=False)
ngram = NGram(n=2, inputCol="filtered", outputCol="ngrams")
ngramDataFrame = ngram.transform(word)
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")

# Plot the word cloud:
plot_word_cloud(tokenized, "words")

# ### Remove common (stop) words from each review

# Note that the ride reviews contain a number of common words such as "the"
# that we do not expect to be relevant.

# Use the
# [StopWordsRemover](http://spark.apache.org/docs/latest/api/python/pyspark.ml.html#pyspark.ml.feature.StopWordsRemover)
# class to remove these so-called *stop words*:
from pyspark.ml.feature import StopWordsRemover
remover = StopWordsRemover(inputCol="words", outputCol="relevant_words")
remover.getStopWords()[:10]
removed = remover.transform(tokenized)
removed.select("words", "relevant_words").head(5)

# Plot the word cloud:
plot_word_cloud(removed, "relevant_words")

# ### Count the frequency of words in each review

# Use the
# [CountVectorizer](http://spark.apache.org/docs/latest/api/python/pyspark.ml.html#pyspark.ml.feature.CountVectorizer)
# class to compute the term frequency:
from pyspark.ml.feature import CountVectorizer
vectorizer = CountVectorizer(inputCol="relevant_words",
                             outputCol="word_count_vector", vocabSize=100)
# aprendizajemaquina_df.select(removerPuntuacionNumeros(col('value'))).show(truncate=False)

######### tokenizer (see also RegexTokenizer)
from pyspark.ml.feature import Tokenizer
from pyspark.sql.functions import col, udf
from pyspark.sql.types import IntegerType

tokenizer = Tokenizer(inputCol="value", outputCol="palabras")
aprendizajemaquina_df = tokenizer.transform(aprendizajemaquina_df)
aprendizajemaquina_df.select("value", "palabras").show(6, False)
aprendizajemaquina_df.select("palabras").show(5, False)

######### stop words
from pyspark.ml.feature import StopWordsRemover

listaConectores = StopWordsRemover.loadDefaultStopWords("spanish")
remover = StopWordsRemover(inputCol="palabras", outputCol="filtro_conectores",
                           stopWords=listaConectores)
aprendizajemaquina_df = remover.transform(aprendizajemaquina_df)
aprendizajemaquina_df.select("palabras").show(5, False)
aprendizajemaquina_df.select("filtro_conectores").show(5, False)

############# Model training

###### Words to vectors with Word2Vec
from pyspark.ml.feature import Word2Vec
from pyspark.ml import Pipeline

w2v = Word2Vec(vectorSize=100, minCount=0,
def main(dict):
    filename = dict['filename']
    savedmodelName = dict['modelname']

    def myFunc(input):
        lines = input.split("\n")
        for line in lines:
            parts = line.split(";")
            Category = parts[-1]
            Sentence = parts[1]
            # fixed: "http[s]?" so plain http:// URLs are also stripped
            url_pattern = re.compile(r'(http[s]?://[\w./]+)*')
            rt_pattern = re.compile(r'RT @\w+: ')
            r_pattern = re.compile(r'@\w+ ')
            Sentence = r_pattern.sub(
                r'', rt_pattern.sub(r'', url_pattern.sub(r'', Sentence))).replace(
                '\n', ' ').strip()
            return (Category, Sentence)

    file = sc.textFile("4CVTweets/" + filename)
    lines = file.map(myFunc)
    sentenceDataFrame = spark.createDataFrame(lines, ["label", "sentence"])
    (trainingData, testData) = sentenceDataFrame.randomSplit([0.7, 0.3])

    df = spark.createDataFrame([(0, "NO"), (1, "crash"), (2, "fire"),
                                (3, "shooting")], ["id", "label"])

    # start building the pipeline
    # NO: 0, crash: 1, fire: 2, shooting: 3
    indexer = StringIndexer(inputCol="label", outputCol="categoryIndex")
    indexer.fit(df)
    tokenizer = RegexTokenizer(pattern="\\w+", inputCol="sentence",
                               outputCol="words", gaps=False)
    remover = StopWordsRemover(inputCol="words", outputCol="filtered")
    hashingTF = HashingTF(inputCol="filtered", outputCol="rawFeatures",
                          numFeatures=10000)
    # Compute the Inverse Document Frequency (IDF) given a collection of documents.
    idf = IDF(inputCol="rawFeatures", outputCol="features", minDocFreq=5)

    # Random forest
    rf = RandomForestClassifier(labelCol="categoryIndex", featuresCol="features",
                                numTrees=100, maxDepth=10)
    # mlr = LogisticRegression(maxIter=100, regParam=0.3, elasticNetParam=0.8,
    #                          family="multinomial", featuresCol="features",
    #                          labelCol="categoryIndex")
    # Naive Bayes
    nb = NaiveBayes(labelCol="categoryIndex", featuresCol="features", smoothing=1)
    # converter = IndexToString(inputCol="prediction", outputCol="originalCategory")

    pipeline = Pipeline(stages=[indexer, tokenizer, remover, hashingTF, idf, nb])
    model = pipeline.fit(trainingData)

    # Evaluate the model on the training data (offline model only)
    predictionsForTraining = model.transform(trainingData)
    predictionsForTraining.show(100, False)

    joindf = spark.createDataFrame([(0.0, "NO"), (1.0, "crash"), (2.0, "fire"),
                                    (3.0, "shooting")],
                                   ["prediction", "Predictlabel"])
    innerjoin = predictionsForTraining.join(
        joindf, joindf.prediction == predictionsForTraining.prediction).drop(
        joindf.prediction)
    # innerjoin.select("label", "categoryIndex", "prediction", "Predictlabel").show(1000, False)
    innerjoin.select("label", "Predictlabel").show(1000, False)

    evaluator1 = MulticlassClassificationEvaluator(labelCol="categoryIndex",
                                                   predictionCol="prediction",
                                                   metricName="accuracy")
    accuracy = evaluator1.evaluate(predictionsForTraining)
    print("Train Accuracy = %g " % (accuracy))
    print("Train Error = %g " % (1.0 - accuracy))

    predictions = model.transform(testData)
    evaluator2 = MulticlassClassificationEvaluator(labelCol="categoryIndex",
                                                   predictionCol="prediction",
                                                   metricName="accuracy")
    accuracy = evaluator2.evaluate(predictions)
    print("Test Accuracy = %g " % (accuracy))
    print("Test Error = %g " % (1.0 - accuracy))

    savePath = "tmp/pipeline/" + savedmodelName
    model.write().overwrite().save(savePath)
    print("model for Location", savedmodelName, "saved successfully.")
# <h1>Tokenizer on a particular column</h1>

# In[46]:
tokenized = tokenizer.transform(df_person)
tokenized.select("desc", "words").withColumn(
    "tokens", countTokens(col("words"))).show(truncate=False)

# <h3>Stop Word Removal</h3>

# In[55]:
# Load Stop Word Remover
from pyspark.ml.feature import StopWordsRemover
remover = StopWordsRemover(inputCol="words", outputCol="filtered")
remover.transform(tokenized).show(truncate=False)

# <h1>Binary Tokenization Example</h1>

# In[56]:
from pyspark.ml.feature import Binarizer
continuousDataFrame = spark.createDataFrame([(0, 0.1), (1, 0.8), (2, 0.2)],
                                            ["id", "feature"])
binarizer = Binarizer(threshold=0.5, inputCol="feature",
                      outputCol="binarized_feature")
def create_w2v_model():
    spark = SparkSession \
        .builder \
        .appName("SimpleApplication") \
        .config("spark.executor.memory", "2g") \
        .config("spark.driver.memory", "2g") \
        .config("spark.memory.offHeap.enabled", True) \
        .config("spark.memory.offHeap.size", "2g") \
        .getOrCreate()

    input_file = spark.sparkContext.wholeTextFiles(PATH)

    print("Preparing data (1)...")
    prepared_data = input_file.map(lambda x: (x[0], remove_punctuation(x[1])))

    print("Preparing data (2)...")
    df = prepared_data.toDF()

    print("Preparing data (3)...")
    prepared_df = df.selectExpr('_2 as text')

    print("Tokenizing...")
    tokenizer = Tokenizer(inputCol='text', outputCol='words')
    words = tokenizer.transform(prepared_df)

    print("Removing stop words...")
    stop_words = StopWordsRemover.loadDefaultStopWords('russian')
    remover = StopWordsRemover(inputCol="words", outputCol="filtered",
                               stopWords=stop_words)
    # fixed: the remover was created but never applied, so Word2Vec was
    # trained on the unfiltered tokens
    filtered = remover.transform(words)

    print("Building the model...")
    word2Vec = Word2Vec(vectorSize=50, inputCol='filtered', outputCol='result',
                        minCount=2)
    model = word2Vec.fit(filtered)

    print("Saving the model...")
    today = datetime.datetime.today()
    model_name = today.strftime("model/kurs_model")
    model.save(model_name)
    print("Model " + model_name + " saved")

    spark.stop()
def Topic_Modeling(tweet_df):
    # Initializing models for tokenizing the tweets for each user
    tokenizer = Tokenizer(inputCol="concat_ws( , collect_list(tweet))",
                          outputCol="words")
    regexTokenizer = RegexTokenizer(
        inputCol="concat_ws( , collect_list(tweet))",
        outputCol="tokens", pattern="\\W+", minTokenLength=4)

    # udf for counting tokens
    countTokens = udf(lambda words: len(words), IntegerType())

    # Tokenizing the data
    regexTokenized = regexTokenizer.transform(tweet_df)
    regexTokenized.select("user_id", "concat_ws( , collect_list(tweet))", "tokens") \
        .withColumn("token_count", countTokens(col("tokens"))).show()

    # Defining udf for stemming using the nltk Porter stemmer
    p_stemmer = PorterStemmer()

    def stem(x):
        stemmed_tokens = [p_stemmer.stem(i) for i in x]
        return stemmed_tokens

    stem_udf = udf(lambda x: stem(x), ArrayType(StringType()))

    # Stemming tokens (fixed: assign the result so it can be used downstream)
    stemmedTokens = regexTokenized.withColumn("Stemmed_tokens", stem_udf('tokens'))
    stemmedTokens.select("user_id", "concat_ws( , collect_list(tweet))",
                         "tokens", "Stemmed_tokens").show()

    # Defining model for stop words
    stopWords_remover = StopWordsRemover(inputCol="Stemmed_tokens",
                                         outputCol="filtered")
    default_StopWords = stopWords_remover.getStopWords()
    default_StopWords.append("https")
    stopWords_remover.setStopWords(default_StopWords)

    # Removing stop words (fixed: transform the stemmed frame, which actually
    # contains the Stemmed_tokens input column)
    filtered_df = stopWords_remover.transform(stemmedTokens)
    filtered_df.withColumn("Pre_tokens", countTokens(col("Stemmed_tokens"))) \
        .withColumn("Post_tokens", countTokens(col("filtered"))).show()

    # Defining model to convert text documents to vectors of token counts
    countVect = CountVectorizer(inputCol="filtered", outputCol="features",
                                vocabSize=1000, minDF=5)
    model = countVect.fit(filtered_df)
    vectorizer = model.transform(filtered_df).select("user_id", "features")
    vectorizer.show(10)

    # Initializing LDA topic modeling
    lda = LDA(k=10, maxIter=10)
    lda_model = lda.fit(vectorizer)
    # topics = lda_model.topicsMatrix()
    ll = lda_model.logLikelihood(vectorizer)
    lp = lda_model.logPerplexity(vectorizer)
    print("The lower bound on the log likelihood of the entire corpus: " + str(ll))
    print("The upper bound on perplexity: " + str(lp))

    # Describe topics.
    topics = lda_model.describeTopics(5)
    print("The topics described by their top-weighted terms:")
    topics.show(truncate=False)

    # UDF for formatting the topics for desired usage
    zip_ = udf(
        lambda x, y: list(zip(x, y)),
        ArrayType(StructType([
            # Adjust types to reflect data types
            StructField("first", IntegerType()),
            StructField("second", DoubleType())
        ])))
    topics_df = topics.withColumn("tmp", zip_("termIndices", "termWeights")) \
        .withColumn("tmp", explode("tmp")) \
        .select("topic", col("tmp.first").alias("termIndices"),
                col("tmp.second").alias("termWeights"))

    # Extracting the document vocabulary
    vocab = model.vocabulary

    # UDF for extracting words for the term indices assigned to each topic
    words_ = udf(lambda x: vocab[x])
    topics_df = topics_df.withColumn("Words", words_('termIndices'))
    topics_df = topics_df.groupBy("topic").agg(
        collect_list(col("Words")).alias("Words"),
        collect_list(col("termIndices")).alias("termIndices"),
        collect_list(col("termWeights")).alias("termWeights"))
    print("The topics described by their top-weighted terms:")
    print(topics_df.toPandas())

    # Shows the result
    transformed = lda_model.transform(vectorizer)
    transformed.show(truncate=False)

    # UDFs to extract the top topic for each user
    max_value = udf(lambda x: max(x).item())
    max_index = udf(lambda x: x.argmax().item())
    User_Topics_df = transformed.withColumn(
        "Topic_Prob", max_value("topicDistribution")).withColumn(
        "Topic", max_index("topicDistribution")).select(
        "user_id", "Topic_Prob", "Topic", "topicDistribution")
    User_Topics_df.show(truncate=False)

    # Number of users for each topic assigned to them
    User_Topics_df.groupBy("Topic").count().show()

    # Saving the user profile as CSV output
    user_profile = User_Topics_df.join(topics_df,
                                       User_Topics_df.Topic == topics_df.topic)
    user_profile.toPandas().to_csv('user_profile.csv')
        tmp.append(i.value.split("||")[0])
        tmp.append(i.value.split("||")[1])
        data.append(tmp)

print(len(data))
df = sqlContext.createDataFrame(data, schema=["category", "text"])

# regular expression tokenizer
regex_tokenizer = RegexTokenizer(inputCol="text", outputCol="words",
                                 pattern="\\W")

# stop words
stop_words = list(set(stopwords.words('english')))
stop_words_remover = StopWordsRemover(
    inputCol="words", outputCol="filtered").setStopWords(stop_words)

# bag of words count
count_vectors = CountVectorizer(inputCol="filtered", outputCol="features",
                                vocabSize=10000, minDF=5)

label_string_index = StringIndexer(inputCol="category", outputCol="label")
label_string_index.setHandleInvalid("keep")

pipeline = Pipeline(stages=[regex_tokenizer, stop_words_remover, count_vectors,
                            label_string_index])

(training_data, test_data) = df.randomSplit([0.8, 0.2], seed=100)
pipeline_fit = pipeline.fit(training_data)
pipeline_fit.save("lr_pipeline")
def login():
    message = ''
    e_result = ''
    s_result = ''
    t_result = ''
    j_result = ''
    if request.method == 'POST':
        post = request.form.get('text')  # access the data inside
        if len(post) >= 100:
            test = pd.DataFrame([post], columns=['post'])
            newrows = []

            def filter_text(post):
                """Decide whether or not we want to use the post."""
                # should remove link-only posts here
                return len(post) > 0

            reg_punc = re.compile('[%s]' % re.escape(string.punctuation))

            def preprocess_text(post):
                """Remove any junk we don't want to use in the post."""
                # Remove links
                post = re.sub(r'http\S+', '', post, flags=re.MULTILINE)
                # All lowercase
                post = post.lower()
                # Remove punctuation
                post = reg_punc.sub('', post)
                return post

            def create_new_rows(row):
                posts = row['post']
                rows = []
                p = preprocess_text(posts)
                rows.append({'post': p})
                return rows

            for index, row in test.iterrows():
                newrows += create_new_rows(row)
            test = pd.DataFrame(newrows)
            df = spark.createDataFrame(test)

            # Create a length column to be used as a future feature
            df = df.withColumn('length', length(df['post']))

            types = [
                'INFJ', 'ENTP', 'INTP', 'INTJ', 'ENTJ', 'ENFJ', 'INFP', 'ENFP',
                'ISFP', 'ISTP', 'ISFJ', 'ISTJ', 'ESTP', 'ESFP', 'ESTJ', 'ESFJ'
            ]
            types = [x.lower() for x in types]

            tokenizer = Tokenizer(inputCol="post", outputCol="words")
            tokenized = tokenizer.transform(df)

            # Remove stop words, including the type labels themselves
            stopwordList = types
            stopwordList.extend(StopWordsRemover().getStopWords())
            stopwordList = list(set(stopwordList))  # optional de-duplication
            remover = StopWordsRemover(inputCol="words", outputCol="filtered",
                                       stopWords=stopwordList)
            newFrame = remover.transform(tokenized)

            # Run the hashing term frequency
            hashing = HashingTF(inputCol="filtered", outputCol="hashedValues")
            # Transform into a DF
            hashed_df = hashing.transform(newFrame)

            # Fit the IDF on the data set
            idf = IDF(inputCol="hashedValues", outputCol="idf_token")
            idfModel = idf.fit(hashed_df)
            rescaledData = idfModel.transform(hashed_df)

            # Create feature vectors
            # idf = IDF(inputCol='hash_token', outputCol='idf_token')
            clean_up = VectorAssembler(inputCols=['idf_token', 'length'],
                                       outputCol='features')
            output = clean_up.transform(rescaledData)

            ei_model = NaiveBayesModel.load("static/models/EI_Predictor.h5")
            sn_model = NaiveBayesModel.load("static/models/SN_Predictor.h5")
            tf_model = NaiveBayesModel.load("static/models/TF_Predictor.h5")
            jp_model = NaiveBayesModel.load("static/models/JP_Predictor.h5")

            test_e = ei_model.transform(output)
            e = test_e.toPandas()["prediction"].values[0]
            e_result = "I" if e == 0 else "E"

            test_s = sn_model.transform(output)
            s = test_s.toPandas()["prediction"].values[0]
            s_result = "N" if s == 0 else "S"

            test_t = tf_model.transform(output)
            t = test_t.toPandas()["prediction"].values[0]
            t_result = "F" if t == 0 else "T"

            test_j = jp_model.transform(output)
            j = test_j.toPandas()["prediction"].values[0]
            j_result = "P" if j == 0 else "J"
        else:
            message = "Please tell us more about yourself!"
    return render_template('index.html', message=message, test_e=e_result,
                           test_s=s_result, test_t=t_result, test_j=j_result)
# COMMAND ----------

from pyspark.ml.feature import RegexTokenizer
rt = RegexTokenizer()\
    .setInputCol("Description")\
    .setOutputCol("DescOut")\
    .setPattern(" ")\
    .setGaps(False)\
    .setToLowercase(True)
rt.transform(sales.select("Description")).show(20, False)

# COMMAND ----------

from pyspark.ml.feature import StopWordsRemover
englishStopWords = StopWordsRemover.loadDefaultStopWords("english")
stops = StopWordsRemover()\
    .setStopWords(englishStopWords)\
    .setInputCol("DescOut")
stops.transform(tokenized).show()

# COMMAND ----------

from pyspark.ml.feature import NGram
unigram = NGram().setInputCol("DescOut").setN(1)
bigram = NGram().setInputCol("DescOut").setN(2)
unigram.transform(tokenized.select("DescOut")).show(False)
bigram.transform(tokenized.select("DescOut")).show(False)
def main():
    # read data
    yahoo = spark.read.csv(f'{BUILDDIR}/yahoo.csv', header=True)
    data = yahoo.select(['sector', 'description']).dropna()

    # tokenize texts based on regular expression
    tokenize = RegexTokenizer(inputCol='description', outputCol='words_all',
                              pattern='\\W')

    # remove stop words
    stopwords = '\n'.join((DATADIR / 'stopwords' / f).read_text().strip()
                          for f in ('mysql.txt', 'nltk.txt')).splitlines()
    remove_stopwords = StopWordsRemover(
        inputCol='words_all', outputCol='words_clean').setStopWords(stopwords)

    # get word frequencies using a simple count (bag of words)
    add_wordcount = CountVectorizer(inputCol='words_clean',
                                    outputCol='words_count',
                                    vocabSize=10000, minDF=5)

    # get tf-idf word frequencies
    add_wordtf = HashingTF(inputCol='words_clean', outputCol='words_tf',
                           numFeatures=10000)
    add_wordidf = IDF(inputCol='words_tf', outputCol='words_tfidf',
                      minDocFreq=5)

    # prepare output values
    index_target = StringIndexer(inputCol='sector', outputCol='label')

    # data preparation pipeline
    pipeline_wordcount = Pipeline(stages=[
        tokenize,
        remove_stopwords,
        add_wordcount,
        add_wordtf,
        add_wordidf,
        index_target,
    ])

    # apply data preparation pipeline
    prepared = pipeline_wordcount.fit(data).transform(data)

    # split into training and testing sets
    training, testing = prepared.randomSplit([0.7, 0.3], seed=100500)

    # fit logistic regression models
    logistic_wordcount = LogisticRegression(regParam=0.3, elasticNetParam=0,
                                            featuresCol='words_count',
                                            labelCol='label',
                                            predictionCol='prediction',
                                            probabilityCol='probability')
    logistic_tfidf = LogisticRegression(regParam=0.3, elasticNetParam=0,
                                        featuresCol='words_tfidf',
                                        labelCol='label',
                                        predictionCol='prediction',
                                        probabilityCol='probability')

    for model, name in ((logistic_wordcount, 'Word count + Logistic regression'),
                        (logistic_tfidf, 'TF-IDF + Logistic regression')):
        predicted = model.fit(training).transform(testing)
        evaluator = MulticlassClassificationEvaluator(
            predictionCol='prediction', metricName='accuracy')
        print(f'{name} model accuracy = {evaluator.evaluate(predicted)}')
#### DATA DRIVEN APPROACH

# In[21]:
# we obtain the stop words from a website
import requests
stop_words = requests.get(
    'http://ir.dcs.gla.ac.uk/resources/linguistic_utils/stop_words').text.split()
len(stop_words)

# In[22]:
from pyspark.ml.feature import StopWordsRemover
sw_filter = StopWordsRemover() \
    .setStopWords(stop_words) \
    .setCaseSensitive(False) \
    .setInputCol("words") \
    .setOutputCol("filtered")

# In[23]:
from pyspark.ml.feature import CountVectorizer
# we will remove words that appear in 5 docs or fewer
cv = CountVectorizer(minTF=1., minDF=5., vocabSize=2**17) \
    .setInputCol("filtered") \
    .setOutputCol("tf")

# In[24]:
# we now create a pipelined transformer
cv_pipeline = Pipeline(stages=[tokenizer, sw_filter, cv]).fit(airportCleanDF)
    return spark.createDataFrame(row_rdd, ["label", "text"])

##
## Define the pipeline stages
##

## Tokenize the messages
tokenizer = RegexTokenizer(inputCol="text", outputCol="words",
                           minTokenLength=3, gaps=False, pattern="[a-zA-Z]+")

## Remove ignored words
stopWordsRemover = StopWordsRemover(inputCol=tokenizer.getOutputCol(),
                                    outputCol="filtered",
                                    stopWords=["the", "a", "", "in", "on", "at",
                                               "as", "not", "for"],
                                    caseSensitive=False)

## Hash the words
hashingTF = HashingTF(inputCol=stopWordsRemover.getOutputCol(),
                      outputCol="wordToIndex", numFeatures=1 << 10)

## Create inverse document frequencies model
idf = IDF(inputCol=hashingTF.getOutputCol(), outputCol="tf_idf", minDocFreq=4)

if algo == "gbm":
    ## Create GBM model
joinDF = clicksDF.join(jobsDF, clicksDF._3 == jobsDF._1, "inner")

# concatenate job title and description; the second field is the click ID
jobsfeatures = joinDF.map(lambda x: (cleanhtml(x[5] + ' ' + x[6]), x[0]))
jobsfeaturesDF = sqlContext.createDataFrame(jobsfeatures)
# or jobsfeaturesDF = jobsfeatures.toDF()
# If everything went well, running jobsfeaturesDF.take(1) shows one record
# with no more HTML in it.

# tokenizer to create a "terms" column, so for example from _1 we have terms
tokenizer = Tokenizer(inputCol="_1", outputCol="terms")
termsData = tokenizer.transform(jobsfeaturesDF)

# remover to remove stop words that don't contribute, so from terms we have filtered
remover = StopWordsRemover(inputCol="terms", outputCol="filtered")
filteredTermsData = remover.transform(termsData)

# http://spark.apache.org/docs/latest/ml-features.html
# Both HashingTF and CountVectorizer can be used to generate the term frequency
# vectors. HashingTF is a Transformer which takes sets of terms and converts
# those sets into fixed-length feature vectors. In text processing, a "set of
# terms" might be a bag of words. HashingTF utilizes the hashing trick.
# So from filtered we have rawFeatures.
tf = HashingTF(inputCol="filtered",
               outputCol="rawFeatures").transform(filteredTermsData)

# IDF is an Estimator which is fit on a dataset and produces an IDFModel. The
# IDFModel takes feature vectors (generally created from HashingTF or
# CountVectorizer) and scales each column. Intuitively, it down-weights columns
# which appear frequently in a corpus.
idf = IDF(inputCol="rawFeatures", outputCol="features").fit(tf)

# TF-IDF. Use tfidf.take(1) to see the first record.
tfidf = idf.transform(tf)
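# The comment above notes that CountVectorizer is the other way to build term
# frequency vectors. A minimal sketch of that alternative, assuming the same
# `filteredTermsData` DataFrame; unlike HashingTF it learns an explicit
# vocabulary, at the cost of an extra fit() pass over the data.
from pyspark.ml.feature import CountVectorizer

cv_model = CountVectorizer(inputCol="filtered", outputCol="rawFeatures",
                           vocabSize=10000, minDF=2).fit(filteredTermsData)
tf_cv = cv_model.transform(filteredTermsData)
idf_cv = IDF(inputCol="rawFeatures", outputCol="features").fit(tf_cv)
tfidf_cv = idf_cv.transform(tf_cv)
# cv_model.vocabulary[i] recovers the term behind feature index i, which the
# hashing trick cannot do.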
.withColumn("tokens", countTokens(col("words"))).show(truncate=False) regexTokenized = regexTokenizer.transform(sentenceDataFrame) regexTokenized.select("sentence", "words") \ .withColumn("tokens", countTokens(col("words"))).show(truncate=False) # StopWordsRemover from pyspark.ml.feature import StopWordsRemover sentenceData = spark.createDataFrame([ (0, ["I", "saw", "the", "red", "balloon"]), (1, ["Mary", "had", "a", "little", "lamb"]) ], ["id", "raw"]) remover = StopWordsRemover(inputCol="raw", outputCol="removeded") remover.transform(sentenceData).show(truncate=False) # NGram from pyspark.ml import Pipeline from pyspark.ml.feature import IDF, Tokenizer from pyspark.ml.feature import NGram sentenceData = spark.createDataFrame([ (0.0, "I love Spark"), (0.0, "I love python"), (1.0, "I think ML is awesome")], ["label", "sentence"]) tokenizer = Tokenizer(inputCol="sentence", outputCol="words") ngram = NGram(n=2, inputCol="words", outputCol="ngrams") idf = IDF(inputCol="rawFeatures", outputCol="features")
from pyspark import SparkContext, SparkConf
from pyspark.sql.types import *
import pandas as pd
import pickle

# open pickles in binary mode ("rb"), which works on both Python 2 and 3
ICD9CODES = pickle.load(open("./data/ICD9CODES.p", "rb"))
ICD9CODES_TOP10 = pickle.load(open("./data/ICD9CODES_TOP10.p", "rb"))
ICD9CODES_TOP50 = pickle.load(open("./data/ICD9CODES_TOP50.p", "rb"))
ICD9CAT_TOP10 = pickle.load(open("./data/ICD9CAT_TOP10.p", "rb"))
ICD9CAT_TOP50 = pickle.load(open("./data/ICD9CAT_TOP50.p", "rb"))

from pyspark.ml.feature import StopWordsRemover

STOPWORDS_v0 = StopWordsRemover.loadDefaultStopWords("english") + ICD9CODES
STOPWORDS_v0 = [str(i) for i in STOPWORDS_v0]
# print "TFIDF v0 stop words"
# print STOPWORDS_v0

from pyspark.ml.feature import HashingTF, IDF, RegexTokenizer, StopWordsRemover

def create_TFIDF_v0(trainData, applyData, inputCol="text", outputCol="features",
                    minDocFreq=3, numFeatures=20):
    tokenizer = RegexTokenizer(pattern=r"[.:\s]+", inputCol=inputCol,
                               outputCol="z_words")
    wordsData1 = tokenizer.transform(trainData)
# Create contexts
sc = SparkContext(appName="SparkWorkshop")
sqlContext = SQLContext(sc)

# Set up user-defined functions and objects for transformations
expression = re.compile(r'<.*?>')
parser = HTMLParser.HTMLParser()

def strip_tags(html):
    return parser.unescape(expression.sub('', html))

strip_tags_udf = udf(strip_tags)
tokenizer = Tokenizer(inputCol="comment_clean", outputCol="words")
stopWordsRemover = StopWordsRemover(inputCol="words", outputCol="tokens")

# Load data
comments = sqlContext.read.json(fn)

# Calculate tokens dataframe as one pipeline
tokens = stopWordsRemover.transform(
    tokenizer.transform(
        comments.withColumn("comment_clean",
                            strip_tags_udf(comments["comment_text"]))
    ))\
    .select(explode("tokens").alias("token"))\
    .groupBy("token")\
    .count()\
    .orderBy("count", ascending=False)\
    .select("count")\
# %%
# Compute the sentiment column based on the rating
sentiment = when(col("rating") <= 5, 0).otherwise(1)
df = df.withColumn("sentiment", sentiment)
df = df.withColumn('length', length(df['review']))

# %% [markdown]
# ## Feature Transformation

# %%
from pyspark.ml.feature import Tokenizer, StopWordsRemover, CountVectorizer, IDF, StringIndexer

tokenizer = Tokenizer(inputCol="review", outputCol="token_text")
stopremove = StopWordsRemover(inputCol='token_text', outputCol='stop_tokens')
count_vec = CountVectorizer(inputCol='stop_tokens', outputCol='c_vec')
idf = IDF(inputCol="c_vec", outputCol="tf_idf")
pos_neg = StringIndexer(inputCol='sentiment', outputCol='label')

# %%
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.linalg import Vector

# %%
clean_up = VectorAssembler(inputCols=['tf_idf', 'length'], outputCol='features')

# %%
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.classification import LinearSVC
pos_rdd = pos.map(
    lambda p: Row(text=p.encode('utf-8').strip(), label=float(1.0)))
neg_rdd = neg.map(
    lambda n: Row(text=n.encode('utf-8').strip(), label=float(0.0)))

pos_all = spark.createDataFrame(pos_rdd).withColumn(
    "label", lit(1.0)).withColumn("id", monotonically_increasing_id())
neg_all = spark.createDataFrame(neg_rdd).withColumn(
    "label", lit(0.0)).withColumn("id", monotonically_increasing_id())
training = pos_all.unionAll(neg_all)

# Configure an ML pipeline consisting of tokenizer, stop-word remover,
# hashing TF, IDF, and a classifier.
tokenizer = Tokenizer(inputCol="text", outputCol="words")
remover = StopWordsRemover(inputCol=tokenizer.getOutputCol(),
                           outputCol="relevant_words")
hashingTF = HashingTF(inputCol=remover.getOutputCol(), outputCol="rawFeatures")
idf = IDF(inputCol="rawFeatures", outputCol="features")
# countVector = CountVectorizer(inputCol="relevant_words", outputCol="features",
#                               vocabSize=10000, minDF=2.0)
# lr = LogisticRegression(maxIter=10, regParam=0.001)
nb = NaiveBayes(smoothing=1.0, modelType="multinomial")
# pipeline_lr = Pipeline(stages=[tokenizer, remover, hashingTF, idf, lr])
pipeline_nb = Pipeline(stages=[tokenizer, remover, hashingTF, idf, nb])

# Fit the pipeline to the training documents.
# model_lr = pipeline_lr.fit(training)
model_nb = pipeline_nb.fit(training)

# Reading data from kafka-topic1
def preprocess_tweets(tweets):
    tokenizer = Tokenizer(inputCol="text", outputCol="words")
    tweets = tokenizer.transform(tweets)
    remover = StopWordsRemover(inputCol="words", outputCol="filtered")
    tweets = remover.transform(tweets)
    return tweets
def do_query(issues, config_file=None, logger=None, context=None):
    """
    Gets the Latent Dirichlet Allocation (LDA) topics for words within
    articles.

    config_file must be the path to an LDA configuration file in YAML
    format. For example:

        keyword: <KEYWORD>
        optimizer: online|em
        max_iterations: <N>
        ntopics: <N>
        topic_words: <N>

    <N> must be >= 1 for each parameter.

    The keyword and words in documents are normalized, by removing all
    non-'a-z|A-Z' characters.

    Returns result of form:

        {
          <0>: [<WORD_0>, ..., <WORD_topicwords>],
          <1>: [<WORD_0>, ..., <WORD_topicwords>],
          <2>: [<WORD_0>, ..., <WORD_topicwords>],
          ...
          <ntopics>: [<WORD_0>, ..., <WORD_topicwords>],
          years: [<MIN_YEAR>, <MAX_YEAR>]
        }

    :param issues: RDD of defoe.papers.issue.Issue
    :type issues: pyspark.rdd.PipelinedRDD
    :param config_file: query configuration file
    :type config_file: str or unicode
    :param logger: logger (unused)
    :type logger: py4j.java_gateway.JavaObject
    :return: LDA topics
    :rtype: dict
    """
    with open(config_file, 'r') as f:
        config = load(f)
    keyword = config['keyword']
    optimizer = config['optimizer']
    if optimizer != 'online' and optimizer != 'em':
        raise ValueError("optimizer must be 'online' or 'em' but is '{}'"
                         .format(optimizer))
    max_iterations = config['max_iterations']
    if max_iterations < 1:
        raise ValueError('max_iterations must be at least 1')
    ntopics = config['ntopics']
    if ntopics < 1:
        raise ValueError('ntopics must be at least 1')
    topic_words = config['topic_words']
    if topic_words < 1:
        raise ValueError('topic_words must be at least 1')
    keyword = query_utils.normalize(keyword)

    # [date, ...]
    # =>
    # [(year, year), ...]
    # =>
    # (year, year)
    min_year, max_year = issues \
        .filter(lambda issue: issue.date) \
        .map(lambda issue: (issue.date.year, issue.date.year)) \
        .reduce(min_max_tuples)

    # [issue, issue, ...]
    # =>
    # [article, article, ...]
    # =>
    # [(article, 0), (article, 1), ...]
    # =>
    # [Row, Row, ...]
    articles_rdd = issues.flatMap(lambda issue: issue.articles) \
        .filter(lambda article: article_contains_word(
            article, keyword, PreprocessWordType.NORMALIZE)) \
        .zipWithIndex() \
        .map(article_idx_to_words_row)

    spark = SparkSession \
        .builder \
        .appName('lda') \
        .getOrCreate()
    articles_df = spark.createDataFrame(articles_rdd)

    remover = StopWordsRemover(inputCol='words', outputCol='filtered')
    articles_df = remover.transform(articles_df)

    vectortoriser = CountVectorizer(inputCol='filtered', outputCol='vectors')
    model = vectortoriser.fit(articles_df)
    vocabulary = model.vocabulary
    articles_df = model.transform(articles_df)

    corpus = articles_df \
        .select('idx', 'vectors') \
        .rdd \
        .map(lambda a: [a[0], Vectors.fromML(a[1])]) \
        .cache()

    # Cluster the documents into N topics using LDA.
    lda_model = LDA.train(corpus, k=ntopics, maxIterations=max_iterations,
                          optimizer=optimizer)
    topics_final = [topic_render(topic, topic_words, vocabulary)
                    for topic in lda_model.describeTopics(
                        maxTermsPerTopic=topic_words)]

    topics = [('years', [min_year, max_year])]
    for i, topic in enumerate(topics_final):
        term_words = []
        for term in topic:
            term_words.append(term)
        topics.append((str(i), term_words))
    return topics
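# A hypothetical config file matching the format documented in the docstring
# above (the values here are illustrative only):
#
#     keyword: crime
#     optimizer: em
#     max_iterations: 20
#     ntopics: 5
#     topic_words: 10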
from pyspark.sql import SQLContext
from pyspark.sql.functions import desc, explode
from pyspark.sql.types import *

from storage import Sqlite

PARTITIONS = 500
THRESHOLD = 50

if __name__ == "__main__":
    conf = SparkConf().setAppName("reddit")
    conf.set('spark.serializer', 'org.apache.spark.serializer.KryoSerializer')
    conf.set('spark.local.dir', '/mnt/work')
    conf.set('spark.driver.maxResultSize', '12g')
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)

    fields = [StructField("subreddit", StringType(), True),
              StructField("body", StringType(), True)]
    rawDF = sqlContext.read.json("file:///mnt/s3/2015/*", StructType(fields))

    # split comments into words
    tokenizer = Tokenizer(inputCol="body", outputCol="words")
    wordsDataFrame = tokenizer.transform(rawDF)
    remover = StopWordsRemover(inputCol="words", outputCol="filtered")
    filteredDataFrame = remover.transform(wordsDataFrame)

    # explode terms into individual rows
    termDataFrame = filteredDataFrame.select(
        ['subreddit', explode(filteredDataFrame.filtered).alias("term")])

    # group by subreddit and term, then count occurrences of each term per subreddit
    countsDataFrame = termDataFrame.groupBy(['subreddit', 'term']).count()

    db = Sqlite()
    countsDataFrame.select(['subreddit', 'term', 'count']) \
        .filter('count > {}'.format(THRESHOLD)) \
        .foreachPartition(db.saveSubredditWords)
    sys.exit(1)

# Transform data into ready format
df_train = (text_data.fillna("").select(
    concat(col("title"), lit(" "), col("abstract"), lit(" "),
           col("full_text")).alias("text")))

# Create pipeline objects
document_assembler = DocumentAssembler().setInputCol("text").setOutputCol("document")
tokenizer = Tokenizer().setInputCols(["document"]).setOutputCol("token")
normalizer = Normalizer().setInputCols(["token"]).setOutputCol("normalizer")
stemmer = Stemmer().setInputCols(["normalizer"]).setOutputCol("stem")
finisher = Finisher().setInputCols(["stem"]).setOutputCols(["to_spark"]) \
    .setValueSplitSymbol(" ")
stopword_remover = StopWordsRemover(inputCol="to_spark", outputCol="filtered")
tf = CountVectorizer(inputCol="filtered", outputCol="raw_features")
idf = IDF(inputCol="raw_features", outputCol="features")
lda = LDA(k=10, maxIter=10)

# Create pipeline
pipeline = Pipeline(stages=[
    document_assembler, tokenizer, normalizer, stemmer, finisher,
    stopword_remover, tf, idf, lda
])

model = pipeline.fit(df_train)
vocab = model.stages[-3].vocabulary
raw_topics = model.stages[-1].describeTopics().collect()
topic_inds = [ind.termIndices for ind in raw_topics]
"spark.sql.warehouse.dir", '/user/hive/warehouse').enableHiveSupport().getOrCreate() data = sc.textFile(trainingData) header = data.first() rdd = data.filter(lambda row: row != header) r = rdd.mapPartitions(lambda x: csv.reader(x)) r = r.map(lambda x: (processTweet(x[3]), int(x[1]))) r = r.map(lambda x: Row(sentence=x[0], label=int(x[1]))) df = spark.createDataFrame(r).orderBy(rand()).limit(500000) tokenizer = Tokenizer(inputCol="sentence", outputCol="words") remover = StopWordsRemover(inputCol="words", outputCol="base_words") hashingTF = HashingTF(numFeatures=10000, inputCol="base_words", outputCol="features") lr = LogisticRegression(maxIter=10000, regParam=0.001, elasticNetParam=0.0001) pipeline = Pipeline(stages=[tokenizer, remover, hashingTF, lr]) splits = df.randomSplit([0.6, 0.4], 223) trainSet = splits[0] testSet = splits[1] lrModel = pipeline.fit(trainSet) lrResult = lrModel.transform(testSet)
df = spark.read.option('delimiter', '\t').csv('items_data.tsv', header=True)

# Insert item ids
df = df.withColumn("id", monotonically_increasing_id())

# Init the tokenizer
tokenizerR = RegexTokenizer(inputCol='description', outputCol='tokenized',
                            pattern='\\W')

# Tokenize the item description text
df = tokenizerR.transform(df)

# Create the Italian stopwords collection
stop_words = list(stopwords.words('italian'))

# Remove the Italian stopwords from the item description
remover = StopWordsRemover(inputCol="tokenized", outputCol="stopwords_removed",
                           stopWords=stop_words)
df = remover.transform(df)

# Last preprocessing operations
ita_stemmer = ItalianStemmer()
eng_stemmer = PorterStemmer()

def text_preprocessing(tokens):
    # Remove tokens composed of only a number
    filtered_token = [token for token in tokens
                      if not re.search(r'\b[0-9]+\b\s*', token)]
    # Stem the tokens for both Italian and English
    filtered_token = [ita_stemmer.stem(token) for token in filtered_token]
    filtered_token = [eng_stemmer.stem(token) for token in filtered_token]
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from __future__ import print_function

# $example on$
from pyspark.ml.feature import StopWordsRemover
# $example off$
from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("StopWordsRemoverExample")\
        .getOrCreate()

    # $example on$
    sentenceData = spark.createDataFrame([
        (0, ["I", "saw", "the", "red", "balloon"]),
        (1, ["Mary", "had", "a", "little", "lamb"])
    ], ["id", "raw"])

    remover = StopWordsRemover(inputCol="raw", outputCol="filtered")
    remover.transform(sentenceData).show(truncate=False)
    # $example off$

    spark.stop()
wrangled = wrangled.withColumn('text', regexp_replace(wrangled.text, '[0-9]', ' '))

# Merge multiple spaces
wrangled = wrangled.withColumn('text', regexp_replace(wrangled.text, ' +', ' '))

# Split the text into words
wrangled = Tokenizer(inputCol='text', outputCol='words').transform(wrangled)

wrangled.show(4, truncate=False)

--------------------------------------------------
# Exercise_11

from pyspark.ml.feature import StopWordsRemover, HashingTF, IDF

# Remove stop words.
wrangled = StopWordsRemover(inputCol='words', outputCol='terms')\
    .transform(sms)

# Apply the hashing trick
wrangled = HashingTF(inputCol='terms', outputCol='hash', numFeatures=1024)\
    .transform(wrangled)

# Convert hashed symbols to TF-IDF
tf_idf = IDF(inputCol='hash', outputCol='features')\
    .fit(wrangled).transform(wrangled)

tf_idf.select('terms', 'features').show(4, truncate=False)

--------------------------------------------------
# Exercise_12

# Split the data into training and testing sets
# using 1000 records as a small debugging data set
train_sents1 = train_df.select('genre', 'sentence1')
train_sents2 = train_df.select('genre', 'sentence2')
# train_sents1.show(5)

udf_lower = F.udf(lower_folding, StringType())
train_sents1_lower = train_sents1.withColumn('lower_sents',
                                             udf_lower('sentence1'))
# train_sents1_lower.show(5)

udf_rv_punc = F.udf(remove_punctuation_re, StringType())
train_sents1_rv_punc = train_sents1_lower.withColumn(
    'rv_punc_sents', udf_rv_punc('lower_sents'))

tokenizer = Tokenizer(inputCol="rv_punc_sents", outputCol="tokens")
remover = StopWordsRemover(inputCol="tokens", outputCol="filtered_tokens")
w2v = Word2Vec(vectorSize=300, minCount=0, inputCol="filtered_tokens",
               outputCol="avg_word_embed")

doc2vec_pipeline = Pipeline(stages=[tokenizer, remover, w2v])
doc2vec_model = doc2vec_pipeline.fit(train_sents1_rv_punc)
doc2vecs_df = doc2vec_model.transform(train_sents1_rv_punc)

w2v_train_df, w2v_test_df = doc2vecs_df.randomSplit([0.8, 0.2])

from pyspark.ml.feature import StringIndexer
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

genre2label = StringIndexer(inputCol="genre", outputCol="label")
""" from pyspark.ml.feature import StopWordsRemover sentenceDataFrame= spark.createDataFrame([ (0,['I','saw','the','green','horse']), (1,['Mary','had','a','little','lamb']) ],['id','tokens']) sentenceDataFrame.show() """ +---+--------------------+ | id| tokens| +---+--------------------+ | 0|[I, saw, the, gre...| | 1|[Mary, had, a, li...| +---+--------------------+ """ remover= StopWordsRemover(inputCol='tokens',outputCol='filtered') remover.transform(sentenceDataFrame).show() """ +---+--------------------+--------------------+ | id| tokens| filtered| +---+--------------------+--------------------+ | 0|[I, saw, the, gre...| [saw, green, horse]| | 1|[Mary, had, a, li...|[Mary, little, lamb]| +---+--------------------+--------------------+ """ # n-gram from pyspark.ml.feature import NGram wordDataFrame= spark.createDataFrame([ (0,["Hi", "I", "heard", "about", "Spark"]), (1,["I", "wish", "java", "could", "use", "case", "classes"]), (2,["Logistic", "regression", "models", "are", "neat"]),