def shringles(x, fileName):
    # tokenize and build k-shingles (n-grams); `read` and `shringleList` are defined elsewhere in this module
    tokenizer = RegexTokenizer(inputCol="value", outputCol="words", pattern="\\W")
    ngrams = NGram(n=x, inputCol="words", outputCol="kshringles")
    shringleList.append(ngrams.transform(tokenizer.transform(read(fileName))))
def aggregate_spark(data, features, args):
    import pyspark.sql.functions as F
    from pyspark.ml.feature import StopWordsRemover, RegexTokenizer

    # lower-case the text column, then tokenize it
    input_data = data.withColumn(features["col"], F.lower(F.col(features["col"])))
    regexTokenizer = RegexTokenizer(inputCol=features["col"], outputCol="token_list", pattern="\\W")
    regexTokenized = regexTokenizer.transform(input_data)

    remover = StopWordsRemover(inputCol="token_list", outputCol="filtered_word_list")
    vocab_rows = (remover.transform(regexTokenized)
                  .select(F.explode(F.col("filtered_word_list")).alias("word"))
                  .groupBy("word")
                  .count()
                  .orderBy(F.col("count").desc())
                  .limit(args["vocab_size"])
                  .select("word")
                  .collect())

    vocab = [row["word"] for row in vocab_rows]
    reverse_dict = {
        word: idx + len(args["reserved_indices"])
        for idx, word in enumerate(vocab)
    }
    return {**reverse_dict, **args["reserved_indices"]}
def df_to_words(logger,
                df: DataFrame,
                input_col: str,
                output_col: str = "words",
                pattern: str = "\\W+",
                to_lowercase: bool = True,
                case_sensitive: bool = False) -> DataFrame:
    """
    Tokenize each string in a column into a list of words and remove stop words.

    Args:
        logger: Logger instance used to log events
        df: DataFrame to process
        input_col: Name of the input column
        output_col: Name of the output column
        pattern: Regex pattern used for tokenization
        to_lowercase: Whether to lower-case all words
        case_sensitive: Whether stop-word removal should be case sensitive

    Returns:
        The modified dataframe
    """
    try:
        intermediate_output = output_col + "intermediate"
        regex_tokenizer = RegexTokenizer(inputCol=input_col,
                                         outputCol=intermediate_output,
                                         pattern=pattern,
                                         toLowercase=to_lowercase)
        remover = StopWordsRemover(inputCol=intermediate_output,
                                   outputCol=output_col,
                                   caseSensitive=case_sensitive)
        logger.info("Parsing the dataframe into words")
        return remover.transform(regex_tokenizer.transform(df)).drop(intermediate_output)
    except Exception as e:
        logger.error("Parsing to words failed: {}".format(e), traceback.format_exc())
        raise e
def extract_tokens(df):
    reTokenizer = RegexTokenizer(inputCol="Text", outputCol="clean_text",
                                 toLowercase=True, minTokenLength=3)
    newdf = reTokenizer.transform(df)
    return newdf
def frequency_vector_DataFrame(trainDF, cluster_count):
    regTokenizer = RegexTokenizer(inputCol="reviewText", outputCol="words", pattern="[^a-z]")
    dfTokenizer = regTokenizer.transform(trainDF)
    remover = StopWordsRemover(inputCol="words", outputCol="filtered")
    df_remover = remover.transform(dfTokenizer)

    # feature extraction using Word2Vec
    word2Vec = Word2Vec(vectorSize=3, minCount=0, inputCol="filtered", outputCol="word2vec")
    vectors = word2Vec.fit(df_remover).getVectors()
    vectors_DF = vectors.select(vectors.word, vectors.vector.alias("features"))

    # cluster the word vectors with KMeans
    kmeans = KMeans().setK(cluster_count).setSeed(1)
    km_model = kmeans.fit(vectors_DF)

    # broadcast the word -> cluster mapping after getting the words and predictions
    vocabDF = km_model.transform(vectors_DF).select("word", "prediction")
    vocabDict = dict(vocabDF.rdd.collect())
    vocab_dict = sc.broadcast(vocabDict)

    # cluster vector is built in RDD form
    reviewsDF = df_remover.select(df_remover.filtered, df_remover.label).rdd
    clusterVectorRdd = reviewsDF.map(partial(word_to_cluster, vocab_dict=vocab_dict))
    cluster_frequency_feature_Rdd = clusterVectorRdd.map(
        partial(cluster_frequency_vector, cluster_count=cluster_count))
    cluster_freqDF = cluster_frequency_feature_Rdd.map(lambda xy: Row(xy[0], xy[1])).toDF()
    cluster_freq_featureDF = cluster_freqDF.select(cluster_freqDF._1.alias("features"),
                                                   cluster_freqDF._2.alias("label"))
    return cluster_freq_featureDF
def get_feature(dataframe=df_train_x, nFeature=200):
    # convert the input string to lowercase and then split it by regex pattern
    regexTokenizer = RegexTokenizer(inputCol="text", outputCol="words", pattern="\\W")
    words_data = regexTokenizer.transform(dataframe)
    #count_tokens = udf(lambda words: len(words), IntegerType())
    # count the number of words in each review
    #words_data.select("words").withColumn("tokens", count_tokens(col("words"))).show(5, truncate=True)

    # remove stop words (e.g. the, who, which, at, on, I)
    stopWordsRemover = StopWordsRemover(inputCol="words", outputCol="words_removed")
    words_removed_data = stopWordsRemover.transform(words_data)
    #count_tokens_new = udf(lambda words_removed: len(words_removed), IntegerType())
    #words_removed_data.select("words_removed").withColumn("tokens_new", count_tokens_new(col("words_removed"))).show(5, truncate=True)

    # transform input features into n-grams
    #nGram = NGram(n=2, inputCol="words_removed", outputCol="ngrams")
    #ngrams_data = nGram.transform(words_removed_data)

    # transform list of words to word frequency vectors
    hashingTF = HashingTF(inputCol="words_removed", outputCol="words_freq", numFeatures=nFeature)
    words_freq_data = hashingTF.transform(words_removed_data)
    #words_freq_data.select("words_freq").show(5, truncate=True)

    # compute the IDF vector and scale word frequencies by IDF
    idf = IDF(inputCol="words_freq", outputCol="features")
    idf_model = idf.fit(words_freq_data)
    feature_data = idf_model.transform(words_freq_data).select("features")
    return feature_data
def preprocessDF(self, df, cols):
    df = df.withColumn('joined_columns', functions.lower(df[cols[0]]))
    regex_tokenizer = RegexTokenizer(inputCol="joined_columns", outputCol="joinKey", pattern=r'\W+')
    df = regex_tokenizer.transform(df)
    return df
def dedup_min_hash(df, column, id_col, min_distance=0.1):
    """
    Deduplicates a dataset using MinHash on a token count basis.
    Removes all items with a distance smaller than min_distance.
    """
    @udf("long")
    def num_nonzeros(v):
        return v.numNonzeros()

    df.cache()
    tokenizer = RegexTokenizer(inputCol=column, outputCol="tokens")
    tokens = tokenizer.transform(df)
    cv = CountVectorizer(inputCol="tokens", outputCol="token_ids")
    vectorizer_model = cv.fit(tokens)
    with_token_ids = vectorizer_model.transform(tokens).drop("tokens", column)
    with_token_ids = with_token_ids.where(num_nonzeros(with_token_ids.token_ids) > 0).cache()

    mh = MinHashLSH(inputCol="token_ids", outputCol="hashes", seed=1, numHashTables=10)
    dedup_model = mh.fit(with_token_ids)
    joined = dedup_model.approxSimilarityJoin(with_token_ids, with_token_ids,
                                              1 - min_distance, distCol="dist")\
        .drop("token_ids", "hashes")\
        .filter(f"datasetA.{id_col} < datasetB.{id_col}")

    duplicate_ids = joined.rdd.flatMap(lambda row: (row.datasetA[id_col], row.datasetB[id_col]))\
        .distinct()\
        .map(lambda el: [el])\
        .toDF()

    return df.join(duplicate_ids, duplicate_ids._1 == df[id_col], "left")\
        .where(duplicate_ids._1.isNotNull())\
        .drop(duplicate_ids._1)
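A minimal usage sketch for the helper above; the SparkSession, the toy DataFrame, and its column names are illustrative assumptions, not part of the original code.

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
# two near-identical documents and one distinct one (hypothetical data)
docs = spark.createDataFrame(
    [(1, "spark makes distributed computing simple"),
     (2, "spark makes distributed computing simple"),
     (3, "a completely different sentence")],
    ["doc_id", "text"])

# min_distance=0.1 treats pairs whose Jaccard distance is below 0.9 as join candidates
result = dedup_min_hash(docs, column="text", id_col="doc_id", min_distance=0.1)
result.show()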
def create_TFIDF_v0(trainData, applyData, inputCol="text", outputCol="features",
                    minDocFreq=3, numFeatures=20):
    tokenizer = RegexTokenizer(pattern="[.:\s]+", inputCol=inputCol, outputCol="z_words")
    wordsData1 = tokenizer.transform(trainData)
    wordsData2 = tokenizer.transform(applyData)

    remover = StopWordsRemover(inputCol="z_words", outputCol="z_filtered", stopWords=STOPWORDS_v0)
    wordsDataFiltered1 = remover.transform(wordsData1)
    wordsDataFiltered2 = remover.transform(wordsData2)

    hashingTF = HashingTF(inputCol="z_filtered", outputCol="z_rawFeatures", numFeatures=numFeatures)
    featurizedData1 = hashingTF.transform(wordsDataFiltered1)
    featurizedData2 = hashingTF.transform(wordsDataFiltered2)
    # alternatively, CountVectorizer can also be used to get term frequency vectors

    idf = IDF(inputCol="z_rawFeatures", outputCol=outputCol, minDocFreq=minDocFreq)
    idfModel = idf.fit(featurizedData1)
    rescaledData = idfModel.transform(featurizedData2)
    return rescaledData.drop("z_words", "z_filtered", "z_rawFeatures", inputCol)
def main():
    in_directory = sys.argv[1]
    out_directory = sys.argv[2]

    comments = spark.read.json(in_directory, schema=schema)
    comments.cache()

    wordbreak = r'[%s\s]+' % (re.escape(string.punctuation + '0123456789'),)

    # NLP processing code adapted from https://spark.apache.org/docs/latest/ml-features.html
    regexTokenizer = RegexTokenizer(inputCol="body", outputCol="words",
                                    minTokenLength=3, pattern=wordbreak)
    # alternatively, pattern="\\w+", gaps(False)
    countTokens = udf(lambda words: len(words), IntegerType())

    regexTokenized = regexTokenizer.transform(comments)
    docs = regexTokenized.select("body", "words", "subreddit")
    docs.cache()

    #extra_stop_words = ["www","http","gt"]
    remover = StopWordsRemover(inputCol="words", outputCol="filtered")
    docs = remover.transform(docs).withColumn("tokens", countTokens(col("filtered")))
    docs = docs.drop("body")
    docs = docs.drop("words")
    docs.groupBy("subreddit").agg(functions.avg("tokens")).show()

    # threshold for post length
    lthresh = 60
    uthresh = 100
    docs = docs.filter(docs['tokens'] > lthresh)
    docs = docs.filter(docs['tokens'] < uthresh)
    logs = docs.groupBy("subreddit").agg(functions.count("*")).show()

    # adds rank per subreddit type into a new column called rank
    ranked = docs.withColumn("rank", rank().over(Window.partitionBy("subreddit").orderBy(desc("tokens"))))
    #ranked.cache()

    group_size = 230
    # take group_size biggest docs from each group type
    ranked = ranked.filter(ranked['rank'] <= group_size)

    # convert arrays to columns so we can write csv
    for i in range(uthresh):
        ranked = ranked.withColumn('{0}'.format(i), ranked.filtered.getItem(i))

    # drop filtered so we can write to csv
    ranked = ranked.drop('filtered')
    ranked = ranked.drop('rank')
    ranked.show()

    ranked.write.csv(out_directory, mode='overwrite')
def createToken(self, dataset, colName):
    dataset = dataset.drop(pc.DMXTOKENIZED)
    sentimentTokenizer = RegexTokenizer(inputCol=colName, outputCol=pc.DMXTOKENIZED,
                                        toLowercase=True, pattern="\\W")
    dataset = sentimentTokenizer.transform(dataset)
    return dataset
def createToken(self, dataset, colName):
    dataset = dataset.drop("SA_tokenized")
    sentimentTokenizer = RegexTokenizer(inputCol=colName, outputCol="SA_tokenized",
                                        toLowercase=True, pattern="\\W")
    dataset = sentimentTokenizer.transform(dataset)
    return dataset
def wordTokenizer(data, columns):
    for c in columns:
        new_c = c + '_tokens'
        reTokenizer = RegexTokenizer(inputCol=c, outputCol=new_c, pattern='\\W', minTokenLength=2)
        data = reTokenizer.transform(data)
    return data
def process(reviews):
    if reviews.isEmpty():
        pass
    else:
        start = time.time()

        # get reviews with overall rating > 3 and overall rating < 3
        pos_reviews = reviews.filter(lambda x: x[0] > 3.0)
        neg_reviews = reviews.filter(lambda x: x[0] < 3.0)

        # set label for each class. 0.0 is positive - 1.0 is negative
        review_labels = reviews.map(lambda x: 0.0 if x[0] > 3.0 else 1.0)

        Words = Row('label', 'words')
        words = reviews.map(lambda r: Words(*r))
        words_df = spark.createDataFrame(words)

        # reviews tokenization
        token = RegexTokenizer(minTokenLength=2, pattern="[^A-Za-z]+", inputCol="words",
                               outputCol="token", toLowercase=True)
        token_filtered = token.transform(words_df)

        # stopwords elimination
        remover = StopWordsRemover(inputCol="token", outputCol="stopwords", caseSensitive=False)
        stopwords_filtered = remover.transform(token_filtered)

        prep_filtered = (stopwords_filtered.select('stopwords').rdd).map(lambda x: x[0])

        # tf-idf calculation
        tf = HashingTF(numFeatures=numFeatures).transform(
            prep_filtered.map(porter_stem, preservesPartitioning=True))
        idf = IDF().fit(tf)
        train_tfidf = idf.transform(tf)

        # set training dataset with label
        training = review_labels.zip(train_tfidf).map(lambda x: LabeledPoint(x[0], x[1]))

        # train the model classifier
        model = SVMWithSGD.train(training, iterations=100)

        # save model classifier to HDFS
        model_name = "svm" + str(counter_model)
        output_dir = "hdfs://VM10-1-0-14:9000/classifier/" + model_name
        model.save(sc, output_dir)
        counter_model.add(1)

        end = time.time()
        print("Model Name : ", model_name, ", Total Reviews : ", reviews.count(),
              "Processing Time : ", (end - start))
def tokenise_concat_field(df, spark):
    """
    Take the 'concat' field, which contains the whole record as a single string,
    and split it into an array of tokens
    """
    tokenizer = RegexTokenizer(inputCol="concat", outputCol="tokens", pattern="[\s\-\.@]")
    df = tokenizer.transform(df)
    # df = df.drop('concat')  # Needed later for overall edit distance!
    return df
def tokenize_sentences(sentences_df):
    """
    Use the Spark ML tokenizer to tokenize each sentence
    :param sentences_df: one sentence per row
    :returns: same data frame with an added column containing the tokenized array
    """
    regexTokenizer = RegexTokenizer(inputCol="sentenceText", outputCol="words", pattern="\\W")
    tokenized = regexTokenizer.transform(sentences_df)
    return tokenized
def tokenize(p_df, in_column, out_column):
    """
    Tokenizes a column in a DataFrame.
    :param p_df: A DataFrame.
    :param in_column: Name of the input column.
    :param out_column: Name of the output column.
    :return: A DataFrame.
    """
    tokenizer = RegexTokenizer(inputCol=in_column, outputCol=out_column, pattern="\\W")
    return tokenizer.transform(p_df)
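A brief usage sketch for the helper above; the SparkSession, sample row, and column names are illustrative assumptions.

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
reviews = spark.createDataFrame([(1, "Great product, would buy again!")], ["id", "review"])
# "\\W" splits on any non-word character; tokens are lower-cased by RegexTokenizer's default
tokenize(reviews, "review", "review_words").show(truncate=False)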
def process(reviews):
    if reviews.isEmpty():
        pass
    else:
        model_name = "dt"
        updated_model = "dt0"
        model_path, data_path, metadata_path = '', '', ''

        # performing looping process to check the availability of a new model classifier
        for i in range(25, -1, -1):
            model_path = "hdfs://VM10-1-0-14:9000/classifier/" + model_name + str(i)
            updated_model = model_name + str(i)
            data_path = model_path + "/data/part-r*"
            metadata_path = model_path + "/metadata/part-00000"
            if patherror(data_path) == False and patherror(metadata_path) == False:
                break

        # load model classifier
        model = DecisionTreeModel.load(sc, model_path)

        start = time.time()
        reviews_label = reviews.map(lambda x: 0.0 if x[0] > 3.0 else 1.0)

        Words = Row('label', 'words')
        words = reviews.map(lambda r: Words(*r))
        words_df = spark.createDataFrame(words)

        # review tokenization
        token = RegexTokenizer(minTokenLength=2, pattern="[^A-Za-z]+", inputCol="words",
                               outputCol="token", toLowercase=True)
        token_filtered = token.transform(words_df)

        # stopwords elimination
        remover = StopWordsRemover(inputCol="token", outputCol="stopwords", caseSensitive=False)
        stopwords_filtered = remover.transform(token_filtered)

        prep_filtered = (stopwords_filtered.select('stopwords').rdd).map(lambda x: x[0])

        # tf-idf calculation
        tf = HashingTF(numFeatures=numFeatures).transform(
            prep_filtered.map(porter_stem, preservesPartitioning=True))
        idf = IDF().fit(tf)
        tfidf = idf.transform(tf)

        prediction = model.predict(tfidf)
        labeled_prediction = reviews_label.zip(prediction).map(lambda x: (float(x[1]), x[0]))
        metrics = MulticlassMetrics(labeled_prediction)

        output = reviews.zip(prediction)
        filename = "hdfs://VM10-1-0-14:9000/output/" + re.sub('[^0-9]', '', str(datetime.now())) + ".out"
        output.saveAsTextFile(filename)

        end = time.time()
        print(updated_model, ';', reviews.count(), ';', metrics.accuracy, ';',
              metrics.precision(0.0), ';', metrics.precision(1.0), ';',
              metrics.recall(0.0), ';', metrics.recall(1.0), ';',
              metrics.fMeasure(0.0), ';', metrics.fMeasure(1.0), ';', (end - start))
def createToken(self, dataset, colName):
    sentimentTokenizer = RegexTokenizer(
        inputCol=colName,
        outputCol=self.dmxTokenized,
        toLowercase=True,
        pattern="\\W"
    )  # update the constant within the predictive constant class.
    dataset = sentimentTokenizer.transform(dataset)
    dataset = self.stopWordsRemover(dataset, self.dmxTokenized)
    return dataset
def preprocessDF(self):
    reTokenizer = RegexTokenizer(pattern=r'\W+', inputCol='reformat2',
                                 outputCol='tokenKey', toLowercase=True)
    df_token = reTokenizer.transform(self.df)

    remover = StopWordsRemover(inputCol='tokenKey', outputCol='tokens')
    remover.setStopWords(list(self.stopWords))
    df_token = remover.transform(df_token)

    df_token = df_token.select('categories', 'created_at', 'favorite_count', 'quote_count',
                               'reply_count', 'retweet_count', 'user.followers_count',
                               'user.favourites_count', 'user.friends_count', 'tokens')

    self.df1.write.json(os.path.join(bdenv_loc, 'twitter_parse_reformat_2018_01-06.json'), mode='append')
    df_token.write.json(os.path.join(bdenv_loc, 'twitter_parse_tokens_2018_01-06.json'), mode='append')
def tokenize(self, df1, input_col, output_col='words', pattern=None):
    "Tokenize string -> https://spark.apache.org/docs/2.2.0/ml-features.html#tokenizer"
    from pyspark.ml.feature import Tokenizer, RegexTokenizer

    if pattern:
        tokenizer = RegexTokenizer(inputCol=input_col, outputCol=output_col, pattern=pattern)
    else:
        tokenizer = Tokenizer(inputCol=input_col, outputCol=output_col)
    tokenized_df = tokenizer.transform(df1)
    return tokenized_df
def tokenize(df, string_cols):
    output = df
    for c in string_cols:
        output = output.withColumn('temp', f.coalesce(f.col(c), f.lit('')))
        tokenizer = RegexTokenizer(inputCol='temp', outputCol=c + "_tokens", pattern="\\W")
        remover = StopWordsRemover(inputCol=c + "_tokens", outputCol=c + "_swRemoved")
        output = tokenizer.transform(output)
        output = remover.transform(output)\
            .drop('temp', c + "_tokens")
    return output
def aggregate_spark(data, input):
    from pyspark.ml.feature import RegexTokenizer
    import pyspark.sql.functions as F
    from pyspark.sql.types import IntegerType

    regexTokenizer = RegexTokenizer(inputCol=input, outputCol="token_list", pattern="\\W")
    regexTokenized = regexTokenizer.transform(data)
    max_review_length_row = (regexTokenized
                             .select(F.size(F.col("token_list")).alias("word_count"))
                             .agg(F.max(F.col("word_count")).alias("max_review_length"))
                             .collect())
    return max_review_length_row[0]["max_review_length"]
def test_preprocessing(self, content, cv):
    ## Load file
    X_files = sc.textFile('gs://chatrath/files/X_test.txt')
    X_asm_files = X_files.map(lambda x: ("gs://chatrath/data/asm/" + x + ".asm"))
    X_asm_files = X_asm_files.reduce(lambda x, y: x + "," + y)
    X_asm = sc.wholeTextFiles(X_asm_files)

    X_test_asm = X_asm.mapValues(lambda x: re.sub("""[\t{Z}]""", "", x))
    X_test_asm = X_test_asm.mapValues(lambda x: re.sub("""[+{Z}]+""", "", x))
    X_test_asm = X_test_asm.mapValues(lambda x: re.sub("""[-{Z}]+""", "", x))
    X_test_asm = X_test_asm.mapValues(lambda x: re.sub("""[={Z}]+""", "", x))
    X_test_asm = X_test_asm.mapValues(lambda x: re.sub("""[\r|{Z}]+""", "", x))
    X_test_asm = X_test_asm.mapValues(lambda x: re.sub("""[;{Z}]+""", "", x))
    X_test_asm = X_test_asm.mapValues(lambda x: re.sub("""[\n{Z}]+""", "", x))
    X_test_asm = X_test_asm.mapValues(lambda x: x.split())

    ## Filter out opcodes
    X_test_asm = X_test_asm.mapValues(lambda x: list(filter(lambda y: y in content, x)))
    X_test_asm = X_test_asm.mapValues(lambda x: " ".join(map(str, x)))
    X_test_asm = X_test_asm.map(lambda x: (x[0].split("/")[-1].split(".")[0], x[1]))

    ## Create test dataframe
    testdata = X_test_asm.map(lambda x: Row(filename=x[0], data=x[1])).toDF()

    ## Tokenizing data
    regexToken = RegexTokenizer(inputCol="data", outputCol="words", pattern="\\W")
    asm_df = regexToken.transform(testdata)
    asm_df = asm_df.drop('data')

    ## Using CountVectorizer to extract features
    # countvector = CountVectorizer(inputCol="words", outputCol="features")
    # cv = countvector.fit(asm_df)
    testdata = cv.transform(asm_df)
    # traindata = asm_df.withColumn('label', resultantdf['label'].cast('int'))
    return testdata
def topicPredict(inputs):
    #output_path = "/user/llbui/bigdata45_500"
    output_path = "C:/Users/linhb/bigdata45_500"
    query = inputs
    n = 10  # number of similar documents to return
    feature = "abstract"  # feature to compare

    df = sc.parallelize([(0, query)]).toDF(["id", feature])
    tokenizer = RegexTokenizer(inputCol=feature, outputCol="words", pattern="\\P{Alpha}+")
    df2 = tokenizer.transform(df)
    remover = StopWordsRemover(inputCol="words", outputCol="words2")
    df3 = remover.transform(df2)
    udf_remove_words = udf(lambda x: remove_words(x), ArrayType(StringType()))
    df4 = df3.withColumn("words3", udf_remove_words(df3.words2))

    # text to feature vector - TF_IDF
    countTF_model = CountVectorizerModel.load(output_path + "/tf_model")
    df_countTF = countTF_model.transform(df4)
    idf_model = IDFModel.load(output_path + "/idf_model")
    df_IDF = idf_model.transform(df_countTF)

    # LDA model
    lda_model = LocalLDAModel.load(output_path + "/lda_model")

    # output topics for the document -> topicDistribution
    df_Feature = lda_model.transform(df_IDF)
    feature_vector = df_Feature.select("id", "topicDistribution").collect()[0][1]
    print("Feature Vector:", feature_vector)

    # load existing documents
    df_Document = sqlCt.read.load(output_path + "/topicDistribution.parquet")
    udf_cosineSimilarity = udf(lambda x_vector: cosineSimilarity(x_vector, feature_vector), FloatType())
    df_Similarity = df_Document.withColumn("similarity", udf_cosineSimilarity("topicDistribution"))
    df_Similarity_Sorted = df_Similarity.sort(desc("similarity"))
    return df_Similarity_Sorted.limit(n).select("_id", "title", "abstract", "url",
                                                "topicDistribution").collect()
def aggregate_spark(data, features, args):
    from pyspark.ml.feature import StopWordsRemover, RegexTokenizer
    import pyspark.sql.functions as F
    from pyspark.sql.types import IntegerType

    regexTokenizer = RegexTokenizer(inputCol=features["col"], outputCol="token_list", pattern="\\W")
    regexTokenized = regexTokenizer.transform(data)
    remover = StopWordsRemover(inputCol="token_list", outputCol="filtered_word_list")
    max_review_length_row = (remover.transform(regexTokenized)
                             .select(F.size(F.col("filtered_word_list")).alias("word_count"))
                             .agg(F.max(F.col("word_count")).alias("max_review_length"))
                             .collect())
    return max_review_length_row[0]["max_review_length"]
def tokenize(df, column):
    """
    Tokenize alpha-numeric words. Set all tokens to lower-case
    and remove short terms having less than 3 characters.
    """
    # creates tokenizer based on regular expressions
    wordTokenizer = RegexTokenizer(
        inputCol=column,
        outputCol='_' + column,
        pattern='\w+',
        minTokenLength=3  # drop terms shorter than 3 characters, as described in the docstring
    ).setGaps(False)  # match tokens rather than gaps

    # transform: string --> array<string>
    df = wordTokenizer.transform(df)
    df = replace(df, column, '_' + column)
    return df
def preprocessing(self, resultantdf, cv):
    # remove line numbers from each file's data using a regular expression
    resultantdf = resultantdf.withColumn('data', F.regexp_replace('data', '\\b\\w{3,}\\s', ''))

    # use the built-in RegexTokenizer API to tokenize the data
    regexTokenizer = RegexTokenizer(inputCol="data", outputCol="words", pattern="\\W")
    resultantdf = regexTokenizer.transform(resultantdf)
    resultantdf = resultantdf.drop('data')  # column no longer required

    # bag-of-words counts using CountVectorizer
    resultantdf = cv.transform(resultantdf)
    return resultantdf
def preprocessDF(self, df, cols):
    # concatenation
    df_concat = df.withColumn("concat", concat_ws(' ', *cols))

    # split at whitespace and characters that are not letters
    tokenizer = RegexTokenizer(inputCol="concat", outputCol="words", pattern=r'\W+')
    df_tokenizer = tokenizer.transform(df_concat)

    # stopword remover
    remover = StopWordsRemover(inputCol="words", outputCol="joinKey",
                               stopWords=self.stopWordsBC.value)
    df_remover = remover.transform(df_tokenizer) \
        .drop("concat").drop("words")
    return df_remover
def aggregate_spark(data, input):
    import pyspark.sql.functions as F
    from pyspark.ml.feature import RegexTokenizer

    regexTokenizer = RegexTokenizer(inputCol=input["col"], outputCol="token_list", pattern="\\W")
    regexTokenized = regexTokenizer.transform(data)
    vocab_rows = (regexTokenized
                  .select(F.explode(F.col("token_list")).alias("word"))
                  .groupBy("word")
                  .count()
                  .orderBy(F.col("count").desc())
                  .limit(input["vocab_size"])
                  .select("word")
                  .collect())
    vocab = [row["word"] for row in vocab_rows]
    reverse_dict = {word: 2 + idx for idx, word in enumerate(vocab)}
    reverse_dict["<PAD>"] = 0
    reverse_dict["<UNKNOWN>"] = 1
    return reverse_dict
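A small illustration of what the aggregator above returns; the SparkSession, toy rows, and argument values are hypothetical, and the exact index assignment among equally frequent words can vary.

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
reviews_df = spark.createDataFrame(
    [("the movie was good",), ("the movie was bad",)], ["review"])
vocab_map = aggregate_spark(reviews_df, {"col": "review", "vocab_size": 3})
# e.g. {"the": 2, "movie": 3, "was": 4, "<PAD>": 0, "<UNKNOWN>": 1}
print(vocab_map)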
def bayes_cv(business_id):
    """ Crossvalidation of bayes model """
    spark = yelp_lib.spark
    review = yelp_lib.get_parq('review')
    business_df = review.filter(review['business_id'] == business_id)

    regexTokenizer = RegexTokenizer(inputCol="text", outputCol="words", pattern="\\W")
    wordsDataFrame = regexTokenizer.transform(business_df)

    remover = StopWordsRemover(inputCol="words", outputCol="filtered")
    cleaned = remover.transform(wordsDataFrame)

    star_mapping = {0: 0.0, 1: 0.0, 2: 0.0, 3: 0.0, 4: 1.0, 5: 1.0}
    cleaned = cleaned.replace(star_mapping, 'stars')
    cleaned = cleaned.withColumn("stars", cleaned["stars"].cast("double"))

    cv = CountVectorizer(inputCol="filtered", outputCol="features")
    model = cv.fit(cleaned)
    vectorized = model.transform(cleaned)
    vectorized = vectorized.select(col('stars').alias('label'), col('features'))

    splits = vectorized.randomSplit([0.6, 0.4], 1234)
    train = splits[0]
    test = splits[1]

    # create the trainer and set its parameters
    nb = NaiveBayes(smoothing=1.0)

    # train the model
    nb_model = nb.fit(train)

    # compute accuracy on the test set
    result = nb_model.transform(test)
    predictionAndLabels = result.select("prediction", "label")
    evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
    return "Accuracy: " + str(evaluator.evaluate(predictionAndLabels))
from pyspark.ml.feature import Tokenizer
tkn = Tokenizer().setInputCol("Description").setOutputCol("DescOut")
tokenized = tkn.transform(sales.select("Description"))
tokenized.show(20, False)


# COMMAND ----------

from pyspark.ml.feature import RegexTokenizer
rt = RegexTokenizer()\
  .setInputCol("Description")\
  .setOutputCol("DescOut")\
  .setPattern(" ")\
  .setToLowercase(True)
rt.transform(sales.select("Description")).show(20, False)


# COMMAND ----------

from pyspark.ml.feature import RegexTokenizer
rt = RegexTokenizer()\
  .setInputCol("Description")\
  .setOutputCol("DescOut")\
  .setPattern(" ")\
  .setGaps(False)\
  .setToLowercase(True)
rt.transform(sales.select("Description")).show(20, False)


# COMMAND ----------
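For context, gaps controls whether the pattern matches the delimiters between tokens (gaps=True, the default) or the tokens themselves (gaps=False). A minimal, self-contained sketch assuming a local SparkSession and a made-up one-row DataFrame:

from pyspark.sql import SparkSession
from pyspark.ml.feature import RegexTokenizer

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([("WHITE HANGING HEART T-LIGHT HOLDER",)], ["Description"])

# gaps=True (default): the pattern is the delimiter to split on
splitter = RegexTokenizer(inputCol="Description", outputCol="DescOut", pattern=" ")
# gaps=False: the pattern describes the tokens to keep
matcher = RegexTokenizer(inputCol="Description", outputCol="DescOut", pattern="\\w+", gaps=False)

splitter.transform(df).show(truncate=False)
matcher.transform(df).show(truncate=False)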
from pyspark.sql import SparkSession
from pyspark.ml.feature import Tokenizer, RegexTokenizer

if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("TokenizerExample")\
        .getOrCreate()

    # $example on$
    sentenceDataFrame = spark.createDataFrame([
        (0, "Hi I heard about Spark"),
        (1, "I wish Java could use case classes"),
        (2, "Logistic,regression,models,are,neat")
    ], ["label", "sentence"])

    tokenizer = Tokenizer(inputCol="sentence", outputCol="words")

    regexTokenizer = RegexTokenizer(inputCol="sentence", outputCol="words", pattern="\\W")
    # alternatively, pattern="\\w+", gaps(False)

    tokenized = tokenizer.transform(sentenceDataFrame)
    for words_label in tokenized.select("words", "label").take(3):
        print(words_label)

    regexTokenized = regexTokenizer.transform(sentenceDataFrame)
    for words_label in regexTokenized.select("words", "label").take(3):
        print(words_label)
    # $example off$

    spark.stop()
# MAGIC Split the Wikipedia text into sentences.

# COMMAND ----------

pattern = r"(\. |\n{2,})"

import re
matches = re.findall(pattern, "Wiki page. *More information*\n\n And a line\n that continues.")
print(matches)

# COMMAND ----------

from pyspark.ml.feature import RegexTokenizer

tokenizer = RegexTokenizer(inputCol="text", outputCol="sentences", pattern=pattern)
sentences = tokenizer.transform(parsed).select("sentences")
display(sentences)

# COMMAND ----------

from pyspark.sql import Row
from pyspark.sql.types import StructType, StructField, StringType

sentenceRDD = sentences.flatMap(lambda r: r[0]).map(lambda x: Row(sentence=x))
sentenceSchema = StructType([StructField("sentence", StringType())])
sentence = sqlContext.createDataFrame(sentenceRDD, sentenceSchema)
display(sentence)

# COMMAND ----------
import pandas as pd

from pyspark.sql import SQLContext
from pyspark.ml.feature import RegexTokenizer, HashingTF
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes
from pyspark.mllib.tree import RandomForest

## Load Dataset
df_pandas = pd.read_csv('sample.csv')

## Convert to Spark Dataframe
sqlContext = SQLContext(sc)
df = sqlContext.createDataFrame(df_pandas)

## Tokenizer and Hashing
tokenizer = RegexTokenizer(inputCol="text", outputCol="words")
hashingTF = HashingTF(numFeatures=10000, inputCol="words", outputCol="features")
df_feat = hashingTF.transform(tokenizer.transform(df))

## Create LabeledPoint and Features for Prediction (predict the 1s observations)
lp = df_feat.map(lambda x: LabeledPoint(x.label, x.features))
predict_feat = df_feat.where(df_feat.label == 1).map(lambda x: x.features)

## Compare predictions from Different Models

## Logistic Regression
lrm = LogisticRegressionWithSGD.train(lp, iterations=10)
logit_predict = lrm.predict(predict_feat)
logit_predict.sum()  # 9112
# MAGIC %md
# MAGIC Split the Wikipedia text into sentences.

# COMMAND ----------

pattern = r'(\. |\n{2,})'

import re
matches = re.findall(pattern, 'Wiki page. *More information*\n\n And a line\n that continues.')
print(matches)

# COMMAND ----------

from pyspark.ml.feature import RegexTokenizer

tokenizer = RegexTokenizer(inputCol='text', outputCol='sentences', pattern=pattern)
sentences = tokenizer.transform(parsed).select('sentences')
display(sentences)

# COMMAND ----------

from pyspark.sql import Row
from pyspark.sql.types import StructType, StructField, StringType

sentenceRDD = (sentences
               .flatMap(lambda r: r[0])
               .map(lambda x: Row(sentence=x)))
sentenceSchema = StructType([StructField('sentence', StringType())])
sentence = sqlContext.createDataFrame(sentenceRDD, sentenceSchema)
display(sentence)