def main(argv):
    # Instantiate the Spark context.
    sc = SparkContext(appName="KMeans-Clustering-dhoyoso-dsernae")
    # Start the Spark session.
    spark = SparkSession(sc)
    # Language whose default stop words will be removed.
    language = argv[4]  # "spanish"
    # Output path for the clusters.
    pathout = argv[3]
    # Input path to read the documents from.
    path = argv[2]  # "hdfs:///user/dhoyoso/datasets/dataset/"
    # Number of clusters to build.
    k = int(argv[1])  # 4
    # Load the files to process from the input path.
    files = sc.wholeTextFiles(path)
    # DataFrame schema: one column for the path and one for the text.
    schema = StructType([
        StructField("path", StringType(), True),
        StructField("text", StringType(), True)
    ])
    # Build the DataFrame from the schema and the files.
    df = spark.createDataFrame(files, schema)
    # Tokenize the text with the ML Tokenizer.
    tokenizer = Tokenizer(inputCol="text", outputCol="tokens")
    # Remove the stop words of the requested language from the tokens.
    # Note: loadDefaultStopWords only returns a list; calling it on its own does not
    # configure the remover, so the list is passed explicitly here.
    stopWords = StopWordsRemover(inputCol="tokens",
                                 outputCol="stopWordsRemovedTokens",
                                 stopWords=StopWordsRemover.loadDefaultStopWords(language))
    # Hashing TF over the remaining tokens.
    hashingTF = HashingTF(inputCol="stopWordsRemovedTokens", outputCol="rawFeatures", numFeatures=2000)
    # IDF over the HashingTF output.
    idf = IDF(inputCol="rawFeatures", outputCol="features", minDocFreq=1)
    # Initialize k-means with the desired k.
    kmeans = KMeans(k=k)
    # Build the transformation pipeline.
    pipeline = Pipeline(stages=[tokenizer, stopWords, hashingTF, idf, kmeans])
    # Fit the pipeline on the DataFrame.
    model = pipeline.fit(df)
    # Run the mapped transformations and keep the result.
    results = model.transform(df)
    results.cache()
    # Split the path to keep only the document name next to its cluster (prediction).
    split_col = split(results['path'], '/')
    results = results.withColumn('docname', split_col.getItem(7))
    df = results.select("docname", "prediction")
    # Group the documents of each cluster into cluster_docs_list and write them to the output path as JSON.
    grouped = df.groupBy(['prediction']).agg(
        collect_list("docname").alias('cluster_docs_list'))
    grouped.coalesce(1).write.json(path=pathout, mode="overwrite")
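# Hedged invocation sketch for main() above, assuming it is wired to sys.argv in the
# usual way; the script name is hypothetical.
#
#   spark-submit kmeans_clustering.py <k> <hdfs_input_path> <hdfs_output_path> <language>
#   e.g. spark-submit kmeans_clustering.py 4 hdfs:///user/dhoyoso/datasets/dataset/ hdfs:///user/dhoyoso/out spanish
if __name__ == "__main__":
    import sys
    main(sys.argv)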
def test_stopwordsremover(self):
    dataset = self.spark.createDataFrame([Row(input=["a", "panda"])])
    stopWordRemover = StopWordsRemover(inputCol="input", outputCol="output")
    # Default
    self.assertEqual(stopWordRemover.getInputCol(), "input")
    transformedDF = stopWordRemover.transform(dataset)
    self.assertEqual(transformedDF.head().output, ["panda"])
    self.assertEqual(type(stopWordRemover.getStopWords()), list)
    self.assertTrue(isinstance(stopWordRemover.getStopWords()[0], basestring))
    # Custom
    stopwords = ["panda"]
    stopWordRemover.setStopWords(stopwords)
    self.assertEqual(stopWordRemover.getInputCol(), "input")
    self.assertEqual(stopWordRemover.getStopWords(), stopwords)
    transformedDF = stopWordRemover.transform(dataset)
    self.assertEqual(transformedDF.head().output, ["a"])
    # with language selection
    stopwords = StopWordsRemover.loadDefaultStopWords("turkish")
    dataset = self.spark.createDataFrame([Row(input=["acaba", "ama", "biri"])])
    stopWordRemover.setStopWords(stopwords)
    self.assertEqual(stopWordRemover.getStopWords(), stopwords)
    transformedDF = stopWordRemover.transform(dataset)
    self.assertEqual(transformedDF.head().output, [])
    # with locale
    stopwords = ["BELKİ"]
    dataset = self.spark.createDataFrame([Row(input=["belki"])])
    stopWordRemover.setStopWords(stopwords).setLocale("tr")
    self.assertEqual(stopWordRemover.getStopWords(), stopwords)
    transformedDF = stopWordRemover.transform(dataset)
    self.assertEqual(transformedDF.head().output, [])
def get_pd_keyword(self): df_spark = self.df_spark # Step 1. Text cleasing with punctuations REGEX = '[_,?\\-.!?@#$%^&*+\/\d]' df_spark = df_spark.withColumn( "description_clean", regexp_replace(df_spark.description, REGEX, ' ')) # Step 2. Tokenization # df_spark = df_spark.drop("description_token") tokenizer = Tokenizer(inputCol='description_clean', outputCol='description_token') df_spark = tokenizer.transform(df_spark) # Stemming # nltk.download('wordnet') lemmatizer = WordNetLemmatizer() def lemm_function(list): list_clean = [] for item in list: list_clean.append(lemmatizer.lemmatize(item)) return list_clean udf_lemm_function = F.udf(lemm_function, ArrayType(StringType())) df_spark = df_spark.withColumn( "description_lemm", udf_lemm_function(df_spark.description_token)) # Step 3. Remove stopword stopwords_list = StopWordsRemover.loadDefaultStopWords("english") stopwords_customize_list = ["app", "apps"] stopwords_list = np.append(stopwords_list, stopwords_customize_list) stopwords = StopWordsRemover(inputCol="description_lemm", outputCol="description_no_stop", stopWords=stopwords_list) stopwords.getStopWords() df_spark = stopwords.transform(df_spark) df_pd_desc_final = df_spark.toPandas() # ### Note: IDF vector must be trained with large corpus, otherwise lose the advance of IDF # get the "description" column joinF = lambda x: " ".join(x) df_pd_desc_final["description_final"] = df_pd_desc_final[ "description_no_stop"].apply(joinF) corpus_list = df_pd_desc_final["description_final"].tolist() df_pd_desc_final = get_tfidf(corpus_list, df_pd_desc_final, self.topn) return df_pd_desc_final
def A1(): #1) apply LDA and find topics in user's posts (including reposts) textToWords = RegexTokenizer( inputCol="text", outputCol="splitted", pattern="[\\P{L}]+") #Remove signs and split by spaces stopRemover = StopWordsRemover( inputCol="splitted", outputCol="words", stopWords=StopWordsRemover.loadDefaultStopWords("russian") + StopWordsRemover.loadDefaultStopWords("english")) countVectorizer = CountVectorizer(inputCol="words", outputCol="features") #Filter if post id exists? data = uWallP\ .filter( uWallP.text != "" )\ .select("id","text")\ .limit(10)\ pipeline = Pipeline(stages=[textToWords, stopRemover, countVectorizer]) model = pipeline.fit(data) result = model.transform(data) corpus = result.select("id", "features").rdd.map( lambda r: [r.id, Vectors.fromML(r.features)]).cache() # Cluster the documents into k topics using LDA ldaModel = LDA.train(corpus, k=8, maxIterations=100, optimizer='online') topics = ldaModel.topicsMatrix() vocabArray = model.stages[2].vocabulary #CountVectorizer wordNumbers = 20 # number of words per topic topicIndices = spark.sparkContext.parallelize( ldaModel.describeTopics(maxTermsPerTopic=wordNumbers)) def topic_render(topic): # specify vector id of words to actual words terms = topic[0] result = [] for i in range(wordNumbers): term = vocabArray[terms[i]] result.append(term) return result topics_final = topicIndices.map( lambda topic: topic_render(topic)).collect() for topic in range(len(topics_final)): print("Topic" + str(topic) + ":") for term in topics_final[topic]: print(term) print('\n')
def getKeywordsInDataRange(sDF, oldestTime, newestTime, topics=1, wordsPerTopic=20): #yyyy-MM-dd #Filter oldestTime = datetime.strptime(oldestTime, '%Y-%m-%d') newestTime = datetime.strptime(newestTime, '%Y-%m-%d') filteredText = sDF\ .select( "id", date_format('day','yyyy-MM-dd').alias('time'), col("title").alias("text") )\ .where( (col("time") >= oldestTime) & (col("time") <= newestTime) ) #StartPipeline for preparing data textToWords = RegexTokenizer( inputCol="text", outputCol="splitted", pattern="[\\P{L}]+") #Remove signs and split by spaces stopRemover = StopWordsRemover( inputCol="splitted", outputCol="words", stopWords=StopWordsRemover.loadDefaultStopWords("english")) countVectorizer = CountVectorizer(inputCol="words", outputCol="features") pipeline = Pipeline(stages=[textToWords, stopRemover, countVectorizer]) #GetCorups for LDA try: model = pipeline.fit(filteredText) except IllegalArgumentException: return [] result = model.transform(filteredText) corpus = result.select("id", "features").rdd.map( lambda r: [mhash(r.id) % 10**8, Vectors.fromML(r.features)]).cache() # Cluster the documents into k topics using LDA ldaModel = LDA.train(corpus, k=topics, maxIterations=100, optimizer='online') topics = ldaModel.topicsMatrix() vocabArray = model.stages[2].vocabulary #CountVectorizer topicIndices = spark.sparkContext.parallelize( ldaModel.describeTopics(maxTermsPerTopic=wordsPerTopic)) def topic_render(topic): # specify vector id of words to actual words terms = topic[0] result = [] for i in range(wordsPerTopic): term = vocabArray[terms[i]] result.append(term) return result # topics_final = topicIndices.map(lambda topic: topic_render(topic)).collect() # for topic in range(len(topics_final)): # print ("Topic" + str(topic) + ":") # for term in topics_final[topic]: # print (term) # print ('\n') return topicIndices.map(lambda topic: topic_render(topic)).collect()
def clean(tokenized):
    englishStopWords = StopWordsRemover.loadDefaultStopWords("english")
    stop_words = StopWordsRemover(outputCol="stopword_removed") \
        .setStopWords(englishStopWords) \
        .setInputCol("Review_Tokenized")
    SW_Re = stop_words.transform(tokenized)
    return SW_Re
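# Hedged usage sketch for clean(): a tiny DataFrame with a pre-tokenized
# 'Review_Tokenized' column, as the function expects. The `spark` session and the
# sample rows are assumptions for illustration only.
tokenized = spark.createDataFrame(
    [(["the", "food", "was", "great"],), (["not", "worth", "the", "price"],)],
    ["Review_Tokenized"])
clean(tokenized).select("stopword_removed").show(truncate=False)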
def train(self): self.__prepare() spark = SparkSession\ .builder\ .appName("Kursach")\ .getOrCreate() input_file = spark.sparkContext.textFile('./w2v.txt') # print(input_file.collect()) prepared = input_file.map(lambda x: ([x])) df = prepared.toDF() prepared_df = df.selectExpr('_1 as text') tokenizer = Tokenizer(inputCol='text', outputCol='words') words = tokenizer.transform(prepared_df) stop_words = StopWordsRemover.loadDefaultStopWords('russian') remover = StopWordsRemover(inputCol='words', outputCol='filtered', stopWords=stop_words) filtered = remover.transform(words) # print(stop_words) # filtered.show() # words.select('words').show(truncate=False, vertical=True) # filtered.select('filtered').show(truncate=False, vertical=True) vectorizer = CountVectorizer(inputCol='filtered', outputCol='raw_features').fit(filtered) featurized_data = vectorizer.transform(filtered) featurized_data.cache() vocabulary = vectorizer.vocabulary # featurized_data.show() # featurized_data.select('raw_features').show(truncate=False, vertical=True) # print(vocabulary) idf = IDF(inputCol='raw_features', outputCol='features') idf_model = idf.fit(featurized_data) rescaled_data = idf_model.transform(featurized_data) self.__word2Vec = Word2Vec(vectorSize=3, minCount=0, inputCol='words', outputCol='result') self.__model = self.__word2Vec.fit(filtered) w2v_df = self.__model.transform(words) w2v_df.show() spark.stop()
def __init__(self, sc, configs):
    self.sqlContext = SQLContext(sc)
    # Keep a reference to the SparkContext passed in (the original assigned an
    # undefined `spark_context` name here).
    self.spark_context = sc
    self.configs = configs
    self.path_to_task = self.configs['Data']['task']
    self.undersample = self.configs['Training']['undersample']
    self.task = Task(self.path_to_task)
    self.task_number = self.path_to_task[-1]
    self.split = self.configs['Data']['split']
    self.training = self.task.get_split(self.split, part='train', chunks=10)
    _, self.labels, self.users = map(list, zip(*self.training))
    self.posts = [post for user in self.users for post in user]
    self.posts = list(filter(lambda p: len(p.split()) > 15, self.posts))
    self.labels, self.users = zip(
        *filter(lambda p: len(p[1]) > 10, zip(self.labels, self.users)))
    self.users = [' '.join(user) for user in self.users]
    if self.undersample != 'false':
        positives = list(
            filter(lambda s: s[0] == '1', zip(self.labels, self.users)))
        negatives = list(
            filter(lambda s: s[0] == '0', zip(self.labels, self.users)))
        shuffle(negatives)
        both = positives + negatives[:len(positives)]
        shuffle(both)
        self.labels, self.users = map(list, zip(*both))
    self.tokenizer = Tokenizer(inputCol="text", outputCol="rawWords")
    self.stopWords = StopWordsRemover(
        inputCol="rawWords",
        outputCol="words",
        caseSensitive=False,
        stopWords=StopWordsRemover.loadDefaultStopWords("english"))
    self.cv = CountVectorizer(inputCol="words", outputCol="rawFeatures", vocabSize=30000)
    self.idf = IDF(minDocFreq=2, inputCol="rawFeatures", outputCol="features")
    self.mlp = MultilayerPerceptronClassifier(maxIter=2000,
                                              layers=[30000, 80, 100, 2],
                                              blockSize=128,
                                              seed=1234)
    self.pipeline = Pipeline(stages=[
        self.tokenizer, self.stopWords, self.cv, self.idf, self.mlp
    ])
    self.model = self.pipeline.fit(
        self.create_data_frame(self.users, self.labels))
def main(inputs, output): # 1. Load Data and Select only business_id, stars, text data = spark.read.json(inputs, schema=review_schema).repartition(50).select( 'business_id', 'stars', 'text') data = data.where(data['text'].isNotNull()) # filter reviews with no text # 2. ML pipeline: Tokenization (with Regular Expression) and Remove Stop Words regex_tokenizer = RegexTokenizer(inputCol='text', outputCol='words', pattern='[^A-Za-z]+') stopwords_remover = StopWordsRemover( inputCol='words', outputCol='tokens', stopWords=StopWordsRemover.loadDefaultStopWords('english')) # count_vectorizer = CountVectorizer(inputCol='filtered_words', outputCol='features') nlp_pipeline = Pipeline(stages=[regex_tokenizer, stopwords_remover]) model = nlp_pipeline.fit(data) review = model.transform(data).select('business_id', 'stars', 'tokens') # 3. Select Features review = review.select(review['business_id'], review['stars'], udf_morphy(review['tokens']).alias('tokens')) review = review.where(functions.size(review['tokens']) > 0) review = review.withColumn('classify_tokens', udf_classify_tokens(review['tokens'])) # 4. Calculate Feature Weights review = review.withColumn('feature_weights', udf_senti_score(review['classify_tokens'])) review = review.withColumn('food', review['stars'] * review['feature_weights'][0]) review = review.withColumn('environment', review['stars'] * review['feature_weights'][1]) review = review.withColumn('service', review['stars'] * review['feature_weights'][2]) review = review.withColumn('price', review['stars'] * review['feature_weights'][3]) # 5. Calculate Average Feature Weights review_new = review.select('business_id', 'stars', 'food', 'environment', 'service', 'price') review_new = review_new.groupby('business_id').agg( functions.mean('stars').alias('ave_stars'), functions.mean('food').alias('food'), functions.mean('environment').alias('environment'), functions.mean('service').alias('service'), functions.mean('price').alias('price')) # 6. Save review_new.write.csv(output, mode='overwrite')
def transform(self):
    df2 = self.dataframe.withColumn(
        "_2", regexp_replace(col("_2"), "[\"'./§$&+,:;=?@#–|'<>.^*()%!-]", ""))
    df = df2.withColumn("_2", regexp_replace(col("_2"), "\\s{2,}", ""))
    language_detect = udf(lambda x: detect(x), returnType=StringType())
    df3 = df.withColumn("lang", language_detect('_2'))
    lemmatizer = Lemmatizer(lookup=delook)
    lemmatizer1 = Lemmatizer(lookup=enlook)
    tokenizer = Tokenizer(inputCol="_2", outputCol="words")
    tokenized = tokenizer.transform(df3)
    # Use the German lemmatizer for German rows and the English one otherwise
    # (the original conditional expression was syntactically broken).
    lemma = udf(lambda x, lang: " ".join([lemmatizer.lookup(i) for i in x])
                if lang == "de"
                else " ".join([lemmatizer1.lookup(i) for i in x]),
                returnType=StringType())
    lemmatized = tokenized.withColumn(
        "stemmed", lemma(col('words'), col('lang'))).drop('words').drop('_2')
    tokenizer = Tokenizer(inputCol="stemmed", outputCol="words")
    tokenized = tokenizer.transform(lemmatized)
    remover = StopWordsRemover(inputCol="words", outputCol="filtered")
    stopwords = remover.loadDefaultStopWords(
        "german") + remover.loadDefaultStopWords("english")
    remover = remover.setStopWords(stopwords)
    newDataSet = remover.transform(tokenized)
    test = newDataSet.withColumn("filtered", explode(col("filtered"))) \
        .groupBy("_1", "filtered") \
        .agg(func.count(func.lit(1)).alias("count")) \
        .sort(col("count").desc())
    return test
def __init__(self, data): self.tokenizer = Tokenizer(inputCol="text", outputCol="rawWords") self.stopWords = StopWordsRemover(inputCol="rawWords", outputCol="words", caseSensitive=False, stopWords=StopWordsRemover.loadDefaultStopWords("english")) self.cv = CountVectorizer(inputCol="words", outputCol="rawFeatures") self.idf = IDF(inputCol="rawFeatures", outputCol="features") svm = LinearSVC() pipeline = Pipeline(stages=[self.tokenizer, self.stopWords, self.cv, self.idf, svm]) self.model = pipeline.fit(data)
def main(*args): if len(args) != 2: print("Please provide one input and one output directories!") sys.exit(1) input_fn, output_fn = args[0],args[1] conf = SparkConf() conf.setAppName("grant") sc = SparkContext(conf=conf) sqlContext = SQLContext(sc) # Load the abstract content in the test folder into spark, # clean text, tokenize the corpus, and stem the words abstract = sc.textFile(input_fn) df_abs = (abstract.map(lambda doc: text_cleaning(doc)) .filter(lambda doc: len(doc) > 0) .filter(lambda line: not line.startswith('app')) .map(lambda doc: doc.split(' ')) .map(lambda word: [x for x in word if len(x)>0]) .map(lambda word: stem(word)) .map(lambda doc: (int(doc[0]), doc[1:])) .filter(lambda doc: len(doc[1])>0) .toDF(['Id','words'])) # build the pipeline and lda model with online optimizer stop_words = StopWordsRemover(inputCol='words', outputCol='clean') stop_words.setStopWords(stop_words.loadDefaultStopWords('english')) countv = CountVectorizer(inputCol=stop_words.getOutputCol(), outputCol="tokens") idf = IDF(inputCol=countv.getOutputCol(),outputCol="features") lda = LDA(maxIter=10,k=10,optimizer='online') pipeline = Pipeline(stages=[stop_words, countv, idf, lda]) lda_model = pipeline.fit(df_abs) labels = lda_model.transform(df_abs) # identify the label as the topic with the max probability # save the label to file topic_labels = (labels.select('Id','topicDistribution') .rdd .map(lambda x: (x[0],np.argmax(x[1]))) .saveAsTextFile(os.path.join(output_fn,'labels'))) # Get the topics wordnum = 5 # choose the number of topic words vocabulary = lda_model.stages[1].vocabulary voc_bv = sc.broadcast(vocabulary) topic_df = (lda_model.stages[3].describeTopics(wordnum) .rdd .map(lambda x: (x[0],[voc_bv.value[Id] for Id in x[1]],x[2])) .saveAsTextFile(os.path.join(output_fn,'words')))
def collect_stopwords(df, input_col="stemmed", number_of_words=100):
    top_words, less_then_3_charachters = words_widely_used_and_short(
        df, input_col, number_of_words)
    stopWordsNLTK = list(set(stopwords.words('english'))) + list(
        set(stopwords.words('italian')))
    # Add any additional custom stopwords to this list
    stopWordsCustom = [
        " ", "", "dal", "al", "davan", "avev", "qualc", "qualcuno", "qualcosa",
        "avevano", "davanti", "aveva", "e", "avere", "fare", "la", "li", "lo",
        "gli", "essere", "solo", "per", "cosa", "ieri", "disponibile", "anno",
        "detto", "quando", "fatto", "sotto", "alcuna", "quali"
    ]
    stopWordsPySpark = StopWordsRemover.loadDefaultStopWords("italian")
    # Combine all the stopwords
    stpw = top_words + stopWordsNLTK + stopWordsCustom + stopWordsPySpark + less_then_3_charachters
    stem_stopw = myStemmer(stpw, True)  # stem the stopwords as well
    return (stpw + stem_stopw)
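# Hedged usage sketch for collect_stopwords(): feeding the combined list into a
# StopWordsRemover over the same 'stemmed' column. The `df` DataFrame with a
# 'stemmed' array column is an assumption carried over from the function signature.
custom_stopwords = collect_stopwords(df, input_col="stemmed", number_of_words=100)
remover = StopWordsRemover(inputCol="stemmed", outputCol="stemmed_no_stop",
                           stopWords=custom_stopwords)
df_clean = remover.transform(df)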
def init(): global data, regexTokenizer, stopwordsRemover, countVectors # Read the seed training data into the system data = sqlContext.read.format('com.databricks.spark.csv').options( header='true', inferschema='true').load('seeddata.csv') # Contains a list of columns we don't care about drop_list = [] # Grab data in all columns that we care about data = data.select( [column for column in data.columns if column not in drop_list]) data.show(5) data.printSchema() # regular expression tokenizer regexTokenizer = RegexTokenizer(inputCol="question", outputCol="words", pattern="\\W") # stop words add_stopwords = None # Load some default stopwords if add_stopwords == None: add_stopwords = StopWordsRemover.loadDefaultStopWords(language) # Remove certain stop words that provide context for our use case needed_stopwords = ['what', 'when', 'where', 'why'] for x in needed_stopwords: add_stopwords.remove(x) print("stop words:\n {}".format(add_stopwords)) stopwordsRemover = StopWordsRemover( inputCol="words", outputCol="filtered").setStopWords(add_stopwords) # bag of words count countVectors = CountVectorizer(inputCol="filtered", outputCol="features", vocabSize=15, minDF=1)
def convertToVec(df, sc, ss, outputName, inputCol='tokens'): print('\n\n\n Removing Stopwords... \n\n\n') remover=StopWordsRemover(inputCol=inputCol, outputCol='nostops', stopWords=StopWordsRemover.loadDefaultStopWords('english')) df=remover.transform(df) cv=CountVectorizer(inputCol='nostops', outputCol='vectors',minTF=1.0) vecModel=cv.fit(df) new=False if new: print('\n\n\n Get Vocab... \n\n\n') inv_voc=vecModel.vocabulary f = codecs.open(outputName+'_vocab.txt', encoding='utf-8', mode='w') for item in inv_voc: f.write(u'{0}\n'.format(item)) f.close() vectors= vecModel.transform(df).select('id','subreddit','vectors') return vectors
def pre_process_data(df): df_collumn = df.withColumn( "text", regexp_replace(lower(df["text"]), "[$&+,:;=?@#|'<>.-^*()%!]", "")) df_without = df_collumn.withColumn( "text", regexp_replace(lower(df_collumn["text"]), "-", " ")) df_read = df_without.select('*').withColumn("id", monotonically_increasing_id()) # Tokenize data tokenizer = Tokenizer(inputCol="text", outputCol="words") df_tokenized = tokenizer.transform(df_read) #Remove Stop Words language = "portuguese" remover = StopWordsRemover( inputCol="words", outputCol="filtered", stopWords=StopWordsRemover.loadDefaultStopWords(language)) df_clean = remover.transform(df_tokenized) #Return dataframe return df_clean
def train_gensim(): from gensim.corpora import TextCorpus from gensim.corpora.textcorpus import lower_to_unicode from gensim.models import Word2Vec as GensimWord2Vec start = time() stopwords = [] if args.stop_word_lang: # starting spark only for this... spark = SparkSession.builder.appName("load stop words").getOrCreate() stopwords += StopWordsRemover.loadDefaultStopWords(args.stop_word_lang) spark.sparkContext.stop() if args.stop_word_file: with open(args.stop_word_file) as stop_word_file: stopwords += [word.strip("\n") for word in stop_word_file.readlines()] def remove_stopwords(tokens): return [token for token in tokens if token not in stopwords] corpus = TextCorpus( args.txtPath, dictionary={None: None}, character_filters=[lower_to_unicode], token_filters=[remove_stopwords] ) model = GensimWord2Vec( seed=1, alpha=args.step_size, size=args.vector_size, window=args.window_size, sample=1e-6, sg=1 ) model.build_vocab(corpus.get_texts()) model.train(corpus.get_texts(), total_examples=model.corpus_count, epochs=model.epochs) model.save(args.modelPath) end = time() print("Gensim training took {} seconds".format(end - start))
def train(self): self.__prepare() spark = SparkSession\ .builder\ .appName("Kursach")\ .getOrCreate() input_data = spark.sparkContext.textFile('./w2v.txt') prepared = input_data.map(lambda x: [x])\ .map(lambda x: (self.__remove_linebreaks(x[0]), '1'))\ .map(lambda x: (self.__remove_punctuation(x[0]), '1')) prepared_df = prepared.toDF().selectExpr('_1 as text') tokenizer = Tokenizer(inputCol='text', outputCol='words') words = tokenizer.transform(prepared_df) filtered_words_data = words.rdd.map( lambda x: (x[0], self.__get_only_words(x[1]))) filtered_df = filtered_words_data.toDF().selectExpr( '_1 as text', '_2 as words') stop_words = StopWordsRemover.loadDefaultStopWords('russian') remover = StopWordsRemover(inputCol='words', outputCol='filtered', stopWords=stop_words) filtered = remover.transform(filtered_df) self.__word2Vec = Word2Vec(vectorSize=3, minCount=0, inputCol='filtered', outputCol='result') self.__model = self.__word2Vec.fit(filtered) w2v_df = self.__model.transform(filtered) w2v_df.show() spark.stop()
def build_model_pipeline():
    """
    TF (term frequency): number of times a word occurs in a specific document
    DF (document frequency): number of times a word occurs in the collection of documents
    TF-IDF (TF - inverse DF): measures the significance of a word in a document
    """
    # 1. Tokenize words, convert each word to lowercase
    tokenizer = RegexTokenizer(inputCol='review',
                               outputCol='review_tokens_uf',
                               pattern='\\s+|[(),.!?\";]',
                               toLowercase=True)
    # 2. Remove stopwords
    stopwords_remover = StopWordsRemover(
        stopWords=StopWordsRemover.loadDefaultStopWords('english'),
        inputCol='review_tokens_uf',
        outputCol='review_tokens')
    # 3. TF
    # cv = CountVectorizer(
    #     inputCol='review_tokens',
    #     outputCol='tf',
    #     vocabSize=200000
    # )
    cv = HashingTF(inputCol='review_tokens', outputCol='tf')
    # 4. IDF
    idf = IDF(inputCol='tf', outputCol='features')
    # 5. NB
    nb = NaiveBayes()
    pipeline = Pipeline(stages=[tokenizer, stopwords_remover, cv, idf, nb])
    return pipeline
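# Hedged usage sketch for build_model_pipeline(): fitting and evaluating the returned
# pipeline on a labeled reviews DataFrame. The `reviews_df` variable, the 'review' and
# 'label' column names, and the 80/20 split are assumptions for illustration, not part
# of the original code.
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

train_df, test_df = reviews_df.randomSplit([0.8, 0.2], seed=42)
model = build_model_pipeline().fit(train_df)
predictions = model.transform(test_df)
accuracy = MulticlassClassificationEvaluator(
    labelCol='label', predictionCol='prediction', metricName='accuracy'
).evaluate(predictions)
print('Naive Bayes accuracy: {:.3f}'.format(accuracy))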
def createFeats(spark, input, output, num_feat, _split=False, auto_feats=False): preproc_udf = udf(preprocess, StringType()) remove_udf = udf(remove_numbers_single_words, ArrayType(StringType())) lemmatize_udf = udf(lemmatize_words, ArrayType(StringType())) print("loading file") df = spark.read.format("csv").option("header", False) \ .option("delimiter", ",").option("inferSchema", True) \ .load(input) print("------------------------------------------------") # Remove urls, punctuation and set everything as lower case df = df.filter(df._c2.isNotNull()) df = df.withColumn("text", preproc_udf(df["_c2"])) df = df.filter(df.text.isNotNull()) # Tokenize words tokenizer = Tokenizer(inputCol="text", outputCol="raw_words") df = tokenizer.transform(df) # Lemmatize df = df.withColumn("words", lemmatize_udf(df["raw_words"])) df = df.drop("raw_words") df = df.filter(df.words.isNotNull()) df = df.filter(size(df.words) > 0) # Remove stopwords all_stopwords = StopWordsRemover.loadDefaultStopWords( "english") + StopWordsRemover.loadDefaultStopWords("italian") remover = StopWordsRemover(inputCol="words", outputCol="filtered_words", stopWords=all_stopwords) df = remover.transform(df) df = df.filter(size(df.filtered_words) > 0) # Remove words smaller that 5 letters and numbers df = df.withColumn("filtered_words_2", remove_udf(df["filtered_words"])) df = df.filter(size(df.filtered_words_2) > 0) # Automatically choose the number of features if auto_feats: num_feat = df.select("filtered_words_2").withColumn( "tokens", explode("filtered_words_2")).select("tokens").distinct().count() hashingTF = HashingTF(inputCol="filtered_words_2", outputCol="rawFeatures", numFeatures=num_feat) featurizedData = hashingTF.transform(df) idf = IDF(inputCol="rawFeatures", outputCol="features", minDocFreq=5) idfModel = idf.fit(featurizedData) rescaledData = idfModel.transform(featurizedData) rescaledData = rescaledData.select("_c1", "filtered_words_2", "features") # Write the dataset to disk. Split it if needed. if _split: # Count the total rows of the file and generate # a shuffled version of the dataset. total_rows = rescaledData.count() shuffled_df = rescaledData.orderBy(rand(1)) # Generate dataset with this much rows. for s in [1000, 10000, 100000, 1000000]: if s <= total_rows: new_df = shuffled_df.limit(s) new_df.write.parquet(output + "/slice_" + str(s)) rescaledData.write.parquet(output + "/complete") else: rescaledData.write.parquet(output)
# COMMAND ---------- from pyspark.ml.feature import RegexTokenizer rt = RegexTokenizer()\ .setInputCol("Description")\ .setOutputCol("DescOut")\ .setPattern(" ")\ .setGaps(False)\ .setToLowercase(True) rt.transform(sales.select("Description")).show(20, False) # COMMAND ---------- from pyspark.ml.feature import StopWordsRemover englishStopWords = StopWordsRemover.loadDefaultStopWords("english") stops = StopWordsRemover()\ .setStopWords(englishStopWords)\ .setInputCol("DescOut") stops.transform(tokenized).show() # COMMAND ---------- from pyspark.ml.feature import NGram unigram = NGram().setInputCol("DescOut").setN(1) bigram = NGram().setInputCol("DescOut").setN(2) unigram.transform(tokenized.select("DescOut")).show(False) bigram.transform(tokenized.select("DescOut")).show(False)
def main(topic): # 1. Load Data, Combine keywords, tweet_urls by news_url, Add id messages = spark.readStream.format('kafka') \ .option('kafka.bootstrap.servers', 'localhost:9092') \ .option('subscribe', topic)\ .option('failOnDataLoss', 'false')\ .option('auto.offset.reset', 'earliest')\ .load() values = messages.select(messages['value'].cast('string')) words = values.select( functions.explode(functions.split(values.value, ';')).alias("words")) data = words.withColumn('text', functions.split('words', ',')).select('text') data = data.withColumn('news_id', data['text'][0]) data = data.withColumn('news_keyword', data['text'][1]) data = data.withColumn('news_url', data['text'][2]) data = data.withColumn('tweet_url', data['text'][3]) data = data.withColumn('retweet_count', data['text'][4]) data = data.withColumn('favorite_count', data['text'][4]) # data = data.dropDuplicates(['tweet_url', 'news_url']) data = data.withColumn('favorite_count', data['favorite_count'].cast(types.IntegerType())) # data = data.groupby('tweet_id', 'news_keyword', 'tweet_url', 'news_url').max('favorite_count') # data = data.select('news_url', 'tweet_url').distinct() # data = data.groupby('news_url', 'tweet_url').agg( # functions.collect_set('news_keyword').alias('news_keywords') # ) #functions.collect_set('tweet_url').alias('tweet_urls') # udf_uuid = functions.udf(lambda: str(uuid.uuid4()), returnType=types.StringType()) # data = data.withColumn('news_id', udf_uuid()) # data = data.select('news_id', 'news_keyword', 'favorite_count', 'news_url', 'tweet_url') print('finish load data') # 2. Scrap the news_text and tweets_comments data = data.withColumn('tweets_infos', udf_get_comments(data['tweet_url'])) data = data.withColumn('tweets_info', functions.explode(data['tweets_infos'])) data = data.select('news_id', 'news_keyword', 'retweet_count', 'favorite_count', 'news_url', 'tweet_url', data.tweets_info[0].alias('like_counts'), data.tweets_info[1].alias('comment_time'), data.tweets_info[2].alias('tweets_comment')) data = data.withColumn('like_counts', data['like_counts'].cast(types.IntegerType())) # data = data.where(data['news_text'].isNotNull() & (functions.length(data['news_text']) > 0)) data = data.where(data['tweets_comment'].isNotNull() & (functions.length( data['tweets_comment']) > 0)) # filter reviews with no text print('finish scrap') # 3. ML pipeline: Tokenization (with Regular Expression) and Remove Stop Words data = data.withColumn('sentiment_score', udf_sentiment_score(data['tweets_comment'])) # news_regex_tokenizer = RegexTokenizer(inputCol='news_text', outputCol='news_words', pattern='[^A-Za-z]+') # news_stopwords_remover = StopWordsRemover(inputCol='news_words', # outputCol='news_tokens', # stopWords=StopWordsRemover.loadDefaultStopWords('english')) tweets_regex_tokenizer = RegexTokenizer(inputCol='tweets_comment', outputCol='tweets_words', pattern='[^A-Za-z]+') tweets_stopwords_remover = StopWordsRemover( inputCol='tweets_words', outputCol='tweets_tokens', stopWords=StopWordsRemover.loadDefaultStopWords('english')) # count_vectorizer = CountVectorizer(inputCol='filtered_words', outputCol='features') nlp_pipeline = Pipeline( stages=[tweets_regex_tokenizer, tweets_stopwords_remover]) model = nlp_pipeline.fit(data) nlp_data = model.transform(data).select('news_id', 'news_keyword', 'retweet_count', 'favorite_count', 'news_url', 'tweet_url', 'tweets_comment', 'tweets_tokens', 'sentiment_score', 'like_counts', 'comment_time') # 4. 
Select Features # nlp_data = nlp_data.withColumn('news_tokens', udf_morphy(nlp_data['news_tokens'])) nlp_data = nlp_data.withColumn('tweets_tokens', udf_morphy(nlp_data['tweets_tokens'])) # nlp_data = nlp_data.select(nlp_data['business_id'], review['stars'], udf_morphy(review['tokens']).alias('tokens')) # nlp_data = nlp_data.where(functions.size(nlp_data['news_tokens']) > 0) nlp_data = nlp_data.where(functions.size(nlp_data['tweets_tokens']) > 0) # 5. Calculate Weighted Sentiment Scores # nlp_data = nlp_data.withColumn('sentiment_score', udf_sentiment_score(nlp_data['tweets_tokens'])) nlp_data = nlp_data.withColumn('tweets_tokens', functions.concat_ws(' ', 'tweets_tokens')) # nlp_data = nlp_data.withColumn('classify_tokens', udf_classify_tokens(review['tokens'])) nlp_data_score = nlp_data.groupby( 'news_id', 'news_keyword', 'retweet_count', 'favorite_count', 'news_url', 'tweet_url').agg( functions.collect_list('tweets_tokens').alias('tweets_tokens'), functions.collect_list('sentiment_score').alias( 'sentiment_scores'), functions.collect_list('like_counts').alias('like_counts'), functions.collect_list('comment_time').alias('comment_time'), (functions.sum(nlp_data.sentiment_score * nlp_data.like_counts) / functions.sum( nlp_data.like_counts)).alias('weighted_sentiment_score')) nlp_data_score = nlp_data_score.withColumn( 'tweets_tokens', functions.concat_ws(' ', 'tweets_tokens')) nlp_data_score = nlp_data_score.withColumn( 'tweets_tokens', udf_classify_tokens(nlp_data_score['tweets_tokens'])) # nlp_data_score = nlp_data_score.withColumn('tweets_tokens', functions.split('tweets_tokens', '\s+')) # nlp_data_score = nlp_data_score.withColumn('news_tokens', functions.concat_ws(' ', 'news_tokens')) nlp_data_score = nlp_data_score.withColumn( 'comment_time', functions.concat_ws(',', 'comment_time')) print('finish scores') # 6. Save # nlp_data_score.write.format("com.mongodb.spark.sql.DefaultSource")\ # .mode("append")\ # .option("uri", "mongodb://127.0.0.1:27017/news.news_data")\ # .option("replaceDocument", False)\ # .option("database", "news")\ # .option("collection", "news_data").save() nlp_data_score = nlp_data_score.withColumn( 'dl_value', functions.to_json( functions.struct( [nlp_data_score[x] for x in nlp_data_score.columns]))) stream = nlp_data_score.select(nlp_data_score.news_id.alias("key"), nlp_data_score.dl_value.alias("value"))\ .writeStream\ .format('kafka')\ .outputMode('complete')\ .option('kafka.bootstrap.servers', 'localhost:9092')\ .option("topic", "nlp-2")\ .option("checkpointLocation", "../check")\ .start() # stream = nlp_data_score.writeStream.format('console').outputMode('complete').start() # stream = nlp_data_score.writeStream\ # .format('json')\ # .outputMode('update')\ # .option("path", "/Users/Dao/Documents/BigData/733/project/twitter/streaming/data")\ # .option("checkpointLocation", "../check")\ # .start() stream.awaitTermination()
# df = spark.sql('SELECT * FROM df WHERE df.text NOT CONTAINS "Kardashian" AND NOT CONTAINS "Jenner")

# Tokenize text
print('Tokenizing Text...')
tokenizer = Tokenizer(inputCol='text', outputCol='tokens')
df = tokenizer.transform(df)

wnl = WordNetLemmatizer()
print('Lemmatizing Text...')
lemma_udf = udf(lambda row: lemma(row), ArrayType(StringType()))
df = df.withColumn('lemmed_tokens', lemma_udf(df.tokens))

# Remove stopwords
print('Removing Stop Words...')
swr = StopWordsRemover(inputCol='lemmed_tokens', outputCol='filtered_tokens')
stops = swr.loadDefaultStopWords('english')
# str.replace returns a new string, so rebuild the list instead of calling it in a loop
stops = [stop.replace('’', '') for stop in stops]
for word in [
        'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
        'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'ha',
        'wa', 'getty', 'image', 'ap', 'pictwittercom'
]:
    stops.append(word)
swr.setStopWords(stops)
df = swr.transform(df)
df = df.select('post_id', 'filtered_tokens')
print("Post Stop Word Remove")
df.take(1)
"ham_spam").withColumnRenamed("_c1", "text") data = data.withColumn("cleantext", removePunctuationUDF(data.text)) #split data into training and test split_data = data.randomSplit([0.8, 0.2]) training_data = split_data[0] test_data = split_data[1] print("Training data: ", training_data.count()) print("Test data: ", test_data.count()) # COMMAND ---------- from pyspark.ml.feature import StringIndexer, Tokenizer, StopWordsRemover, CountVectorizer, HashingTF, IDF stringIndexer = StringIndexer(inputCol="ham_spam", outputCol="label") tokenizer = Tokenizer(inputCol="cleantext", outputCol="words") add_stopwords = StopWordsRemover.loadDefaultStopWords('english') stopwordsRemover = StopWordsRemover( inputCol="words", outputCol="filtered").setStopWords(add_stopwords) hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=200) idf = IDF(inputCol="rawFeatures", outputCol="features") # COMMAND ---------- import shutil import os from pyspark.ml.classification import LogisticRegression from pyspark.ml.evaluation import BinaryClassificationEvaluator from pyspark.ml import Pipeline, PipelineModel from azureml.core.run import Run
""" from pyspark.sql.functions import lit, rand data = true_df.withColumn('fake', lit(0)).union( fake_df.withColumn('fake', lit(1))).orderBy(rand()) # Check data data.groupBy('fake').count().show() # View concatenated result data.show(10) #%% """ 4.NLP Process """ from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, CountVectorizer, IDF from pyspark.ml.feature import StringIndexer, VectorAssembler StopWordsRemover.loadDefaultStopWords('english') # 1.Tokenize the title, ignore emoji and etc. regular expression title_tokenizer = RegexTokenizer(inputCol='title', outputCol='title_words', pattern='\\W', toLowercase=True) # 2.Remove stopwords from title title_sw_remover = StopWordsRemover(inputCol='title_words', outputCol='title_sw_removed') # 3.Compute Term frequency from title title_count_vectorizer = CountVectorizer(inputCol='title_sw_removed', outputCol='tf_title')
# COMMAND ---------- from pyspark.ml.feature import RegexTokenizer rt = RegexTokenizer()\ .setInputCol("Description")\ .setOutputCol("DescOut")\ .setPattern(" ")\ .setGaps(False)\ .setToLowercase(True) rt.transform(sales.select("Description")).show(20, False) # COMMAND ---------- from pyspark.ml.feature import StopWordsRemover englishStopWords = StopWordsRemover.loadDefaultStopWords("english") stops = StopWordsRemover()\ .setStopWords(englishStopWords)\ .setInputCol("DescOut") stops.transform(tokenized).show() # COMMAND ---------- from pyspark.ml.feature import NGram unigram = NGram(n=1, inputCol="DescOut", outputCol="unigrams") unigramDataFrame = unigram.transform(tokenized) unigramDataFrame.select("unigrams").show(truncate=False) # COMMAND ----------
type=int) args = parser.parse_args() spark = SparkSession.builder \ .appName("get frequent words") \ .config("spark.sql.catalogImplementation", "in-memory") \ .getOrCreate() sc = spark.sparkContext if args.stop_word_lang or args.stop_word_file: sentences = sc.textFile( args.txtPath).map(lambda row: Row(sentence_raw=row.split(" "))).toDF() stopWords = [] if args.stop_word_lang: stopWords += StopWordsRemover.loadDefaultStopWords(args.stop_word_lang) if args.stop_word_file: stopWords += sc.textFile(args.stop_word_file).collect() remover = StopWordsRemover(inputCol="sentence_raw", outputCol="sentence", stopWords=stopWords) sentences = remover.transform(sentences) else: sentences = sc.textFile( args.txtPath).map(lambda row: Row(sentence=row.split(" "))).toDF() words = sentences.rdd.map(lambda row: row.sentence).flatMap(lambda x: x) wordCounts = words.map(lambda w: (w, 1)) \ .reduceByKey(add) \ .filter(lambda w: w[1] >= args.min_count) \
# Check that all the parameters were passed
if (len(sys.argv) > 5):
    # Set up the SparkContext
    sc = SparkContext(appName="SparkClustering-emonto15-dperezg1")
    spark = SparkSession(sc)
    # Read from HDFS and load using a schema (path, text)
    files = sc.wholeTextFiles("hdfs://" + sys.argv[1])
    schema = StructType([
        StructField("path", StringType(), True),
        StructField("text", StringType(), True)
    ])
    df = spark.createDataFrame(files, schema)
    # Split the text into an array of words
    tokenizer = Tokenizer(inputCol="text", outputCol="tokens")
    # Read the tokens column (the tokenizer output) and produce a new array of words
    # without the stop words of the requested language. loadDefaultStopWords only
    # returns a list, so it is passed to the remover explicitly.
    stopWords = StopWordsRemover(inputCol="tokens",
                                 outputCol="stopWordsRemovedTokens",
                                 stopWords=StopWordsRemover.loadDefaultStopWords(sys.argv[4]))
    # Hash each word with its per-document frequency, keeping at most numFeatures features
    hashingTF = HashingTF(inputCol="stopWordsRemovedTokens", outputCol="rawFeatures",
                          numFeatures=int(sys.argv[3]))
    # Compute the inverse document frequency (minDocFreq is explained in the accompanying article)
    idf = IDF(inputCol="rawFeatures", outputCol="features", minDocFreq=1)
    # Initialize k-means with a specific k
    kmeans = KMeans(k=int(sys.argv[2]))
    # Declare the assembly line (pipeline) of transformations over the dataset
    pipeline = Pipeline(stages=[tokenizer, stopWords, hashingTF, idf, kmeans])
    # Apply the assembly line to the dataset
winz = 5 word_nsamps = 10 rm_stop = True language = "spanish" # Added the jar driver to the $SPARK_HOME/jars directory: # Downloaded from: https://bitbucket.org/xerial/sqlite-jdbc/downloads/sqlite-jdbc-3.8.6.jar spark = SparkSession.builder.getOrCreate() df = spark.read.text(input_txt).select(removePunctuation(F.col('value'))) tokenizer = Tokenizer(inputCol="sentence", outputCol="toks" if rm_stop else "tokens") df = tokenizer.transform(df) if rm_stop: remover = StopWordsRemover(inputCol=tokenizer.getOutputCol(), outputCol="tokens", stopWords=None if language == "english" else StopWordsRemover.loadDefaultStopWords(language)) df = remover.transform(df) # Now the magic of windowing the text with F.explode() win = windowing(winz) decompose = win.get_udf() df = df.withColumn("slides", decompose("tokens")) \ .withColumn("exploded", F.explode("slides")) \ .withColumn("word", get_mid("exploded")) \ .withColumn("window", rm_mid("exploded")) df = df.drop(*[c for c in df.columns if not c in ["word", "window"]]) indexer = StringIndexer(inputCol="word", outputCol="label") df = indexer.fit(df).transform(df) #.persist(StorageLevel.DISK_ONLY)#MEMORY_AND_DISK)
def create_w2v_model():
    spark = SparkSession \
        .builder \
        .appName("SimpleApplication") \
        .config("spark.executor.memory", "2g") \
        .config("spark.driver.memory", "2g") \
        .config("spark.memory.offHeap.enabled", True) \
        .config("spark.memory.offHeap.size", "2g") \
        .getOrCreate()
    input_file = spark.sparkContext.wholeTextFiles(PATH)

    print("Preparing data (1)...")
    prepared_data = input_file.map(lambda x: (x[0], remove_punctuation(x[1])))
    print("Preparing data (2)...")
    df = prepared_data.toDF()
    print("Preparing data (3)...")
    prepared_df = df.selectExpr('_2 as text')

    print("Tokenizing...")
    tokenizer = Tokenizer(inputCol='text', outputCol='words')
    words = tokenizer.transform(prepared_df)

    print("Removing stop words...")
    stop_words = StopWordsRemover.loadDefaultStopWords('russian')
    remover = StopWordsRemover(inputCol="words", outputCol="filtered", stopWords=stop_words)
    # Apply the remover so the model actually trains on the filtered tokens
    # (the original built the remover but fit Word2Vec on the raw 'words' column).
    filtered = remover.transform(words)

    print("Building the model...")
    word2Vec = Word2Vec(vectorSize=50, inputCol='filtered', outputCol='result', minCount=2)
    model = word2Vec.fit(filtered)

    print("Saving the model...")
    today = datetime.datetime.today()
    model_name = today.strftime("model/kurs_model")
    print("Model " + model_name + " saved")
    model.save(model_name)
    spark.stop()
df = df.withColumn('text', f.regexp_replace('text', 'http\S+\s*', '')) df = df.withColumn('text', f.regexp_replace('text', 'RT|cc', '')) df = df.withColumn('text', f.regexp_replace('text', '@\S+', '')) geonames = sqlContext.read.format('com.databricks.spark.csv').options( header='true', inferschema='true').load( 's3://emrtestticksnl/TestRic/NL_geonames_triGram_townprovinces.csv' ).select(f.lower(f.col('asciiname')), 'latitude', 'longitude') geonames = geonames.withColumnRenamed("lower(asciiname)", "placename") geonames = geonames.dropDuplicates(['placename']) regexTokenizer = RegexTokenizer(inputCol="text", outputCol="words", pattern="\\W") # stop words dutchwords = StopWordsRemover.loadDefaultStopWords('dutch') englishwords = StopWordsRemover.loadDefaultStopWords('english') add_stopwords = ["http", "https", "amp", "rt", "t", "c", "the", "co", "@" ] + dutchwords + englishwords stopwordsRemover = StopWordsRemover( inputCol="words", outputCol="filtered").setStopWords(add_stopwords) # bag of words count pipeline = Pipeline(stages=[regexTokenizer, stopwordsRemover]) # Fit the pipeline to training documents. pipelineFit = pipeline.fit(df) dataset = pipelineFit.transform(df) ########Stemmer definition dataset1 = dataset.select("filtered")