Example #1
def main(argv):

    #instantiate the Spark context.
    sc = SparkContext(appName="KMeans-Clustering-dhoyoso-dsernae")
    #start the Spark session.
    spark = SparkSession(sc)
    #store the language whose stop words will be removed.
    language = argv[4]  #"spanish"
    #store the output path for the clusters.
    pathout = argv[3]
    #store the path from which the files will be read.
    path = argv[2]  #"hdfs:///user/dhoyoso/datasets/dataset/"
    #store the desired number of clusters.
    k = int(argv[1])  #4
    #load the files to process from the path.
    files = sc.wholeTextFiles(path)
    #create the dataframe schema: 2 columns, one for the path and one for the text.
    schema = StructType([
        StructField("path", StringType(), True),
        StructField("text", StringType(), True)
    ])
    #create the dataframe from the schema and the files.
    df = spark.createDataFrame(files, schema)
    #tokenize the text using the ML Tokenizer class.
    tokenizer = Tokenizer(inputCol="text", outputCol="tokens")
    #load the default stop words for the language we are working with.
    stop_words = StopWordsRemover.loadDefaultStopWords(language)
    #remove the stop words from the tokens.
    stopWords = StopWordsRemover(inputCol="tokens",
                                 outputCol="stopWordsRemovedTokens",
                                 stopWords=stop_words)
    #apply hashing TF to the remaining tokens.
    hashingTF = HashingTF(inputCol="stopWordsRemovedTokens",
                          outputCol="rawFeatures",
                          numFeatures=2000)
    #apply IDF to the output of the hashingTF.
    idf = IDF(inputCol="rawFeatures", outputCol="features", minDocFreq=1)
    #initialize k-means with the desired k.
    kmeans = KMeans(k=k)
    #create the map (pipeline) of transformations.
    pipeline = Pipeline(stages=[tokenizer, stopWords, hashingTF, idf, kmeans])
    #fit the pipeline, taking the dataframe as the start of the transformations.
    model = pipeline.fit(df)
    #run the mapped transformations and keep the result.
    results = model.transform(df)
    results.cache()
    #split the path to keep only the document name next to its cluster (prediction).
    split_col = split(results['path'], '/')
    results = results.withColumn('docname', split_col.getItem(7))
    df = results.select("docname", "prediction")

    #group the documents of each cluster into cluster_docs_list and save them to the output path as JSON.
    grouped = df.groupBy(['prediction']).agg(
        collect_list("docname").alias('cluster_docs_list'))
    grouped.coalesce(1).write.json(path=pathout, mode="overwrite")
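A detail worth noting, here and in several of the examples below: StopWordsRemover.loadDefaultStopWords only returns a plain Python list and configures nothing by itself, so the list has to be passed to the remover explicitly (as done above via the stopWords parameter). A minimal, self-contained sketch, assuming only a local SparkSession; the toy tokens are made up for illustration:

from pyspark.sql import SparkSession
from pyspark.ml.feature import StopWordsRemover

spark = SparkSession.builder.appName("stopwords-demo").getOrCreate()

# loadDefaultStopWords is a static helper that returns a list of words for the given language.
spanish_stops = StopWordsRemover.loadDefaultStopWords("spanish")

# The list only takes effect when it is handed to the transformer.
remover = StopWordsRemover(inputCol="tokens",
                           outputCol="filtered",
                           stopWords=spanish_stops)

df = spark.createDataFrame([(["el", "modelo", "de", "spark"],)], ["tokens"])
remover.transform(df).show(truncate=False)  # function words such as "el" and "de" should be dropped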
Example #2
 def test_stopwordsremover(self):
     dataset = self.spark.createDataFrame([Row(input=["a", "panda"])])
     stopWordRemover = StopWordsRemover(inputCol="input",
                                        outputCol="output")
     # Default
     self.assertEqual(stopWordRemover.getInputCol(), "input")
     transformedDF = stopWordRemover.transform(dataset)
     self.assertEqual(transformedDF.head().output, ["panda"])
     self.assertEqual(type(stopWordRemover.getStopWords()), list)
     self.assertTrue(
         isinstance(stopWordRemover.getStopWords()[0], str))
     # Custom
     stopwords = ["panda"]
     stopWordRemover.setStopWords(stopwords)
     self.assertEqual(stopWordRemover.getInputCol(), "input")
     self.assertEqual(stopWordRemover.getStopWords(), stopwords)
     transformedDF = stopWordRemover.transform(dataset)
     self.assertEqual(transformedDF.head().output, ["a"])
     # with language selection
     stopwords = StopWordsRemover.loadDefaultStopWords("turkish")
     dataset = self.spark.createDataFrame(
         [Row(input=["acaba", "ama", "biri"])])
     stopWordRemover.setStopWords(stopwords)
     self.assertEqual(stopWordRemover.getStopWords(), stopwords)
     transformedDF = stopWordRemover.transform(dataset)
     self.assertEqual(transformedDF.head().output, [])
     # with locale
     stopwords = ["BELKİ"]
     dataset = self.spark.createDataFrame([Row(input=["belki"])])
     stopWordRemover.setStopWords(stopwords).setLocale("tr")
     self.assertEqual(stopWordRemover.getStopWords(), stopwords)
     transformedDF = stopWordRemover.transform(dataset)
     self.assertEqual(transformedDF.head().output, [])
Example #3
    def get_pd_keyword(self):

        df_spark = self.df_spark

        # Step 1. Text cleansing: strip punctuation

        REGEX = r'[_,?\-.!?@#$%^&*+/\d]'
        df_spark = df_spark.withColumn(
            "description_clean",
            regexp_replace(df_spark.description, REGEX, ' '))

        # Step 2. Tokenization
        # df_spark = df_spark.drop("description_token")

        tokenizer = Tokenizer(inputCol='description_clean',
                              outputCol='description_token')
        df_spark = tokenizer.transform(df_spark)

        # Lemmatization (WordNet)
        # nltk.download('wordnet')
        lemmatizer = WordNetLemmatizer()

        def lemm_function(tokens):
            list_clean = []
            for item in tokens:
                list_clean.append(lemmatizer.lemmatize(item))

            return list_clean

        udf_lemm_function = F.udf(lemm_function, ArrayType(StringType()))

        df_spark = df_spark.withColumn(
            "description_lemm", udf_lemm_function(df_spark.description_token))

        # Step 3. Remove stopwords

        stopwords_list = StopWordsRemover.loadDefaultStopWords("english")
        stopwords_customize_list = ["app", "apps"]
        stopwords_list = np.append(stopwords_list, stopwords_customize_list)

        stopwords = StopWordsRemover(inputCol="description_lemm",
                                     outputCol="description_no_stop",
                                     stopWords=stopwords_list)
        stopwords.getStopWords()
        df_spark = stopwords.transform(df_spark)

        df_pd_desc_final = df_spark.toPandas()

        # ### Note: the IDF vector must be trained on a large corpus, otherwise it loses the advantage of IDF

        # get the "description" column
        joinF = lambda x: " ".join(x)
        df_pd_desc_final["description_final"] = df_pd_desc_final[
            "description_no_stop"].apply(joinF)

        corpus_list = df_pd_desc_final["description_final"].tolist()

        df_pd_desc_final = get_tfidf(corpus_list, df_pd_desc_final, self.topn)

        return df_pd_desc_final
Example #5
def A1():  #1) apply LDA and find topics in user's posts (including reposts)
    textToWords = RegexTokenizer(
        inputCol="text", outputCol="splitted",
        pattern="[\\P{L}]+")  #Remove signs and split by spaces
    stopRemover = StopWordsRemover(
        inputCol="splitted",
        outputCol="words",
        stopWords=StopWordsRemover.loadDefaultStopWords("russian") +
        StopWordsRemover.loadDefaultStopWords("english"))
    countVectorizer = CountVectorizer(inputCol="words", outputCol="features")

    #Filter if post id exists?
    data = uWallP\
        .filter( uWallP.text != "" )\
        .select("id","text")\
        .limit(10)

    pipeline = Pipeline(stages=[textToWords, stopRemover, countVectorizer])
    model = pipeline.fit(data)
    result = model.transform(data)
    corpus = result.select("id", "features").rdd.map(
        lambda r: [r.id, Vectors.fromML(r.features)]).cache()

    # Cluster the documents into k topics using LDA
    ldaModel = LDA.train(corpus, k=8, maxIterations=100, optimizer='online')
    topics = ldaModel.topicsMatrix()
    vocabArray = model.stages[2].vocabulary  #CountVectorizer
    wordNumbers = 20  # number of words per topic
    topicIndices = spark.sparkContext.parallelize(
        ldaModel.describeTopics(maxTermsPerTopic=wordNumbers))

    def topic_render(topic):  # specify vector id of words to actual words
        terms = topic[0]
        result = []
        for i in range(wordNumbers):
            term = vocabArray[terms[i]]
            result.append(term)
        return result

    topics_final = topicIndices.map(
        lambda topic: topic_render(topic)).collect()

    for topic in range(len(topics_final)):
        print("Topic" + str(topic) + ":")
        for term in topics_final[topic]:
            print(term)
        print('\n')
Example #6
def getKeywordsInDataRange(sDF,
                           oldestTime,
                           newestTime,
                           topics=1,
                           wordsPerTopic=20):  #yyyy-MM-dd
    #Filter
    oldestTime = datetime.strptime(oldestTime, '%Y-%m-%d')
    newestTime = datetime.strptime(newestTime, '%Y-%m-%d')

    filteredText = sDF\
                    .select( "id", date_format('day','yyyy-MM-dd').alias('time'), col("title").alias("text") )\
                    .where( (col("time") >= oldestTime) & (col("time") <= newestTime) )

    #Start the pipeline for preparing the data
    textToWords = RegexTokenizer(
        inputCol="text", outputCol="splitted",
        pattern="[\\P{L}]+")  #Remove signs and split by spaces
    stopRemover = StopWordsRemover(
        inputCol="splitted",
        outputCol="words",
        stopWords=StopWordsRemover.loadDefaultStopWords("english"))
    countVectorizer = CountVectorizer(inputCol="words", outputCol="features")
    pipeline = Pipeline(stages=[textToWords, stopRemover, countVectorizer])

    #Get corpus for LDA
    try:
        model = pipeline.fit(filteredText)
    except IllegalArgumentException:
        return []
    result = model.transform(filteredText)
    corpus = result.select("id", "features").rdd.map(
        lambda r: [mhash(r.id) % 10**8,
                   Vectors.fromML(r.features)]).cache()

    # Cluster the documents into k topics using LDA
    ldaModel = LDA.train(corpus,
                         k=topics,
                         maxIterations=100,
                         optimizer='online')
    topics = ldaModel.topicsMatrix()
    vocabArray = model.stages[2].vocabulary  #CountVectorizer
    topicIndices = spark.sparkContext.parallelize(
        ldaModel.describeTopics(maxTermsPerTopic=wordsPerTopic))

    def topic_render(topic):  # specify vector id of words to actual words
        terms = topic[0]
        result = []
        for i in range(wordsPerTopic):
            term = vocabArray[terms[i]]
            result.append(term)
        return result

    # topics_final = topicIndices.map(lambda topic: topic_render(topic)).collect()
    # for topic in range(len(topics_final)):
    #     print ("Topic" + str(topic) + ":")
    #     for term in topics_final[topic]:
    #         print (term)
    #     print ('\n')
    return topicIndices.map(lambda topic: topic_render(topic)).collect()
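Examples #5 and #6 deliberately drop down to the RDD-based pyspark.mllib LDA, which is why each row is converted with Vectors.fromML. A rough alternative sketch, assuming the same `result` DataFrame produced by the pipeline above, is the DataFrame-based pyspark.ml.clustering.LDA, which consumes the CountVectorizer output directly:

from pyspark.ml.clustering import LDA as MLLDA

# Fit directly on the ML 'features' column; no RDD or vector conversion needed.
lda = MLLDA(k=8, maxIter=100, optimizer="online", featuresCol="features")
lda_model = lda.fit(result.select("id", "features"))

# describeTopics returns a DataFrame of term indices and weights per topic.
lda_model.describeTopics(maxTermsPerTopic=20).show(truncate=False)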
Example #7
    def clean(tokenized):
        englishStopWords = StopWordsRemover.loadDefaultStopWords("english")
        stop_words = StopWordsRemover(outputCol="stopword_removed") \
            .setStopWords(englishStopWords) \
            .setInputCol("Review_Tokenized")
        SW_Re = stop_words.transform(tokenized)

        return SW_Re
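For context, a hypothetical way to produce the Review_Tokenized column that clean expects, assuming a DataFrame named reviews with a raw Review string column:

from pyspark.ml.feature import Tokenizer

tokenizer = Tokenizer(inputCol="Review", outputCol="Review_Tokenized")
tokenized = tokenizer.transform(reviews)  # `reviews` is assumed to exist upstream
cleaned = clean(tokenized)                # adds the "stopword_removed" column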
Example #8
    def train(self):

        self.__prepare()

        spark = SparkSession\
            .builder\
            .appName("Kursach")\
            .getOrCreate()

        input_file = spark.sparkContext.textFile('./w2v.txt')

        # print(input_file.collect())
        prepared = input_file.map(lambda x: ([x]))
        df = prepared.toDF()
        prepared_df = df.selectExpr('_1 as text')

        tokenizer = Tokenizer(inputCol='text', outputCol='words')
        words = tokenizer.transform(prepared_df)

        stop_words = StopWordsRemover.loadDefaultStopWords('russian')
        remover = StopWordsRemover(inputCol='words',
                                   outputCol='filtered',
                                   stopWords=stop_words)
        filtered = remover.transform(words)

        # print(stop_words)

        # filtered.show()

        # words.select('words').show(truncate=False, vertical=True)

        # filtered.select('filtered').show(truncate=False, vertical=True)

        vectorizer = CountVectorizer(inputCol='filtered',
                                     outputCol='raw_features').fit(filtered)
        featurized_data = vectorizer.transform(filtered)
        featurized_data.cache()
        vocabulary = vectorizer.vocabulary

        # featurized_data.show()

        # featurized_data.select('raw_features').show(truncate=False, vertical=True)

        # print(vocabulary)

        idf = IDF(inputCol='raw_features', outputCol='features')
        idf_model = idf.fit(featurized_data)
        rescaled_data = idf_model.transform(featurized_data)

        self.__word2Vec = Word2Vec(vectorSize=3,
                                   minCount=0,
                                   inputCol='words',
                                   outputCol='result')
        self.__model = self.__word2Vec.fit(filtered)
        w2v_df = self.__model.transform(words)
        w2v_df.show()
        spark.stop()
Example #9

    def __init__(self, sc, configs):
        self.sqlContext = SQLContext(sc)

        self.spark_context = sc

        self.configs = configs
        self.path_to_task = self.configs['Data']['task']
        self.undersample = self.configs['Training']['undersample']

        self.task = Task(self.path_to_task)
        self.task_number = self.path_to_task[-1]
        self.split = self.configs['Data']['split']
        self.training = self.task.get_split(self.split,
                                            part='train',
                                            chunks=10)
        _, self.labels, self.users = map(list, zip(*self.training))
        self.posts = [post for user in self.users for post in user]
        self.posts = list(filter(lambda p: len(p.split()) > 15, self.posts))
        self.labels, self.users = zip(
            *filter(lambda p: len(p[1]) > 10, zip(self.labels, self.users)))
        self.users = [' '.join(user) for user in self.users]

        if self.undersample != 'false':
            positives = list(
                filter(lambda s: s[0] == '1', zip(self.labels, self.users)))
            negatives = list(
                filter(lambda s: s[0] == '0', zip(self.labels, self.users)))
            shuffle(negatives)
            both = positives + negatives[:len(positives)]
            shuffle(both)
            self.labels, self.users = map(list, zip(*both))

        self.tokenizer = Tokenizer(inputCol="text", outputCol="rawWords")
        self.stopWords = StopWordsRemover(
            inputCol="rawWords",
            outputCol="words",
            caseSensitive=False,
            stopWords=StopWordsRemover.loadDefaultStopWords("english"))
        self.cv = CountVectorizer(inputCol="words",
                                  outputCol="rawFeatures",
                                  vocabSize=30000)
        self.idf = IDF(minDocFreq=2,
                       inputCol="rawFeatures",
                       outputCol="features")
        self.mlp = MultilayerPerceptronClassifier(maxIter=2000,
                                                  layers=[30000, 80, 100, 2],
                                                  blockSize=128,
                                                  seed=1234)

        self.pipeline = Pipeline(stages=[
            self.tokenizer, self.stopWords, self.cv, self.idf, self.mlp
        ])

        self.model = self.pipeline.fit(
            self.create_data_frame(self.users, self.labels))
Example #10
def main(inputs, output):
    # 1. Load Data and Select only business_id, stars, text
    data = spark.read.json(inputs,
                           schema=review_schema).repartition(50).select(
                               'business_id', 'stars', 'text')
    data = data.where(data['text'].isNotNull())  # filter reviews with no text

    # 2. ML pipeline: Tokenization (with Regular Expression) and Remove Stop Words
    regex_tokenizer = RegexTokenizer(inputCol='text',
                                     outputCol='words',
                                     pattern='[^A-Za-z]+')
    stopwords_remover = StopWordsRemover(
        inputCol='words',
        outputCol='tokens',
        stopWords=StopWordsRemover.loadDefaultStopWords('english'))
    # count_vectorizer = CountVectorizer(inputCol='filtered_words', outputCol='features')
    nlp_pipeline = Pipeline(stages=[regex_tokenizer, stopwords_remover])
    model = nlp_pipeline.fit(data)
    review = model.transform(data).select('business_id', 'stars', 'tokens')

    # 3. Select Features
    review = review.select(review['business_id'], review['stars'],
                           udf_morphy(review['tokens']).alias('tokens'))
    review = review.where(functions.size(review['tokens']) > 0)
    review = review.withColumn('classify_tokens',
                               udf_classify_tokens(review['tokens']))

    # 4. Calculate Feature Weights
    review = review.withColumn('feature_weights',
                               udf_senti_score(review['classify_tokens']))
    review = review.withColumn('food',
                               review['stars'] * review['feature_weights'][0])
    review = review.withColumn('environment',
                               review['stars'] * review['feature_weights'][1])
    review = review.withColumn('service',
                               review['stars'] * review['feature_weights'][2])
    review = review.withColumn('price',
                               review['stars'] * review['feature_weights'][3])

    # 5. Calculate Average Feature Weights
    review_new = review.select('business_id', 'stars', 'food', 'environment',
                               'service', 'price')
    review_new = review_new.groupby('business_id').agg(
        functions.mean('stars').alias('ave_stars'),
        functions.mean('food').alias('food'),
        functions.mean('environment').alias('environment'),
        functions.mean('service').alias('service'),
        functions.mean('price').alias('price'))

    # 6. Save
    review_new.write.csv(output, mode='overwrite')
Example #11
    def transform(self):
        df2 = self.dataframe.withColumn(
            "_2",
            regexp_replace(col("_2"), "[\"'./§$&+,:;=?@#–|'<>.^*()%!-]", ""))
        df = df2.withColumn("_2", regexp_replace(col("_2"), "\\s{2,}", ""))

        language_detect = udf(lambda x: detect(x), returnType=StringType())
        df3 = df.withColumn("lang", language_detect('_2'))

        lemmatizer = Lemmatizer(lookup=delook)
        lemmatizer1 = Lemmatizer(lookup=enlook)
        tokenizer = Tokenizer(inputCol="_2", outputCol="words")
        tokenized = tokenizer.transform(df3)
        # print(tokenized)

        lemma = udf(lambda x, lang: " ".join([lemmatizer.lookup(i) for i in x])
                    if lang == "de" else
                    " ".join([lemmatizer1.lookup(i) for i in x]),
                    returnType=StringType())

        lemmatized = tokenized.withColumn(
            "stemmed", lemma(col('words'),
                             col('lang'))).drop('words').drop('_2')
        tokenizer = Tokenizer(inputCol="stemmed", outputCol="words")
        tokenized = tokenizer.transform(lemmatized)
        remover = StopWordsRemover(inputCol="words", outputCol="filtered")
        stopwords = remover.loadDefaultStopWords(
            "german") + remover.loadDefaultStopWords("english")
        remover = remover.setStopWords(stopwords)
        newDataSet = remover.transform(tokenized)

        test = newDataSet.withColumn("filtered", explode(col("filtered"))) \
            .groupBy("_1", "filtered") \
            .agg(func.count(func.lit(1)).alias("count")) \
            .sort(col("count").desc())

        return test
Example #12
    def __init__(self, data):
        self.tokenizer = Tokenizer(inputCol="text", outputCol="rawWords")

        self.stopWords = StopWordsRemover(inputCol="rawWords", outputCol="words", caseSensitive=False,
                                          stopWords=StopWordsRemover.loadDefaultStopWords("english"))

        self.cv = CountVectorizer(inputCol="words", outputCol="rawFeatures")

        self.idf = IDF(inputCol="rawFeatures", outputCol="features")

        svm = LinearSVC()

        pipeline = Pipeline(stages=[self.tokenizer, self.stopWords, self.cv, self.idf, svm])

        self.model = pipeline.fit(data)
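A possible follow-up, once the constructor has fitted the pipeline: scoring new rows. This is only a sketch; `clf` is assumed to be an instance of this class and `new_data` a DataFrame with the same `text` column used for training:

# The fitted PipelineModel applies tokenization, stop-word removal,
# CountVectorizer, IDF and the SVM in one pass.
predictions = clf.model.transform(new_data)
predictions.select("text", "prediction").show(truncate=False)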
Example #13
def main(*args):
    if len(args) != 2:
        print("Please provide one input and one output directories!")
        sys.exit(1)

    input_fn, output_fn = args[0],args[1]
    conf = SparkConf()
    conf.setAppName("grant")
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)
    # Load the abstract content in the test folder into spark, 
    # clean text, tokenize the corpus, and stem the words
    abstract = sc.textFile(input_fn)
    df_abs = (abstract.map(lambda doc: text_cleaning(doc))
                      .filter(lambda doc: len(doc) > 0)
                      .filter(lambda line: not line.startswith('app'))
                      .map(lambda doc: doc.split(' '))
                      .map(lambda word: [x for x in word if len(x)>0])
                      .map(lambda word: stem(word))
                      .map(lambda doc: (int(doc[0]), doc[1:]))
                      .filter(lambda doc: len(doc[1])>0)
                      .toDF(['Id','words']))
    # build the pipeline and lda model with online optimizer
    stop_words = StopWordsRemover(inputCol='words',
                             outputCol='clean')
    stop_words.setStopWords(stop_words.loadDefaultStopWords('english'))
    countv = CountVectorizer(inputCol=stop_words.getOutputCol(), 
                             outputCol="tokens")
    idf = IDF(inputCol=countv.getOutputCol(),outputCol="features")
    lda = LDA(maxIter=10,k=10,optimizer='online')
    pipeline = Pipeline(stages=[stop_words, countv, idf, lda])
    lda_model = pipeline.fit(df_abs)
    labels = lda_model.transform(df_abs)
    
    # identify the label as the topic with the max probability
    # save the label to file
    topic_labels = (labels.select('Id','topicDistribution')
                          .rdd
                          .map(lambda x: (x[0],np.argmax(x[1])))
                          .saveAsTextFile(os.path.join(output_fn,'labels')))
    # Get the topics
    wordnum = 5 # choose the number of topic words
    vocabulary = lda_model.stages[1].vocabulary
    voc_bv = sc.broadcast(vocabulary)
    topic_df = (lda_model.stages[3].describeTopics(wordnum)
                     .rdd
                     .map(lambda x: (x[0],[voc_bv.value[Id] for Id in x[1]],x[2]))
                     .saveAsTextFile(os.path.join(output_fn,'words')))
Example #14
def collect_stopwords(df, input_col="stemmed", number_of_words=100):
    top_words, less_then_3_charachters = words_widely_used_and_short(
        df, input_col, number_of_words)
    stopWordsNLTK = list(set(stopwords.words('english'))) + list(
        set(stopwords.words('italian')))
    stopWordsCustom = [
        " ", "", "dal", "al", "davan", "avev", "qualc", "qualcuno", "qualcosa",
        "avevano", "davanti", "aveva", "e", "avere", "fare", "la", "li", "lo",
        "gli", "essere", "solo", "per", "cosa", "ieri", "disponibile", "anno",
        "detto", "quando", "fatto", "sotto", "alcuna", "quali"
    ]
    #Add additional stopwords to this list
    stopWordsPySpark = StopWordsRemover.loadDefaultStopWords("italian")
    #Combine all the stopwords
    stpw = top_words + stopWordsNLTK + stopWordsCustom + stopWordsPySpark + less_then_3_charachters
    stem_stopw = myStemmer(stpw, True)  #stemming the stopwords
    return (stpw + stem_stopw)
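The combined list returned by collect_stopwords is presumably meant to be fed back into a remover; a minimal sketch, assuming df still carries the "stemmed" array-of-tokens column used above:

from pyspark.ml.feature import StopWordsRemover

custom_stops = collect_stopwords(df, input_col="stemmed", number_of_words=100)
remover = StopWordsRemover(inputCol="stemmed",
                           outputCol="stemmed_no_stop",
                           stopWords=custom_stops)
df = remover.transform(df)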
Example #15
def init():

    global data, regexTokenizer, stopwordsRemover, countVectors

    # Read the seed training data into the system

    data = sqlContext.read.format('com.databricks.spark.csv').options(
        header='true', inferschema='true').load('seeddata.csv')

    # Contains a list of columns we don't care about

    drop_list = []

    # Grab data in all columns that we care about
    data = data.select(
        [column for column in data.columns if column not in drop_list])

    data.show(5)
    data.printSchema()

    # regular expression tokenizer
    regexTokenizer = RegexTokenizer(inputCol="question",
                                    outputCol="words",
                                    pattern="\\W")

    # stop words
    add_stopwords = None

    # Load some default stopwords
    if add_stopwords is None:
        add_stopwords = StopWordsRemover.loadDefaultStopWords(language)

        # Remove certain stop words that provide context for our use case
        needed_stopwords = ['what', 'when', 'where', 'why']
        for x in needed_stopwords:
            add_stopwords.remove(x)
        print("stop words:\n {}".format(add_stopwords))

    stopwordsRemover = StopWordsRemover(
        inputCol="words", outputCol="filtered").setStopWords(add_stopwords)

    # bag of words count
    countVectors = CountVectorizer(inputCol="filtered",
                                   outputCol="features",
                                   vocabSize=15,
                                   minDF=1)
Example #16
def convertToVec(df, sc, ss, outputName, inputCol='tokens'):
    print('\n\n\n Removing Stopwords... \n\n\n')
    remover=StopWordsRemover(inputCol=inputCol, outputCol='nostops', stopWords=StopWordsRemover.loadDefaultStopWords('english'))
    df=remover.transform(df)

    cv=CountVectorizer(inputCol='nostops', outputCol='vectors',minTF=1.0)
    vecModel=cv.fit(df)
    new=False
    if new:
        print('\n\n\n Get Vocab... \n\n\n')
        inv_voc=vecModel.vocabulary 
        f = codecs.open(outputName+'_vocab.txt', encoding='utf-8', mode='w')
        for item in inv_voc:
            f.write(u'{0}\n'.format(item))
        f.close()
    vectors= vecModel.transform(df).select('id','subreddit','vectors')
    return vectors
Example #17
def pre_process_data(df):
    df_collumn = df.withColumn(
        "text",
        regexp_replace(lower(df["text"]), "[$&+,:;=?@#|'<>.^*()%!]", ""))
    df_without = df_collumn.withColumn(
        "text", regexp_replace(lower(df_collumn["text"]), "-", " "))
    df_read = df_without.select('*').withColumn("id",
                                                monotonically_increasing_id())
    # Tokenize data
    tokenizer = Tokenizer(inputCol="text", outputCol="words")
    df_tokenized = tokenizer.transform(df_read)
    #Remove Stop Words
    language = "portuguese"
    remover = StopWordsRemover(
        inputCol="words",
        outputCol="filtered",
        stopWords=StopWordsRemover.loadDefaultStopWords(language))
    df_clean = remover.transform(df_tokenized)
    #Return dataframe
    return df_clean
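A quick, hypothetical way to exercise pre_process_data, assuming an active SparkSession named spark:

sample = spark.createDataFrame(
    [("Olá, este é um exemplo de texto para limpeza!",)], ["text"])
pre_process_data(sample).select("id", "filtered").show(truncate=False)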
Example #18
def train_gensim():
	from gensim.corpora import TextCorpus
	from gensim.corpora.textcorpus import lower_to_unicode
	from gensim.models import Word2Vec as GensimWord2Vec

	start = time()

	stopwords = []
	if args.stop_word_lang:
		# starting spark only for this...
		spark = SparkSession.builder.appName("load stop words").getOrCreate()
		stopwords += StopWordsRemover.loadDefaultStopWords(args.stop_word_lang)
		spark.sparkContext.stop()
	if args.stop_word_file:
		with open(args.stop_word_file) as stop_word_file:
			stopwords += [word.strip("\n") for word in stop_word_file.readlines()]

	def remove_stopwords(tokens):
		return [token for token in tokens if token not in stopwords]

	corpus = TextCorpus(
		args.txtPath,
		dictionary={None: None},
		character_filters=[lower_to_unicode],
		token_filters=[remove_stopwords]
	)

	model = GensimWord2Vec(
		seed=1,
		alpha=args.step_size,
		size=args.vector_size,
		window=args.window_size,
		sample=1e-6,
        sg=1
	)
	model.build_vocab(corpus.get_texts())
	model.train(corpus.get_texts(), total_examples=model.corpus_count, epochs=model.epochs)
	model.save(args.modelPath)

	end = time()
	print("Gensim training took {} seconds".format(end - start))
Example #19
    def train(self):

        self.__prepare()

        spark = SparkSession\
            .builder\
            .appName("Kursach")\
            .getOrCreate()

        input_data = spark.sparkContext.textFile('./w2v.txt')

        prepared = input_data.map(lambda x: [x])\
            .map(lambda x: (self.__remove_linebreaks(x[0]), '1'))\
            .map(lambda x: (self.__remove_punctuation(x[0]), '1'))
        prepared_df = prepared.toDF().selectExpr('_1 as text')

        tokenizer = Tokenizer(inputCol='text', outputCol='words')
        words = tokenizer.transform(prepared_df)

        filtered_words_data = words.rdd.map(
            lambda x: (x[0], self.__get_only_words(x[1])))
        filtered_df = filtered_words_data.toDF().selectExpr(
            '_1 as text', '_2 as words')

        stop_words = StopWordsRemover.loadDefaultStopWords('russian')
        remover = StopWordsRemover(inputCol='words',
                                   outputCol='filtered',
                                   stopWords=stop_words)
        filtered = remover.transform(filtered_df)

        self.__word2Vec = Word2Vec(vectorSize=3,
                                   minCount=0,
                                   inputCol='filtered',
                                   outputCol='result')
        self.__model = self.__word2Vec.fit(filtered)
        w2v_df = self.__model.transform(filtered)
        w2v_df.show()
        spark.stop()
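The fitted Word2Vec model is only displayed here; a sketch of how it could also be queried, if added inside train() before spark.stop() (the lookup word is an assumption about what the corpus contains):

        # Nearest neighbours in the embedding space for an assumed vocabulary word.
        self.__model.findSynonyms('мир', 5).show(truncate=False)
        # Full vocabulary with its vectors as a DataFrame.
        self.__model.getVectors().show(5, truncate=False)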
Example #20
def build_model_pipeline():
    """
    TF (term frequency): number of times the word occurs in a specific document
    DF (document frequency): number of times a word occurs in a collection of documents
    TF-IDF (TF - inverse DF): measures the significance of a word in a document
    """

    # 1. tokenize words, convert word to lowercase
    tokenizer = RegexTokenizer(inputCol='review',
                               outputCol='review_tokens_uf',
                               pattern='\\s+|[(),.!?\";]',
                               toLowercase=True)

    # 2. remove stopwords
    stopwords_remover = StopWordsRemover(
        stopWords=StopWordsRemover.loadDefaultStopWords('english'),
        inputCol='review_tokens_uf',
        outputCol='review_tokens')

    # 3. TF
    # cv = CountVectorizer(
    #     inputCol='review_tokens',
    #     outputCol='tf',
    #     vocabSize=200000
    # )
    cv = HashingTF(inputCol='review_tokens', outputCol='tf')

    # 4. IDF
    idf = IDF(inputCol='tf', outputCol='features')

    # 5. NB
    nb = NaiveBayes()

    pipeline = Pipeline(stages=[tokenizer, stopwords_remover, cv, idf, nb])

    return pipeline
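A sketch of how the returned pipeline might be trained and applied; reviews_df, with a review string column and a numeric label column, is an assumption based on the stage definitions above:

train_df, test_df = reviews_df.randomSplit([0.8, 0.2], seed=42)

pipeline = build_model_pipeline()
model = pipeline.fit(train_df)       # learns the IDF weights and the NB parameters
scored = model.transform(test_df)    # adds rawPrediction, probability and prediction
scored.select("review", "label", "prediction").show(5)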
Example #21
def createFeats(spark,
                input,
                output,
                num_feat,
                _split=False,
                auto_feats=False):
    preproc_udf = udf(preprocess, StringType())
    remove_udf = udf(remove_numbers_single_words, ArrayType(StringType()))
    lemmatize_udf = udf(lemmatize_words, ArrayType(StringType()))

    print("loading file")
    df = spark.read.format("csv").option("header", False) \
        .option("delimiter", ",").option("inferSchema", True) \
        .load(input)

    print("------------------------------------------------")

    # Remove urls, punctuation and set everything as lower case
    df = df.filter(df._c2.isNotNull())
    df = df.withColumn("text", preproc_udf(df["_c2"]))
    df = df.filter(df.text.isNotNull())

    # Tokenize words
    tokenizer = Tokenizer(inputCol="text", outputCol="raw_words")
    df = tokenizer.transform(df)

    # Lemmatize
    df = df.withColumn("words", lemmatize_udf(df["raw_words"]))
    df = df.drop("raw_words")
    df = df.filter(df.words.isNotNull())
    df = df.filter(size(df.words) > 0)

    # Remove stopwords
    all_stopwords = StopWordsRemover.loadDefaultStopWords(
        "english") + StopWordsRemover.loadDefaultStopWords("italian")
    remover = StopWordsRemover(inputCol="words",
                               outputCol="filtered_words",
                               stopWords=all_stopwords)
    df = remover.transform(df)
    df = df.filter(size(df.filtered_words) > 0)

    # Remove words smaller than 5 letters and numbers
    df = df.withColumn("filtered_words_2", remove_udf(df["filtered_words"]))
    df = df.filter(size(df.filtered_words_2) > 0)

    # Automatically choose the number of features
    if auto_feats:
        num_feat = df.select("filtered_words_2").withColumn(
            "tokens",
            explode("filtered_words_2")).select("tokens").distinct().count()

    hashingTF = HashingTF(inputCol="filtered_words_2",
                          outputCol="rawFeatures",
                          numFeatures=num_feat)
    featurizedData = hashingTF.transform(df)

    idf = IDF(inputCol="rawFeatures", outputCol="features", minDocFreq=5)
    idfModel = idf.fit(featurizedData)

    rescaledData = idfModel.transform(featurizedData)
    rescaledData = rescaledData.select("_c1", "filtered_words_2", "features")

    # Write the dataset to disk. Split it if needed.
    if _split:

        # Count the total rows of the file and generate
        # a shuffled version of the dataset.
        total_rows = rescaledData.count()
        shuffled_df = rescaledData.orderBy(rand(1))

        # Generate datasets with this many rows.
        for s in [1000, 10000, 100000, 1000000]:
            if s <= total_rows:
                new_df = shuffled_df.limit(s)
                new_df.write.parquet(output + "/slice_" + str(s))

        rescaledData.write.parquet(output + "/complete")

    else:
        rescaledData.write.parquet(output)
Example #22

# COMMAND ----------

from pyspark.ml.feature import RegexTokenizer
rt = RegexTokenizer()\
  .setInputCol("Description")\
  .setOutputCol("DescOut")\
  .setPattern(" ")\
  .setGaps(False)\
  .setToLowercase(True)
rt.transform(sales.select("Description")).show(20, False)


# COMMAND ----------

from pyspark.ml.feature import StopWordsRemover
englishStopWords = StopWordsRemover.loadDefaultStopWords("english")
stops = StopWordsRemover()\
  .setStopWords(englishStopWords)\
  .setInputCol("DescOut")
stops.transform(tokenized).show()


# COMMAND ----------

from pyspark.ml.feature import NGram
unigram = NGram().setInputCol("DescOut").setN(1)
bigram = NGram().setInputCol("DescOut").setN(2)
unigram.transform(tokenized.select("DescOut")).show(False)
bigram.transform(tokenized.select("DescOut")).show(False)

Example #23
def main(topic):
    # 1. Load Data, Combine keywords, tweet_urls by news_url, Add id
    messages = spark.readStream.format('kafka') \
        .option('kafka.bootstrap.servers', 'localhost:9092') \
        .option('subscribe', topic)\
        .option('failOnDataLoss', 'false')\
        .option('auto.offset.reset', 'earliest')\
        .load()
    values = messages.select(messages['value'].cast('string'))
    words = values.select(
        functions.explode(functions.split(values.value, ';')).alias("words"))
    data = words.withColumn('text', functions.split('words',
                                                    ',')).select('text')
    data = data.withColumn('news_id', data['text'][0])
    data = data.withColumn('news_keyword', data['text'][1])
    data = data.withColumn('news_url', data['text'][2])
    data = data.withColumn('tweet_url', data['text'][3])
    data = data.withColumn('retweet_count', data['text'][4])
    data = data.withColumn('favorite_count', data['text'][4])
    # data = data.dropDuplicates(['tweet_url', 'news_url'])
    data = data.withColumn('favorite_count',
                           data['favorite_count'].cast(types.IntegerType()))
    # data = data.groupby('tweet_id', 'news_keyword', 'tweet_url', 'news_url').max('favorite_count')
    # data = data.select('news_url', 'tweet_url').distinct()
    # data = data.groupby('news_url', 'tweet_url').agg(
    #     functions.collect_set('news_keyword').alias('news_keywords')
    # )
    #functions.collect_set('tweet_url').alias('tweet_urls')

    # udf_uuid = functions.udf(lambda: str(uuid.uuid4()), returnType=types.StringType())
    # data = data.withColumn('news_id', udf_uuid())
    # data = data.select('news_id', 'news_keyword', 'favorite_count', 'news_url', 'tweet_url')
    print('finish load data')

    # 2. Scrape the news_text and tweets_comments
    data = data.withColumn('tweets_infos', udf_get_comments(data['tweet_url']))
    data = data.withColumn('tweets_info',
                           functions.explode(data['tweets_infos']))
    data = data.select('news_id', 'news_keyword', 'retweet_count',
                       'favorite_count', 'news_url', 'tweet_url',
                       data.tweets_info[0].alias('like_counts'),
                       data.tweets_info[1].alias('comment_time'),
                       data.tweets_info[2].alias('tweets_comment'))
    data = data.withColumn('like_counts',
                           data['like_counts'].cast(types.IntegerType()))
    # data = data.where(data['news_text'].isNotNull() & (functions.length(data['news_text']) > 0))
    data = data.where(data['tweets_comment'].isNotNull() & (functions.length(
        data['tweets_comment']) > 0))  # filter reviews with no text
    print('finish scrap')

    # 3. ML pipeline: Tokenization (with Regular Expression) and Remove Stop Words
    data = data.withColumn('sentiment_score',
                           udf_sentiment_score(data['tweets_comment']))
    # news_regex_tokenizer = RegexTokenizer(inputCol='news_text', outputCol='news_words', pattern='[^A-Za-z]+')
    # news_stopwords_remover = StopWordsRemover(inputCol='news_words',
    #                                           outputCol='news_tokens',
    #                                           stopWords=StopWordsRemover.loadDefaultStopWords('english'))
    tweets_regex_tokenizer = RegexTokenizer(inputCol='tweets_comment',
                                            outputCol='tweets_words',
                                            pattern='[^A-Za-z]+')
    tweets_stopwords_remover = StopWordsRemover(
        inputCol='tweets_words',
        outputCol='tweets_tokens',
        stopWords=StopWordsRemover.loadDefaultStopWords('english'))
    # count_vectorizer = CountVectorizer(inputCol='filtered_words', outputCol='features')
    nlp_pipeline = Pipeline(
        stages=[tweets_regex_tokenizer, tweets_stopwords_remover])
    model = nlp_pipeline.fit(data)
    nlp_data = model.transform(data).select('news_id', 'news_keyword',
                                            'retweet_count', 'favorite_count',
                                            'news_url', 'tweet_url',
                                            'tweets_comment', 'tweets_tokens',
                                            'sentiment_score', 'like_counts',
                                            'comment_time')

    # 4. Select Features
    # nlp_data = nlp_data.withColumn('news_tokens', udf_morphy(nlp_data['news_tokens']))
    nlp_data = nlp_data.withColumn('tweets_tokens',
                                   udf_morphy(nlp_data['tweets_tokens']))
    # nlp_data = nlp_data.select(nlp_data['business_id'], review['stars'], udf_morphy(review['tokens']).alias('tokens'))
    # nlp_data = nlp_data.where(functions.size(nlp_data['news_tokens']) > 0)
    nlp_data = nlp_data.where(functions.size(nlp_data['tweets_tokens']) > 0)

    # 5. Calculate Weighted Sentiment Scores
    # nlp_data = nlp_data.withColumn('sentiment_score', udf_sentiment_score(nlp_data['tweets_tokens']))
    nlp_data = nlp_data.withColumn('tweets_tokens',
                                   functions.concat_ws(' ', 'tweets_tokens'))
    # nlp_data = nlp_data.withColumn('classify_tokens', udf_classify_tokens(review['tokens']))
    nlp_data_score = nlp_data.groupby(
        'news_id', 'news_keyword', 'retweet_count', 'favorite_count',
        'news_url', 'tweet_url').agg(
            functions.collect_list('tweets_tokens').alias('tweets_tokens'),
            functions.collect_list('sentiment_score').alias(
                'sentiment_scores'),
            functions.collect_list('like_counts').alias('like_counts'),
            functions.collect_list('comment_time').alias('comment_time'),
            (functions.sum(nlp_data.sentiment_score * nlp_data.like_counts) /
             functions.sum(
                 nlp_data.like_counts)).alias('weighted_sentiment_score'))
    nlp_data_score = nlp_data_score.withColumn(
        'tweets_tokens', functions.concat_ws(' ', 'tweets_tokens'))
    nlp_data_score = nlp_data_score.withColumn(
        'tweets_tokens', udf_classify_tokens(nlp_data_score['tweets_tokens']))
    # nlp_data_score = nlp_data_score.withColumn('tweets_tokens', functions.split('tweets_tokens', '\s+'))
    # nlp_data_score = nlp_data_score.withColumn('news_tokens', functions.concat_ws(' ', 'news_tokens'))
    nlp_data_score = nlp_data_score.withColumn(
        'comment_time', functions.concat_ws(',', 'comment_time'))
    print('finish scores')

    # 6. Save

    # nlp_data_score.write.format("com.mongodb.spark.sql.DefaultSource")\
    #     .mode("append")\
    #     .option("uri", "mongodb://127.0.0.1:27017/news.news_data")\
    #     .option("replaceDocument", False)\
    #     .option("database", "news")\
    #     .option("collection", "news_data").save()

    nlp_data_score = nlp_data_score.withColumn(
        'dl_value',
        functions.to_json(
            functions.struct(
                [nlp_data_score[x] for x in nlp_data_score.columns])))
    stream = nlp_data_score.select(nlp_data_score.news_id.alias("key"),
                                   nlp_data_score.dl_value.alias("value"))\
        .writeStream\
        .format('kafka')\
        .outputMode('complete')\
        .option('kafka.bootstrap.servers', 'localhost:9092')\
        .option("topic", "nlp-2")\
        .option("checkpointLocation", "../check")\
        .start()

    # stream = nlp_data_score.writeStream.format('console').outputMode('complete').start()
    # stream = nlp_data_score.writeStream\
    #     .format('json')\
    #     .outputMode('update')\
    #     .option("path", "/Users/Dao/Documents/BigData/733/project/twitter/streaming/data")\
    #     .option("checkpointLocation", "../check")\
    #     .start()
    stream.awaitTermination()
Example #24
    # df = spark.sql('SELECT * FROM df WHERE df.text NOT CONTAINS "Kardashian" AND NOT CONTAINS "Jenner")
    #tokenize string
    print('Tokenizing Text...')
    tokenizer = Tokenizer(inputCol='text', outputCol='tokens')
    df = tokenizer.transform(df)

    wnl = WordNetLemmatizer()
    print('Lemmatizing Text...')
    lemma_udf = udf(lambda row: lemma(row), ArrayType(StringType()))
    df = df.withColumn('lemmed_tokens', lemma_udf(df.tokens))

    # remove stopwords
    print('Removing Stop Words...')
    swr = StopWordsRemover(inputCol='lemmed_tokens',
                           outputCol='filtered_tokens')
    stops = swr.loadDefaultStopWords('english')
    stops = [stop.replace('’', '') for stop in stops]
    for word in [
            'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
            'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
            'ha', 'wa', 'getty', 'image', 'ap', 'pictwittercom'
    ]:
        stops.append(word)
    swr.setStopWords(stops)
    df = swr.transform(df)

    df = df.select('post_id', 'filtered_tokens')

    print("Post Stop Word Remove")
    df.take(1)
Example #25
                              "ham_spam").withColumnRenamed("_c1", "text")
data = data.withColumn("cleantext", removePunctuationUDF(data.text))

#split data into training and test
split_data = data.randomSplit([0.8, 0.2])
training_data = split_data[0]
test_data = split_data[1]
print("Training data: ", training_data.count())
print("Test data: ", test_data.count())

# COMMAND ----------

from pyspark.ml.feature import StringIndexer, Tokenizer, StopWordsRemover, CountVectorizer, HashingTF, IDF
stringIndexer = StringIndexer(inputCol="ham_spam", outputCol="label")
tokenizer = Tokenizer(inputCol="cleantext", outputCol="words")
add_stopwords = StopWordsRemover.loadDefaultStopWords('english')
stopwordsRemover = StopWordsRemover(
    inputCol="words", outputCol="filtered").setStopWords(add_stopwords)
hashingTF = HashingTF(inputCol="words",
                      outputCol="rawFeatures",
                      numFeatures=200)
idf = IDF(inputCol="rawFeatures", outputCol="features")

# COMMAND ----------

Example #26

import shutil
import os
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml import Pipeline, PipelineModel
from azureml.core.run import Run
"""
from pyspark.sql.functions import lit, rand
data = true_df.withColumn('fake', lit(0)).union(
    fake_df.withColumn('fake', lit(1))).orderBy(rand())
# Check data
data.groupBy('fake').count().show()
# View concatenated result
data.show(10)

#%%
"""
4.NLP Process
"""
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, CountVectorizer, IDF
from pyspark.ml.feature import StringIndexer, VectorAssembler
StopWordsRemover.loadDefaultStopWords('english')

# 1.Tokenize the title with a regular expression, ignoring emoji etc.
title_tokenizer = RegexTokenizer(inputCol='title',
                                 outputCol='title_words',
                                 pattern='\\W',
                                 toLowercase=True)

# 2.Remove stopwords from title
title_sw_remover = StopWordsRemover(inputCol='title_words',
                                    outputCol='title_sw_removed')

# 3.Compute Term frequency from title
title_count_vectorizer = CountVectorizer(inputCol='title_sw_removed',
                                         outputCol='tf_title')
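The numbered stages above would presumably be chained together; a minimal sketch, assuming the `data` DataFrame with a `title` column built earlier in this script:

from pyspark.ml import Pipeline

title_pipeline = Pipeline(stages=[title_tokenizer,
                                  title_sw_remover,
                                  title_count_vectorizer])
title_features = title_pipeline.fit(data).transform(data)
title_features.select("title", "tf_title").show(5, truncate=False)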
Example #27
# COMMAND ----------

from pyspark.ml.feature import RegexTokenizer
rt = RegexTokenizer()\
  .setInputCol("Description")\
  .setOutputCol("DescOut")\
  .setPattern(" ")\
  .setGaps(False)\
  .setToLowercase(True)
rt.transform(sales.select("Description")).show(20, False)

# COMMAND ----------

from pyspark.ml.feature import StopWordsRemover

englishStopWords = StopWordsRemover.loadDefaultStopWords("english")
stops = StopWordsRemover()\
  .setStopWords(englishStopWords)\
  .setInputCol("DescOut")
stops.transform(tokenized).show()

# COMMAND ----------

from pyspark.ml.feature import NGram

unigram = NGram(n=1, inputCol="DescOut", outputCol="unigrams")
unigramDataFrame = unigram.transform(tokenized)
unigramDataFrame.select("unigrams").show(truncate=False)

# COMMAND ----------
Example #28

                    type=int)
args = parser.parse_args()


spark = SparkSession.builder \
    .appName("get frequent words") \
    .config("spark.sql.catalogImplementation", "in-memory") \
    .getOrCreate()
sc = spark.sparkContext

if args.stop_word_lang or args.stop_word_file:
    sentences = sc.textFile(
        args.txtPath).map(lambda row: Row(sentence_raw=row.split(" "))).toDF()
    stopWords = []
    if args.stop_word_lang:
        stopWords += StopWordsRemover.loadDefaultStopWords(args.stop_word_lang)
    if args.stop_word_file:
        stopWords += sc.textFile(args.stop_word_file).collect()
    remover = StopWordsRemover(inputCol="sentence_raw",
                               outputCol="sentence",
                               stopWords=stopWords)
    sentences = remover.transform(sentences)
else:
    sentences = sc.textFile(
        args.txtPath).map(lambda row: Row(sentence=row.split(" "))).toDF()

words = sentences.rdd.map(lambda row: row.sentence).flatMap(lambda x: x)

wordCounts = words.map(lambda w: (w, 1)) \
    .reduceByKey(add) \
    .filter(lambda w: w[1] >= args.min_count) \
Example #29
#Check if all the params were passed
if (len(sys.argv) > 5):
    #Setup the sparkContext
    sc = SparkContext(appName="SparkClustering-emonto15-dperezg1")
    spark = SparkSession(sc)
    #Read from hdfs and save using a schema (path,text)
    files = sc.wholeTextFiles("hdfs://" + sys.argv[1])
    schema = StructType([
        StructField("path", StringType(), True),
        StructField("text", StringType(), True)
    ])
    df = spark.createDataFrame(files, schema)
    #Divide the text into an array of words
    tokenizer = Tokenizer(inputCol="text", outputCol="tokens")
    #Setup the language to remove the stopwords
    stop_words = StopWordsRemover.loadDefaultStopWords(sys.argv[4])
    #Read from column tokens (which is the output of the tokenizer object) and save a new array of words without the stopwords
    stopWords = StopWordsRemover(inputCol="tokens",
                                 outputCol="stopWordsRemovedTokens",
                                 stopWords=stop_words)
    #Creates a hash of each word and its frequency in each document, keeping only the number of features set by the numFeatures parameter
    hashingTF = HashingTF(inputCol="stopWordsRemovedTokens",
                          outputCol="rawFeatures",
                          numFeatures=int(sys.argv[3]))
    #Calculates the inverse document frequency; words that appear in fewer than minDocFreq documents are ignored
    idf = IDF(inputCol="rawFeatures", outputCol="features", minDocFreq=1)
    #Initialize the kmeans with a specific K
    kmeans = KMeans(k=int(sys.argv[2]))
    #Declare the assembly line (pipeline) of transformations for the dataset
    pipeline = Pipeline(stages=[tokenizer, stopWords, hashingTF, idf, kmeans])
    #Apply the assembly line to the dataset
Example #30
winz = 5
word_nsamps = 10
rm_stop = True
language = "spanish"
# Added the jar driver to the $SPARK_HOME/jars directory:
# Downloaded from: https://bitbucket.org/xerial/sqlite-jdbc/downloads/sqlite-jdbc-3.8.6.jar
spark = SparkSession.builder.getOrCreate()

df = spark.read.text(input_txt).select(removePunctuation(F.col('value')))
tokenizer = Tokenizer(inputCol="sentence", outputCol="toks" if rm_stop else "tokens")  
df = tokenizer.transform(df)
if rm_stop:
    remover = StopWordsRemover(inputCol=tokenizer.getOutputCol(), 
                            outputCol="tokens",
                            stopWords=None if language == "english" else 
                                StopWordsRemover.loadDefaultStopWords(language))
    df = remover.transform(df)

# Now the magic of windowing the text with F.explode()
win = windowing(winz)
decompose = win.get_udf()
df = df.withColumn("slides", decompose("tokens")) \
        .withColumn("exploded", F.explode("slides")) \
        .withColumn("word", get_mid("exploded")) \
        .withColumn("window", rm_mid("exploded"))
        
df = df.drop(*[c for c in df.columns if not c in ["word", "window"]])

indexer = StringIndexer(inputCol="word", outputCol="label")
df = indexer.fit(df).transform(df)  #.persist(StorageLevel.DISK_ONLY)#MEMORY_AND_DISK)
Example #31
def create_w2v_model():
    spark = SparkSession \
        .builder \
        .appName("SimpleApplication") \
        .config("spark.executor.memory", "2g") \
        .config("spark.driver.memory", "2g") \
        .config("spark.memory.offHeap.enabled", True) \
        .config("spark.memory.offHeap.size", "2g") \
        .getOrCreate()

    input_file = spark.sparkContext.wholeTextFiles(PATH)

    print("Preparing data (1)...")
    prepared_data = input_file.map(lambda x: (x[0], remove_punctuation(x[1])))

    print("Preparing data (2)...")
    df = prepared_data.toDF()

    print("Preparing data (3)...")
    prepared_df = df.selectExpr('_2 as text')

    print("Tokenizing...")
    tokenizer = Tokenizer(inputCol='text', outputCol='words')
    words = tokenizer.transform(prepared_df)

    print("Removing stop words...")
    stop_words = StopWordsRemover.loadDefaultStopWords('russian')
    remover = StopWordsRemover(inputCol="words",
                               outputCol="filtered",
                               stopWords=stop_words)

    print("Building the model...")
    word2Vec = Word2Vec(vectorSize=50,
                        inputCol='words',
                        outputCol='result',
                        minCount=2)
    model = word2Vec.fit(words)

    print("Saving the model...")
    today = datetime.datetime.today()
    model_name = today.strftime("model/kurs_model")
    print("Model " + model_name + " saved")
    model.save(model_name)

    spark.stop()
Example #32
df = df.withColumn('text', f.regexp_replace('text', r'http\S+\s*', ''))
df = df.withColumn('text', f.regexp_replace('text', 'RT|cc', ''))
df = df.withColumn('text', f.regexp_replace('text', r'@\S+', ''))

geonames = sqlContext.read.format('com.databricks.spark.csv').options(
    header='true', inferschema='true').load(
        's3://emrtestticksnl/TestRic/NL_geonames_triGram_townprovinces.csv'
    ).select(f.lower(f.col('asciiname')), 'latitude', 'longitude')
geonames = geonames.withColumnRenamed("lower(asciiname)", "placename")
geonames = geonames.dropDuplicates(['placename'])

regexTokenizer = RegexTokenizer(inputCol="text",
                                outputCol="words",
                                pattern="\\W")
# stop words
dutchwords = StopWordsRemover.loadDefaultStopWords('dutch')
englishwords = StopWordsRemover.loadDefaultStopWords('english')
add_stopwords = ["http", "https", "amp", "rt", "t", "c", "the", "co", "@"
                 ] + dutchwords + englishwords
stopwordsRemover = StopWordsRemover(
    inputCol="words", outputCol="filtered").setStopWords(add_stopwords)
# bag of words count

pipeline = Pipeline(stages=[regexTokenizer, stopwordsRemover])
# Fit the pipeline to training documents.
pipelineFit = pipeline.fit(df)
dataset = pipelineFit.transform(df)

########Stemmer definition

dataset1 = dataset.select("filtered")