def tf_idf_feature(wordsData):
    hashingTF = HashingTF(inputCol="filtered", outputCol="rawFeatures", numFeatures=20)
    featurizedData = hashingTF.transform(wordsData)
    idf = IDF(inputCol="rawFeatures", outputCol="features")
    idfModel = idf.fit(featurizedData)
    rescaledData = idfModel.transform(featurizedData)
    for features_label in rescaledData.select("features", "id").take(3):
        print(features_label)
def textPredict(request):
    """6.文本聚类,热度预测"""
    label = request.POST['label']
    title = request.POST['title']

    conf = SparkConf().setAppName('textPredict').setMaster('spark://HP-Pavilion:7077')
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)
    """处理数据集,生成特征向量"""
    dfTitles = sqlContext.read.parquet('data/roll_news_sina_com_cn.parquet')
    print(dfTitles.dtypes)
    tokenizer = Tokenizer(inputCol="title", outputCol="words")
    wordsData = tokenizer.transform(dfTitles)
    hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=20)
    featurizedData = hashingTF.transform(wordsData)
    idf = IDF(inputCol="rawFeatures", outputCol="features")
    idfModel = idf.fit(featurizedData)
    rescaledData = idfModel.transform(featurizedData)
    rescaledData.show()
    for features_label in rescaledData.select("features", "rawFeatures").take(3):
        print(features_label)
    """决策树模型培训"""
    labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(rescaledData)
    featureIndexer =\
        VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4).fit(rescaledData)
    (trainingData, testData) = rescaledData.randomSplit([0.7, 0.3])
    dt = DecisionTreeClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures")
    pipeline = Pipeline(stages=[labelIndexer, featureIndexer, dt])
    model = pipeline.fit(trainingData)
    """模型测试"""
    predictions = model.transform(testData)
    predictions.show()
    predictions.select("prediction", "indexedLabel", "features").show(5)
    """用户数据测试,单个新闻测试"""
    sentenceData = sqlContext.createDataFrame([
        (label,title),
    ],['label',"title"])
    tokenizer = Tokenizer(inputCol="title", outputCol="words")
    wordsData = tokenizer.transform(sentenceData)
    hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=20)
    featurizedData = hashingTF.transform(wordsData)
    rescaledData = idfModel.transform(featurizedData)
    myprediction = model.transform(rescaledData)
    print("==================================================")
    myprediction.show()
    resultList = convertDfToList(myprediction)

    """模型评估"""
    evaluator = MulticlassClassificationEvaluator(
        labelCol="indexedLabel", predictionCol="prediction", metricName="precision")
    accuracy = evaluator.evaluate(predictions)
    print("Test Error = %g " % (1.0 - accuracy))

    treeModel = model.stages[2]
    print(treeModel)

    sc.stop()
    return render(request,{'resultList':resultList})
Example #3
def extract_idf_features(p_df, input_col, output_col):
    """
    Extracts IDF features.
    :param p_df: A DataFrame.
    :param input_col: Name of the input column.
    :param output_col: Name of the output column.
    :return: A DataFrame.    
    """    
    idf = IDF(inputCol=input_col, outputCol=output_col)
    idfModel = idf.fit(p_df)
    return idfModel.transform(p_df)
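# A minimal usage sketch for extract_idf_features (added for illustration; the toy
# DataFrame, column names and SparkSession below are assumptions, not from the original project):
from pyspark.sql import SparkSession
from pyspark.ml.feature import HashingTF, Tokenizer

spark = SparkSession.builder.appName("idf-usage-sketch").getOrCreate()
docs = spark.createDataFrame([(0, "spark is fast"), (1, "spark is scalable")], ["id", "text"])
words = Tokenizer(inputCol="text", outputCol="words").transform(docs)
tf_df = HashingTF(inputCol="words", outputCol="tf", numFeatures=32).transform(words)
tfidf_df = extract_idf_features(tf_df, input_col="tf", output_col="tfidf")
tfidf_df.select("id", "tfidf").show(truncate=False)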
Example #4
def run_tf_idf_spark_ml(df, numFeatures=1 << 20):
    tokenizer = Tokenizer(inputCol="body", outputCol="words")
    wordsData = tokenizer.transform(df)

    hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=numFeatures)
    featurizedData = hashingTF.transform(wordsData)

    idf = IDF(inputCol="rawFeatures", outputCol="features")
    idfModel = idf.fit(featurizedData)

    return idfModel.transform(featurizedData)
def tfidf(dataframe, in_col1, out_col1, in_col2, out_col2, n):

    global idfModel
    
    hashingTF = HashingTF(inputCol=in_col1, outputCol=out_col1, numFeatures=n)
    featurizedData = hashingTF.transform(dataframe)
    idf = IDF(inputCol=in_col2, outputCol=out_col2)
    idfModel = idf.fit(featurizedData)
    dataframe = idfModel.transform(featurizedData)
    
    return dataframe
def create_features(raw_data):
    #Create DataFrame
    data_df = sqlContext.createDataFrame(raw_data.map(lambda r : Row(appid=r[0], price=r[1], sentence=r[2])))
    #Transform sentence into words
    tokenizer = Tokenizer(inputCol='sentence', outputCol='words')
    words_df = tokenizer.transform(data_df)
    #Calculate term frequency
    hashingTF = HashingTF(inputCol='words', outputCol='rawFeatures', numFeatures=5)
    featurized_df = hashingTF.transform(words_df)
    #Calculate inverse document frequency
    idf = IDF(inputCol='rawFeatures', outputCol='features')
    idfModel = idf.fit(featurized_df)
    return idfModel.transform(featurized_df)
def tf_feature_vectorizer(df,no_of_features,ip_col):
    #from pyspark.sql.functions import udf
    #from pyspark.sql.types import *
    output_raw_col = ip_col+"raw_features"
    output_col = ip_col+"features"
    hashingTF = HashingTF(inputCol=ip_col, outputCol=output_raw_col, numFeatures=no_of_features)
    featurizedData = hashingTF.transform(df)
    idf = IDF(inputCol=output_raw_col, outputCol=output_col)
    idfModel = idf.fit(featurizedData)
    rescaled_data = idfModel.transform(featurizedData)
    rescaled_data.show(5)
    print(rescaled_data.count())
    return rescaled_data
Example #8
File: project.py  Project: sam46/Yelper
def makeTFIDF(sc, spark, reviews):
    # count vectorizer and tfidf
    # cv = CountVectorizer(inputCol='words_clean', outputCol='tf')
    # cvModel = cv.fit(reviews)
    # reviews = cvModel.transform(reviews)

    # HashingTF for fewer dimensions:
    hashingtf = HashingTF(inputCol='words_clean', outputCol='tf', numFeatures=1000)
    reviews = hashingtf.transform(reviews)

    # create TF-IDF matrix
    idf = IDF().setInputCol('tf').setOutputCol('tfidf')
    tfidfModel = idf.fit(reviews)
    reviews = tfidfModel.transform(reviews)
    return reviews
Example #9
 def test_idf(self):
     dataset = self.spark.createDataFrame([
         (DenseVector([1.0, 2.0]),),
         (DenseVector([0.0, 1.0]),),
         (DenseVector([3.0, 0.2]),)], ["tf"])
     idf0 = IDF(inputCol="tf")
     self.assertListEqual(idf0.params, [idf0.inputCol, idf0.minDocFreq, idf0.outputCol])
     idf0m = idf0.fit(dataset, {idf0.outputCol: "idf"})
     self.assertEqual(idf0m.uid, idf0.uid,
                      "Model should inherit the UID from its parent estimator.")
     output = idf0m.transform(dataset)
     self.assertIsNotNone(output.head().idf)
     # Test that parameters transferred to Python Model
     check_params(self, idf0m)
Example #10
File: ml.py  Project: ribonj/lsir
def tf_idf(df, column):
    """
    Compute TF-IDF of a corpus.
    Transformation: array<string> --> vector
    """ 
    df = preprocess(df, column) # text to list of terms
    (df, voc) = count(df, column)
    
    # creates a TF-IDF model and uses it to compute the feature vector.
    idf = IDF(inputCol=column, outputCol='_'+column)
    model = idf.fit(df)
    df = model.transform(df)
    
    df = replace(df, column, '_'+column)
    return (df, voc)
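# A hedged usage sketch for tf_idf above. It relies on the module's own preprocess(),
# count() and replace() helpers, which are not shown in this snippet, so the call is left
# as a comment ('reviews_df' and its 'review' column are assumptions):
#
#   reviews_df, vocabulary = tf_idf(reviews_df, 'review')
#   reviews_df.select('review').show(5, truncate=False)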
Example #11
 def append_tf_idf(self, df):
     """
     Calculate term frequency and inverse document frequency,
      based here on tokens that appear at least once per hour. This compares how often
      each token appears relative to the other tokens. Not used for the main purpose of the project.
     Args:
         :param df: Dataframe parameter.
     Returns:
         :return:  Dataframe with term frequency and inverse document frequency added in the columns
                     'rawFeatures' and 'features' respectively.
     """
     #Create TF column.
     hashingTF = HashingTF(inputCol="tokens", outputCol="rawFeatures", numFeatures=100000)
     tf = hashingTF.transform(df)
     tf.persist(StorageLevel.MEMORY_AND_DISK)
     #Create IDF column.
     idf = IDF(inputCol="rawFeatures", outputCol="features")
     idfModel = idf.fit(tf)
     tfidf = idfModel.transform(tf)
     return tfidf
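# A hedged usage sketch for append_tf_idf above. The enclosing class is not shown in this
# snippet, so 'analyzer' stands in for an instance of it, and 'visits_df' is assumed to be a
# DataFrame that already has an array<string> column named 'tokens' (both are assumptions):
#
#   tfidf_df = analyzer.append_tf_idf(visits_df)
#   tfidf_df.select('tokens', 'rawFeatures', 'features').show(5, truncate=False)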
Example #12
def get_top_words(dataset, signatures):
    # TODO: Use stemmers for the languages supported by http://www.nltk.org/api/nltk.stem.html#nltk.stem.snowball.SnowballStemmer
    # Or translate comments in other languages using the free Microsoft Translate API.
    sentenceData = dataset.filter(dataset['user_comments'].isNotNull() & (dataset['useragent_locale'].isNull() | (functions.instr(dataset['useragent_locale'], 'en') == 1)))

    if sentenceData.rdd.isEmpty():
        return dict()

    # Tokenize comments.
    tokenizer = Tokenizer(inputCol='user_comments', outputCol='words')
    wordsData = tokenizer.transform(sentenceData)

    # Remove duplicate words from comments.
    wordsData = wordsData.rdd.map(lambda p: (p['signature'], list(set(p['words'])))).reduceByKey(lambda x, y: x + y).toDF(['signature', 'words'])

    if wordsData.rdd.isEmpty():
        print("[WARNING]: wordsData is empty, sentenceData wasn't.")
        return dict()

    # Clean comment words by removing punctuation and stemming.
    def clean_word(w):
        return re.sub(r'[,.;:?!\[\]{}/\\]', '', stem(w.lower()))

    wordsData = wordsData.rdd.map(lambda p: (p['signature'], [clean_word(w) for w in p['words']])).toDF(['signature', 'words'])

    # XXX: Useless with TF-IDF?
    remover = StopWordsRemover(inputCol='words', outputCol='filtered')
    cleanWordsData = remover.transform(wordsData)

    cv = CountVectorizer(inputCol='filtered', outputCol='features')
    model = cv.fit(cleanWordsData)
    featurizedData = model.transform(cleanWordsData)

    idf = IDF(inputCol='features', outputCol='tfidf_features')
    idfModel = idf.fit(featurizedData)
    rescaledData = idfModel.transform(featurizedData)

    bests_per_doc = rescaledData.filter(rescaledData.signature.isin(signatures)).rdd.map(lambda p: (p['signature'], sorted(zip(p['tfidf_features'].indices, p['tfidf_features'].values), key=lambda i: i[1], reverse=True)[:10])).collect()

    return dict([(signature, [model.vocabulary[best] for best, val in bests]) for signature, bests in bests_per_doc])
Example #13
def train_model_sentences_with_person():
    sentences_with_person_collection = get_db_collection_object(
        'SentencesWithPerson')

    with open("sentences_with_person.txt", "w",
              encoding='utf-8') as file_sentences_with_person:
        for sen in sentences_with_person_collection.find():
            file_sentences_with_person.write('{0}\n'.format(sen['sentence']))

    spark = SparkSession \
        .builder \
        .appName("SentenceProcessor") \
        .getOrCreate()

    input_data = spark.sparkContext.textFile('./sentences_with_person.txt')
    prepared_data = input_data.map(lambda x: (x, len(x)))
    prepared_data = prepared_data.filter(lambda x: x[1] > 0)

    prepared_df = prepared_data.toDF().selectExpr('_1 as sentence',
                                                  '_2 as length')
    # prepared_df.show(truncate=False)

    tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
    words_data = tokenizer.transform(prepared_df)
    # words_data.show(truncate=False)

    # Filter the tokens, keeping only words
    filtered_words_data = words_data.rdd.map(
        lambda x: (x[0], x[1], get_only_words(x[2])))
    filtered_df = filtered_words_data.toDF().selectExpr(
        '_1 as sentence', '_2 as length', '_3 as words')
    # filtered_df.show()

    # Remove stop words (conjunctions, prepositions, pronouns, etc.)
    stop_words = stopwords.words('russian')
    remover = StopWordsRemover(inputCol='words',
                               outputCol='filtered',
                               stopWords=stop_words)
    filtered = remover.transform(filtered_df)

    #
    normalize_words_data = filtered.rdd.map(
        lambda x: (x[0], x[1], x[2], normalization_sentence(x[3])))
    normalized_df = normalize_words_data.toDF().selectExpr(
        '_1 as sentence', '_2 as length', '_3 as words',
        '_4 as normalize_words')
    # normalized_df.show()

    #
    vectorizer = CountVectorizer(inputCol='normalize_words',
                                 outputCol='raw_features').fit(normalized_df)
    featurized_data = vectorizer.transform(normalized_df)
    featurized_data.cache()

    #
    idf = IDF(inputCol='raw_features', outputCol='features')
    idf_model = idf.fit(featurized_data)
    rescaled_data = idf_model.transform(featurized_data)

    # Build a Word2Vec model
    word2Vec = Word2Vec(vectorSize=300,
                        minCount=0,
                        inputCol='normalize_words',
                        outputCol='result')
    doc2vec_pipeline = Pipeline(stages=[tokenizer, word2Vec])
    model = word2Vec.fit(rescaled_data)
    w2v_df = model.transform(rescaled_data)
    # w2v_df.show(truncate=False)

    # print(model.findSynonyms('бочаров', 2).show())

    # sc = spark.sparkContext
    path = './models/model_person'
    #
    # print(sc, path)
    model.write().overwrite().save(path)

    #m = Word2Vec.load('./models/model_person/')
    # pickle.dump(model, './models/model_person/mp.model')

    spark.stop()
Example #14
# Step 1: split text field into words
tokenizer = Tokenizer(inputCol="text_clean", outputCol="text_token")
fulldata = tokenizer.transform(fulldata)
print "Tokenized Text:"
print fulldata.head()
print "################"
# Step 2: compute term frequencies
hashingTF = HashingTF(inputCol="text_token", outputCol="tf", numFeatures=10000)
fulldata = hashingTF.transform(fulldata)
print "TERM frequencies:"
print fulldata.head()
print "################"
# Step 3: compute inverse document frequencies
idf = IDF(inputCol="tf", outputCol="tf_idf")
idfModel = idf.fit(fulldata)
fulldata = idfModel.transform(fulldata)
print "IDF :"
print fulldata.head()
print "################"

#OK we do the same for the search term
# Step 1: split text field into words
tokenizer = Tokenizer(inputCol="search_term_clean", outputCol="search_token")
fulldata = tokenizer.transform(fulldata)
print "Tokenized Search:"
print fulldata.head()
print "################"
# Step 2: compute term frequencies
hashingTF = HashingTF(inputCol="search_token",
                      outputCol="tf_s",
Example #15
   .load(os.path.realpath("Womens Clothing E-Commerce Reviews.csv"))

reviews = data.map(lambda x : x['Review Text']).filter(lambda x: x is not None)

tokens = reviews                                                   \
    .map( lambda document: document.strip().lower())               \
    .map( lambda document: re.split(" ", document))          \
    .map( lambda word: [x for x in word if x.isalpha()])           \
    .map( lambda word: [x for x in word if len(x) > 3] )           \
    .map( lambda word: [x for x in word if x not in StopWords])    \
    .zipWithIndex()



row_rdd = rdd1.map(lambda x: Row(x))
df = sqlContext.createDataFrame(row_rdd, ['numbers'])
df.show()
cv = CountVectorizer(inputCol="words", outputCol="features", vocabSize=3, minDF=2.0)
model = cv.fit(df)
result = model.transform(df)
result.show(truncate=False)
from pyspark.sql.functions import monotonically_increasing_id
res = df.withColumn("id", monotonically_increasing_id())
df_txts = sqlContext.createDataFrame(row_rdd, ["list_of_words",'index'])

idf = IDF(inputCol="raw_features", outputCol="features")
idfModel = idf.fit(result_cv)
result_tfidf = idfModel.transform(result_cv) 



Example #16
def transform(spark, s3_input_data, s3_output_train_data,
              s3_output_validation_data, s3_output_test_data):
    print('Processing {} => {}'.format(s3_input_data, s3_output_train_data,
                                       s3_output_validation_data,
                                       s3_output_test_data))

    schema = StructType([
        StructField('marketplace', StringType(), True),
        StructField('customer_id', StringType(), True),
        StructField('review_id', StringType(), True),
        StructField('product_id', StringType(), True),
        StructField('product_parent', StringType(), True),
        StructField('product_title', StringType(), True),
        StructField('product_category', StringType(), True),
        StructField('star_rating', IntegerType(), True),
        StructField('helpful_votes', IntegerType(), True),
        StructField('total_votes', IntegerType(), True),
        StructField('vine', StringType(), True),
        StructField('verified_purchase', StringType(), True),
        StructField('review_headline', StringType(), True),
        StructField('review_body', StringType(), True),
        StructField('review_date', StringType(), True)
    ])

    df_csv = spark.read.csv(path=s3_input_data,
                            sep='\t',
                            schema=schema,
                            header=True,
                            quote=None)
    df_csv.show()

    # This dataset should already be clean, but always good to double-check
    print('Showing null review_body rows...')
    df_csv.where(col('review_body').isNull()).show()

    df_csv_cleaned = df_csv.na.drop(subset=['review_body'])
    df_csv_cleaned.where(col('review_body').isNull()).show()

    tokenizer = Tokenizer(inputCol='review_body', outputCol='words')
    wordsData = tokenizer.transform(df_csv_cleaned)

    hashingTF = HashingTF(inputCol='words',
                          outputCol='raw_features',
                          numFeatures=1000)
    featurizedData = hashingTF.transform(wordsData)

    # While applying HashingTF only needs a single pass to the data, applying IDF needs two passes:
    # 1) compute the IDF vector
    # 2) scale the term frequencies by IDF
    # Therefore, we cache the result of the HashingTF transformation above to speed up the 2nd pass
    featurizedData.cache()

    # spark.mllib's IDF implementation provides an option for ignoring terms
    # which occur in less than a minimum number of documents.
    # In such cases, the IDF for these terms is set to 0.
    # This feature can be used by passing the minDocFreq value to the IDF constructor.
    idf = IDF(inputCol='raw_features', outputCol='features')  #, minDocFreq=2)
    idfModel = idf.fit(featurizedData)
    features_df = idfModel.transform(featurizedData)
    features_df.select('star_rating', 'features').show()

    num_features = 300
    pca = PCA(k=num_features, inputCol='features', outputCol='pca_features')
    pca_model = pca.fit(features_df)
    pca_features_df = pca_model.transform(features_df).select(
        'star_rating', 'pca_features')
    pca_features_df.show(truncate=False)

    standard_scaler = StandardScaler(inputCol='pca_features',
                                     outputCol='scaled_pca_features')
    standard_scaler_model = standard_scaler.fit(pca_features_df)
    standard_scaler_features_df = standard_scaler_model.transform(
        pca_features_df).select('star_rating', 'scaled_pca_features')
    standard_scaler_features_df.show(truncate=False)

    expanded_features_df = (standard_scaler_features_df.withColumn(
        'f', to_array(col('scaled_pca_features'))).select(
            ['star_rating'] + [col('f')[i] for i in range(num_features)]))
    expanded_features_df.show()

    train_df, validation_df, test_df = expanded_features_df.randomSplit(
        [0.9, 0.05, 0.05])

    train_df.write.csv(path=s3_output_train_data, header=None, quote=None)  #,
    print('Wrote to output file:  {}'.format(s3_output_train_data))

    validation_df.write.csv(path=s3_output_validation_data,
                            header=None,
                            quote=None)  #,
    print('Wrote to output file:  {}'.format(s3_output_validation_data))

    test_df.write.csv(path=s3_output_test_data, header=None, quote=None)
    print('Wrote to output file:  {}'.format(s3_output_test_data))
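# A self-contained sketch of the minDocFreq option mentioned in the comments inside
# transform() above (the toy reviews and column names here are assumptions, not data from
# the original job): terms that appear in fewer than minDocFreq documents get an IDF of 0.
from pyspark.sql import SparkSession
from pyspark.ml.feature import HashingTF, IDF, Tokenizer

spark = SparkSession.builder.appName('idf-min-doc-freq-sketch').getOrCreate()
toy_df = spark.createDataFrame(
    [(0, 'good product good price'),
     (1, 'good packaging'),
     (2, 'arrived late')], ['id', 'review_body'])
toy_words = Tokenizer(inputCol='review_body', outputCol='words').transform(toy_df)
toy_tf = HashingTF(inputCol='words', outputCol='raw_features', numFeatures=64).transform(toy_words)
toy_tf.cache()  # cached so the two IDF passes (computing the IDF vector, then rescaling) reuse it
idf_min2 = IDF(inputCol='raw_features', outputCol='features', minDocFreq=2)
idf_min2.fit(toy_tf).transform(toy_tf).select('id', 'features').show(truncate=False)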
Example #17
def main(root_path):
    timeStamp = str(int(time()))
    # todo change this for full run
    num = 1000  # 128915 is the total
    out_file_name = '../out/output-' + timeStamp + "-" + str(num) + '.txt'
    out_file = open(out_file_name, 'w')

    start = time()
    spark = init_spark()
    json_files = read_json_files(root_path, spark, num)
    data = get_body_text(spark, json_files)
    print("data reading done")

    # clean the data
    word_clean_up_F = F.udf(lambda x: clean_up(x), StringType())
    data = data.withColumn("body_text_cleaned", word_clean_up_F("body_text"))
    data = data.select("body_text_cleaned")
    print("data processing done")

    tokenizer = Tokenizer(inputCol="body_text_cleaned", outputCol="words")
    token_DataFrame = tokenizer.transform(data)
    token_DataFrame = token_DataFrame.select("words")

    # Remove stopwords
    remover = StopWordsRemover(inputCol="words", outputCol="filtered")
    cleaned_DataFrame = remover.transform(token_DataFrame)
    cleaned_DataFrame = cleaned_DataFrame.select('filtered')

    # Count vectorizer
    cv_tmp = CountVectorizer(inputCol="filtered", outputCol="count_features")
    cvmodel = cv_tmp.fit(cleaned_DataFrame)
    count_dataframe = cvmodel.transform(cleaned_DataFrame)
    count_dataframe = count_dataframe.select('count_features')

    # TF-IDF Vectorizer
    tfidf = IDF(inputCol="count_features", outputCol="features")
    tfidfmodel = tfidf.fit(count_dataframe)
    tfidf_dataframe = tfidfmodel.transform(count_dataframe).select("features")

    print("Ready to fit with the LDA model")
    # Fit the LDA Model
    num_topics = 5
    max_iterations = 20
    lda_start = time()
    lda = LDA(seed=1, optimizer="em", k=num_topics, maxIter=max_iterations)
    lda_model = lda.fit(tfidf_dataframe)
    lda_transformed = lda_model.transform(tfidf_dataframe)
    lda_end = time()
    print("LDA complete")
    # joblib.dump(lda_model, 'lda.csv')

    # Get terms per topic
    topics = lda_model.topicsMatrix()
    vocabArray = cvmodel.vocabulary

    wordNumbers = 15  # number of words per topic
    topicIndices = lda_model.describeTopics(maxTermsPerTopic=wordNumbers).rdd.map(tuple)

    topics_final = topicIndices.map(lambda topic: topic_render(topic, wordNumbers, vocabArray)).collect()

    for topic in range(len(topics_final)):
        print("Topic " + str(topic) + ":")
        print("Topic " + str(topic) + ":", file=out_file)
        print(topics_final[topic])
        print(topics_final[topic], file=out_file)

    print("Full runtime : {} min. ".format((time() - start) / 60))
    print("LDA runtime : {} min. ".format((lda_end - lda_start) / 60))
    print("Check" + out_file.name)

    cleaned_DataFrame.cache()
    lda_transformed.cache()

    # Data Visualization
    data = format_data_to_pyldavis(cleaned_DataFrame, cvmodel, lda_transformed, lda_model)
    print("Preparing data with pyLDAvis ...")
    filter_bad_docs(data)
    py_lda_prepared_data = pyLDAvis.prepare(**data)
    file_name = '../out/data-viz-' + timeStamp + '.html'
    print("Saving pyLDAvis html page ...")
    pyLDAvis.save_html(py_lda_prepared_data, file_name)
    pyLDAvis.show(py_lda_prepared_data)
    spark.stop()
Example #18
    nb_features = max(nb_features_train, nb_features_test)
    print(nb_features)
    nb_features = 5000
    print("\nDone : Tokenization training and test sets")

    ###########################################################################
    #########             TF IDF Training and Test Set                #########

    #training set
    hashingTF = HashingTF(inputCol="words",
                          outputCol="rawFeatures",
                          numFeatures=nb_features)
    featurizedData = hashingTF.transform(wordsData)

    idf = IDF(inputCol="rawFeatures", outputCol="features")
    idfModel = idf.fit(featurizedData)
    rescaledData = idfModel.transform(featurizedData)

    #rescaledData.select("label", "features").show()

    #test_set
    hashingTF_test = HashingTF(inputCol="words",
                               outputCol="rawFeatures",
                               numFeatures=nb_features)
    featurizedData_test = hashingTF_test.transform(wordsData_test)

    #    idf = IDF(inputCol="rawFeatures", outputCol="features")
    #    idfModel = idf.fit(featurizedData_test)
    rescaledData_test = idfModel.transform(featurizedData_test)

    rescaled_test_df = rescaledData_test.select("features")
Example #19
# stemmer = LancasterStemmer()
# stemmer_udf = udf(
#     lambda tokens: [stemmer.stem(token) for token in tokens],
#     ArrayType(StringType())
# )
# data_df_stemmed = data_df_filtered.withColumn("wordsStemmed", stemmer_udf("words"))

# # hashing term frequency
# hashing_term_freq = \
#     HashingTF(inputCol="wordsStemmed", outputCol="featuresRaw", numFeatures=5000)
# data_df_tf = hashing_term_freq.transform(data_df_stemmed)

# hashing term frequency
hashing_term_freq = \
    HashingTF(inputCol="words", outputCol="featuresRaw", numFeatures=5000)
data_df_tf = hashing_term_freq.transform(data_df_filtered)

# inverse document frequency
inv_doc_freq = IDF(inputCol="featuresRaw", outputCol="features", minDocFreq=5)
inv_doc_freq_fitted = inv_doc_freq.fit(data_df_tf)
data_df_tfidf = inv_doc_freq_fitted.transform(data_df_tf)

# encode classes
indexer = StringIndexer(inputCol="category", outputCol="label")
indexer_fitted = indexer.fit(data_df_tfidf)
data_prepared_df = indexer_fitted.transform(data_df_tfidf)

# predict
log_reg_fitted = LogisticRegressionModel.load("output/reviews_model.model")
test_pred_df = log_reg_fitted.transform(data_prepared_df)
Example #20
+-----+--------------------+--------------------+
"""
words_data.show(truncate=False)
"""
+-----+-----------------------------------+------------------------------------------+
|label|sentence                           |words                                     |
+-----+-----------------------------------+------------------------------------------+
|0.0  |Hi I heard about Spark             |[hi, i, heard, about, spark]              |
|0.0  |I wish java could use case classes |[i, wish, java, could, use, case, classes]|
|1.0  |Logistic regression models are neat|[logistic, regression, models, are, neat] |
+-----+-----------------------------------+------------------------------------------+
"""
hashing_tf = HashingTF(inputCol='words', outputCol='rawFeatures')
featurized_data = hashing_tf.transform(words_data)
idf=IDF(inputCol='rawFeatures',outputCol='features')
idf_model=idf.fit(featurized_data)
rescaled_data=idf_model.transform(featurized_data)
rescaled_data.select('label','features').show()
"""
+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(262144,[24417,49...|
|  0.0|(262144,[20719,24...|
|  1.0|(262144,[13671,91...|
+-----+--------------------+
"""
rescaled_data.select('label','features').show(truncate=False)
"""
+-----+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|label|features                                                                                                                                                                                        |
Example #21
def text_clustering(dataFrame,
                    k_value,
                    w2v=False,
                    w2v_value=None,
                    seed=2137,
                    normalize=True,
                    plot=True):
    """
    args:
        -dataFrame: spark Data Frame
        -k_value: number of clusters in k-means algorithm
        -w2v: if True word2Vec is used and w2v_value must be specified, otherwise tf-idf is used
        -w2v_value: number of parameters to be returned with Word2Vec
        -seed: seed
        -normalize: should normalization after Word2Vec be performed?
        -plot: if True, clusters are visualized with the use of PCA
        
    """

    #Data preprocessing
    tokenizer = Tokenizer(inputCol="text", outputCol="words_raw")
    dataFrame = tokenizer.transform(dataFrame)
    remover = StopWordsRemover(inputCol="words_raw", outputCol="words")
    dataFrame = remover.transform(dataFrame)

    if w2v and w2v_value is None:
        raise ValueError('You have to give w2v_values parameter')

    if not w2v:  #tf-idf
        hashingTF = HashingTF(inputCol="words_raw",
                              outputCol="rawFeatures",
                              numFeatures=20)
        featurizedData = hashingTF.transform(dataFrame)
        idf = IDF(inputCol="rawFeatures", outputCol="features")
        idfModel = idf.fit(featurizedData)
        memes_df = idfModel.transform(featurizedData)

    else:  #word2vec
        word2Vec = Word2Vec(vectorSize=w2v_value,
                            seed=seed,
                            inputCol="words",
                            outputCol="features_unnormalized")
        model_w2v = word2Vec.fit(dataFrame)
        memes_df = model_w2v.transform(dataFrame)
        model_w2v.write().overwrite().save("hdfs:///models/model_w2v")

        if normalize:
            scaler = StandardScaler(inputCol="features_unnormalized",
                                    outputCol="features",
                                    withStd=True,
                                    withMean=True)
            scalerModel = scaler.fit(memes_df)
            memes_df = scalerModel.transform(memes_df)

    #kmeans
    kmeans = KMeans(k=k_value, seed=seed)
    model_kmeans = kmeans.fit(memes_df)
    memes_df = model_kmeans.transform(memes_df)
    model_kmeans.write().overwrite().save("hdfs:///models/model_kmeans")

    #clustering evaluation
    evaluator = ClusteringEvaluator()
    silhouette = evaluator.evaluate(memes_df)

    centers = model_kmeans.clusterCenters()

    if plot:

        import matplotlib.pyplot as plt  #virtual environment might have problems if imported "the classical" way

        #pca
        pca = PCA(k=2, inputCol="features", outputCol="pcaFeatures")
        model_pca = pca.fit(memes_df)
        memes_df = model_pca.transform(memes_df)
        #memes_df.show()

        centers_pca = [None] * len(centers)
        for i in range(len(centers)):
            centers_pca[i] = np.multiply(model_pca.pc.toArray().T,
                                         centers[i]).sum(axis=1)
        centers_pca = np.array(centers_pca)

        #plot section
        split_col = functions.split(memes_df["pcaFeatures"].cast(StringType()),
                                    ',')
        memes_df = memes_df.withColumn(
            'x',
            translate(split_col.getItem(0), "[", "").cast(DoubleType()))
        memes_df = memes_df.withColumn(
            'y',
            translate(split_col.getItem(1), "]", "").cast(DoubleType()))
        #memes_df.show(truncate = False)

        df = memes_df.toPandas()
        groups = df.groupby('prediction')
        fig, ax = plt.subplots()
        ax.margins(0.05)
        for name, group in groups:
            ax.plot(group.x,
                    group.y,
                    marker='o',
                    linestyle='',
                    ms=5,
                    label=name)
            ax.text(centers_pca[name, 0],
                    centers_pca[name, 1],
                    s=name,
                    fontsize=10)
        ax.legend()
        ax.title.set_text("k={0}, wn={1}, Silhouette={2}".format(
            k_value, w2v_value, silhouette))
        plt.show()
        print("PCA, explained variance= {0}".format(
            model_pca.explainedVariance))

    return memes_df
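# A hedged usage sketch for text_clustering above ('memes.json' and its 'text' column are
# assumptions, not from the original project; note the function persists its fitted models
# to hdfs:///models/, so it expects a working HDFS setup):
#
#   memes_df = spark.read.json('memes.json')   # must expose a string column named 'text'
#   clustered = text_clustering(memes_df, k_value=4, w2v=True, w2v_value=100, plot=False)
#   clustered.groupBy('prediction').count().show()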
Example #22
df_seg.show()
# Turn the segmented text into an ArrayType() column of words
tokenizer = Tokenizer(inputCol='seg', outputCol='words')
df_seg_arr = tokenizer.transform(df_seg).select('words', 'label')
df_seg_arr.show()

# Feature processing of the tokenized text
tf = HashingTF(numFeatures=1 << 18,
               binary=False,
               inputCol='words',
               outputCol='rawfeatures')
df_tf = tf.transform(df_seg_arr).select('rawfeatures', 'label')
df_tf.show()

idf = IDF(inputCol='rawfeatures', outputCol='features')
idfModel = idf.fit(df_tf)
df_tf_idf = idfModel.transform(df_tf)
df_tf_idf.show()

# Process the label column
stringIndexer = StringIndexer(inputCol='label',
                              outputCol='indexed',
                              handleInvalid='error')
indexer = stringIndexer.fit(df_tf_idf)
df_tf_idf_lab = indexer.transform(df_tf_idf).select('features', 'indexed')
df_tf_idf_lab.show()

# Split into training and test sets
splits = df_tf_idf_lab.randomSplit([0.7, 0.3], 123)
train = splits[0]
test = splits[1]
                        outputCol="bucketized_features")
bucketed_df = bucketizer.transform(b_df)
bucketed_df.show()

# Text tokenization
from pyspark.ml.feature import Tokenizer

sentence_df = spark.createDataFrame(
    [(1, "Introduction to Spark MLlib"),
     (2, "MLlib includes libraries for classification and regression"),
     (3, "Also supports pipelines")], ["id", "sentence"])

sent_tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
sent_tokenized_df = sent_tokenizer.transform(sentence_df)

sent_tokenized_df.show()

# TF-IDF
from pyspark.ml.feature import HashingTF, IDF

# TF
hashingTF = HashingTF(inputCol="words",
                      outputCol="raw_features",
                      numFeatures=20)
sent_hf_tf_df = hashingTF.transform(sent_tokenized_df)
print(sent_hf_tf_df.take(1))

# IDF
idf = IDF(inputCol="raw_features", outputCol="idf_features")
idf_model = idf.fit(sent_hf_tf_df)
tf_idf_df = idf_model.transform(sent_hf_tf_df)
Example #24
        "filtered").setOutputCol("features")
    model = word2vec.fit(train_set)
    train_set1 = model.transform(train_set)
    test_set1 = model.transform(test_set)

    # now use tf-idf
    hashingTF = HashingTF().setNumFeatures(1000).setInputCol(
        "filtered").setOutputCol("rawFeatures")
    idf = IDF().setInputCol("rawFeatures").setOutputCol(
        "features").setMinDocFreq(10)
    pipeline = Pipeline(stages=[hashingTF, idf])
    train_set2 = pipeline.fit(train_set).transform(train_set)
    pipeline2 = Pipeline(stages=[hashingTF])
    test_set2 = pipeline2.fit(train_set).transform(
        test_set)  # use trainset idf to transform test set
    test_set2 = idf.fit(test_set2).transform(test_set2)

    # ======================================================================================
    #                                       Fit Model
    # ======================================================================================
    def fit_nb(train):
        rf = RandomForestClassifier(numTrees=20,
                                    maxDepth=20,
                                    labelCol="label",
                                    seed=42)
        model = rf.fit(train)
        return model

    def get_predictions(model, test):
        result = model.transform(
            test.select('features'))  # result is a DataFrame
Example #25
# COMMAND ----------

from pyspark.ml.feature import HashingTF, IDF
tf = HashingTF()\
  .setInputCol("DescOut")\
  .setOutputCol("TFOut")\
  .setNumFeatures(10000)
idf = IDF()\
  .setInputCol("TFOut")\
  .setOutputCol("IDFOut")\
  .setMinDocFreq(2)

# COMMAND ----------

idf.fit(tf.transform(tfIdfIn)).transform(tf.transform(tfIdfIn)).show(10, False)

# COMMAND ----------

from pyspark.ml.feature import Word2Vec
# Input data: Each row is a bag of words from a sentence or document.
documentDF = spark.createDataFrame(
    [("Hi I heard about Spark".split(" "), ),
     ("I wish Java could use case classes".split(" "), ),
     ("Logistic regression models are neat".split(" "), )], ["text"])
# Learn a mapping from words to Vectors.
word2Vec = Word2Vec(vectorSize=3,
                    minCount=0,
                    inputCol="text",
                    outputCol="result")
model = word2Vec.fit(documentDF)
                            minDF=10.0)
result_cv = count_vec.fit(refined_df).transform(refined_df)

# result_cv.select(['Clothing ID','refined_tokens','raw_features']).show(4,False)
vocabArray = count_vec.fit(refined_df).vocabulary

# Tf-idf
# from pyspark.ml.feature import HashingTF,IDF

# hashing_vec=HashingTF(inputCol='refined_tokens',outputCol='tf_features')
# hashing_df=hashing_vec.transform(refined_df)
# hashing_df.select(['refined_tokens','tf_features']).show(4,False)

tf_idf_vec = IDF(inputCol='tf_features', outputCol='features')

tf_idf_df = tf_idf_vec.fit(result_cv).transform(result_cv)
# tf_idf_df.select(['user_id','tf_idf_features']).show(4,False)

# tf_idf_df.cache()
tf_idf_df.persist(storageLevel=pyspark.StorageLevel.MEMORY_AND_DISK)

from pyspark.ml.clustering import LDA, LDAModel

num_topics = 10
max_iterations = 10
lda = LDA(k=num_topics, maxIter=max_iterations)
# lda_model = lda.fit(tf_idf_df[['index','features']].rdd.map(list))

model = lda.fit(tf_idf_df)

ll = model.logLikelihood(tf_idf_df)
Example #27
Words_Rdd = tokenizer.transform(X_Rdd)

#  Display the first ten rows of the DataFrame with 
#  three columns: label, message and the tokenized words
Words_Rdd.show(10)

# Apply CountVectorizer, which converts the word tokens
# into vectors of token counts.
token_counts = CountVectorizer(inputCol="words", outputCol="new_features")
model = token_counts.fit(Words_Rdd)
featurized_Rdd = model.transform(Words_Rdd)
featurized_Rdd.show(10)

# Apply Term Frequency–Inverse Document Frequency (TF-IDF)
idf = IDF(inputCol="new_features", outputCol="features")
idfModel = idf.fit(featurized_Rdd)
rescaled_Rdd = idfModel.transform(featurized_Rdd)
rescaled_Rdd.select("label", "features").show(10)

# Split the dataset into Training data  = 80% and Testing data = 20%
# Set seed for reproducibility zero
seed = 0  
train_df, test_df = rescaled_Rdd.randomSplit([0.8,0.2],seed)

# Number of records of each dataframe
train_df.count()
test_df.count()

# FIT the Naïve Bayes classifier
nb = NaiveBayes()
paramGrid_nb = ParamGridBuilder().addGrid(nb.smoothing, np.linspace(0.3, 10, 5)).build()
Example #28
#After tokenizer, we added one col(Words to table)
words_Data = tokenizer_word.transform(reviews)
#words_Data.show(5)

#Count the number of words
countvectorizer = CountVectorizer(inputCol="Words", outputCol="raw_features")
model = countvectorizer.fit(words_Data)

#Add count to our table
get_count_data = model.transform(words_Data)
#get_count_data.show(5)

#calculate TF-IDF
idf_value = IDF(inputCol="raw_features", outputCol="idf_value")
idf_model = idf_value.fit(get_count_data)
final_rescaled_data = idf_model.transform(get_count_data)
#final_rescaled_data.show(5)

# final_rescaled_data.select("idf_value").show()
#vocabulary list
vocabulary = model.vocabulary


#Block 3
def extract(value):
    return {
        vocabulary[i]: float(tfidf_value)
        for (i, tfidf_value) in zip(value.indices, value.values)
    }
Example #29
#Merge product with words

fulldata = sqlContext.createDataFrame(fulldata.rdd.map((enlargeTokenAndClean)))                      
print "words enlarge with desc and title"
print fulldata.head()
print "################"                                    

# Step 2: compute term frequencies
hashingTF = HashingTF(inputCol="wordsF", outputCol="tf")
fulldata = hashingTF.transform(fulldata)
print "TERM frequencies:"
print fulldata.head()
print "################"
# Step 3: compute inverse document frequencies
idf = IDF(inputCol="tf", outputCol="tf_idf")
idfModel = idf.fit(fulldata)
fulldata = idfModel.transform(fulldata)
print "IDF :"
print fulldata.head()
print "################"

# Step 4 new features column / rename old
fulldata = sqlContext.createDataFrame(fulldata.rdd.map(addFeatureLen))
fulldata = sqlContext.createDataFrame(fulldata.rdd.map(newFeatures))
print "NEW features column :"
print fulldata.head()
print "################"


# Step 5: ALTERNATIVE ->ADD column with number of terms as another feature
#fulldata = sqlContext.createDataFrame(fulldata.rdd.map(
def main():
    sc = SparkSession.builder.appName("SentencingAnalyzer")\
        .config("spark.driver.memory", "10G")\
        .getOrCreate()

    # main df
    cases = sc.read.json("../data/sentencingCases2.jsonl")
    df = cleanDf(cases)

    # read categorized csv
    categorizedCsv = sc.read.csv("../data/categorized.csv", header=True)
    categorizedCsv = categorizedCsv.select(
        'caseName',
        f.split(f.col("type"), " - ").alias('offenseType'), 'duration1',
        'sentenceType1')

    # create the search df
    df = extractOffenseKeywords(df)
    df.cache()
    dfSearch = sc.createDataFrame(searchData, ["term", "offenseKeywords"])

    # CLASSIFICATION OF OFFENSE
    hashingTF = HashingTF(inputCol="offenseKeywords",
                          outputCol="rawFeatures",
                          numFeatures=1000)
    result = hashingTF.transform(df)
    resultSearch = hashingTF.transform(dfSearch)

    idf = IDF(inputCol="rawFeatures", outputCol="features")
    idfModel = idf.fit(result)
    rescaledData = idfModel.transform(result).filter(
        f.size('offenseKeywords') > 0)
    idfModelSearch = idf.fit(resultSearch)
    rescaledDataSearch = idfModelSearch.transform(resultSearch)

    mh = MinHashLSH(inputCol="features",
                    outputCol="hashes",
                    seed=12345,
                    numHashTables=20)
    modelMH = mh.fit(rescaledData)
    transformedData = modelMH.transform(rescaledData)

    modelMHSearch = mh.fit(rescaledDataSearch)
    transformedDataSearch = modelMH.transform(rescaledDataSearch)

    categorizedDf = modelMHSearch.approxSimilarityJoin(
        transformedDataSearch,
        transformedData,
        0.89,
        distCol="JaccardDistance")
    distanceDf = categorizedDf.select([f.col('datasetA.term')] + [f.col('datasetB.caseID')] + [f.col("JaccardDistance")]) \
        .orderBy('caseID', 'JaccardDistance')
    distanceDf = distanceDf.groupBy('caseID').agg(
        f.collect_list('term').alias('predictedOffences'),
        f.collect_list('JaccardDistance').alias('JaccardDistances'))
    distanceDf.cache()
    distanceDf.show()

    # EVALUATE CATEGORIZATION AGAINST MANUAL CATEGORIZATION
    distanceDfEval = distanceDf.join(
        categorizedCsv, distanceDf.caseID == categorizedCsv.caseName)
    distanceDfEval = distanceDfEval.filter(
        distanceDfEval.offenseType[0] != "N/A").filter(
            distanceDfEval.offenseType[0] != "multiple party sentence")
    calcuateDifferenceInPredictedVsActualOffences_udf = f.udf(
        calcuateDifferenceInPredictedVsActualOffences, FloatType())
    distanceDfEval = distanceDfEval.withColumn(
        "error",
        calcuateDifferenceInPredictedVsActualOffences_udf(
            distanceDfEval.predictedOffences, distanceDfEval.offenseType))
    calcuateDifferenceInPredictedVsActualOffencesPercentage_udf = f.udf(
        calcuateDifferenceInPredictedVsActualOffencesPercentage, FloatType())
    distanceDfEval = distanceDfEval.withColumn(
        "pctCorrect",
        calcuateDifferenceInPredictedVsActualOffencesPercentage_udf(
            distanceDfEval.predictedOffences, distanceDfEval.offenseType))
    distanceDfEval.select('caseID', 'predictedOffences', 'offenseType',
                          'JaccardDistances', 'error',
                          'pctCorrect').show(200, truncate=False)
    rmse = (distanceDfEval.groupBy().agg(f.sum('error')).collect()[0][0] /
            distanceDfEval.count())**(1.0 / 2)
    print("Offense category RMSE:", rmse)
    pctCorrectOffense = (distanceDfEval.groupBy().agg(
        f.sum('pctCorrect')).collect()[0][0] / distanceDfEval.count()) * 100
    print("Percentage of offenses correctly categorized: ", pctCorrectOffense)
def login():
    message = ''
    e_result = ''
    s_result = ''
    t_result = ''
    j_result = ''

    if request.method == 'POST':
        post = request.form.get('text')  # access the data inside

        if len(post) >= 100:

            test = pd.DataFrame([post], columns=['post'])

            newrows = []

            def filter_text(post):
                """Decide whether or not we want to use the post."""
                # should remove link only posts here
                return len(post) > 0

            reg_punc = re.compile('[%s]' % re.escape(string.punctuation))

            def preprocess_text(post):
                """Remove any junk we don't want to use in the post."""

                # Remove links
                post = re.sub(r'http\S+', '', post, flags=re.MULTILINE)

                # All lowercase
                post = post.lower()

                # Remove puncutation
                post = reg_punc.sub('', post)

                return post

            def create_new_rows(row):
                posts = row['post']
                rows = []

                # for p in posts:
                p = preprocess_text(posts)
                rows.append({'post': p})
                return rows

            for index, row in test.iterrows():
                newrows += create_new_rows(row)

            test = pd.DataFrame(newrows)

            df = spark.createDataFrame(test)

            # Create a length column to be used as a future feature
            df = df.withColumn('length', length(df['post']))

            types = [
                'INFJ', 'ENTP', 'INTP', 'INTJ', 'ENTJ', 'ENFJ', 'INFP', 'ENFP',
                'ISFP', 'ISTP', 'ISFJ', 'ISTJ', 'ESTP', 'ESFP', 'ESTJ', 'ESFJ'
            ]
            types = [x.lower() for x in types]

            tokenizer = Tokenizer(inputCol="post", outputCol="words")
            tokenized = tokenizer.transform(df)

            # Remove stop words
            stopwordList = types
            stopwordList.extend(StopWordsRemover().getStopWords())
            stopwordList = list(set(stopwordList))  # optional
            remover = StopWordsRemover(inputCol="words",
                                       outputCol="filtered",
                                       stopWords=stopwordList)
            newFrame = remover.transform(tokenized)

            # Run the hashing term frequency
            hashing = HashingTF(inputCol="filtered", outputCol="hashedValues")
            # Transform into a DF
            hashed_df = hashing.transform(newFrame)

            # Fit the IDF on the data set
            idf = IDF(inputCol="hashedValues", outputCol="idf_token")
            idfModel = idf.fit(hashed_df)
            rescaledData = idfModel.transform(hashed_df)

            # Create feature vectors
            #idf = IDF(inputCol='hash_token', outputCol='idf_token')
            clean_up = VectorAssembler(inputCols=['idf_token', 'length'],
                                       outputCol='features')
            output = clean_up.transform(rescaledData)

            ei_model = NaiveBayesModel.load("static/models/EI_Predictor.h5")
            sn_model = NaiveBayesModel.load("static/models/SN_Predictor.h5")
            tf_model = NaiveBayesModel.load("static/models/TF_Predictor.h5")
            jp_model = NaiveBayesModel.load("static/models/JP_Predictor.h5")

            test_e = ei_model.transform(output)
            e = test_e.toPandas()["prediction"].values[0]
            if e == 0:
                e_result = "I"
            else:
                e_result = "E"
            test_s = sn_model.transform(output)
            s = test_s.toPandas()["prediction"].values[0]
            if s == 0:
                s_result = "N"
            else:
                s_result = "S"
            test_t = tf_model.transform(output)
            t = test_t.toPandas()["prediction"].values[0]
            if t == 0:
                t_result = "F"
            else:
                t_result = "T"
            test_j = jp_model.transform(output)
            j = test_j.toPandas()["prediction"].values[0]
            if j == 0:
                j_result = "P"
            else:
                j_result = "J"

        else:
            message = "Please tell us more about yourself!"

    return render_template('index.html',
                           message=message,
                           test_e=e_result,
                           test_s=s_result,
                           test_t=t_result,
                           test_j=j_result)
Example #33
text = remover.transform(text)
text.show(5)

ngramer = NGram(n=2, inputCol='filtered_words', outputCol='ngrams')
text = ngramer.transform(text)
text.show(5)

count_vec = CountVectorizer(inputCol=ngramer.getOutputCol(),
                            outputCol='ft_features')
count_vec_model = count_vec.fit(text)
vocab = count_vec_model.vocabulary
text = count_vec_model.transform(text)
text.show(5)

idf = IDF(inputCol=count_vec.getOutputCol(), outputCol='features')
text = idf.fit(text).transform(text)

lda = LDA(featuresCol=idf.getOutputCol(), k=5, maxIter=10)
lda_model = lda.fit(text)

topics = lda_model.describeTopics()
# topics_words = topics.rdd.map(lambda x: x['termIndices']).map(lambda x:[vocab[i] for i in x]).collect()
get_topics_words = F.udf(lambda x: [vocab[i] for i in x],
                         ArrayType(StringType()))
topics = topics.withColumn('topic_words',
                           get_topics_words(F.col('termIndices')))
topics.show()

text = lda_model.transform(text)
text.show(5)
spark_df = spark_df.drop('count')
spark_df = spark_df.selectExpr("predicted_category as predicted_category_table", "predict_score as predict_score")

#Tokenizing and Vectorizing
tok = Tokenizer(inputCol="cleaned_hm", outputCol="words")
review_tokenized = tok.transform(spark_train)

stopword_rm = StopWordsRemover(inputCol='words', outputCol='words_nsw')
review_tokenized = stopword_rm.transform(review_tokenized)

cv = CountVectorizer(inputCol='words_nsw', outputCol='tf')
cvModel = cv.fit(review_tokenized)
count_vectorized = cvModel.transform(review_tokenized)

idf_ngram = IDF().setInputCol('tf').setOutputCol('tfidf')
tfidfModel_ngram = idf_ngram.fit(count_vectorized)
tfidf_df = tfidfModel_ngram.transform(count_vectorized)

word_indexer_pc = StringIndexer(inputCol="predicted_category", outputCol="predicted_category_new", handleInvalid="error")

#Splitting the training data into training data and validation data
splits = tfidf_df.randomSplit([0.8,0.2],seed=100)
train = splits[0]
val = splits[1]

#Building the pipeline for the model
hm_assembler = VectorAssembler(inputCols=[ "tfidf"], outputCol="features")
lr = LogisticRegression(maxIter=20, regParam=0.3, elasticNetParam=0,labelCol="predicted_category_new",featuresCol = "features")
hm_pipeline = Pipeline(stages=[hm_assembler, word_indexer_pc, lr])

#To get the best paramter values using CrossValidator
Example #36
from pyspark.sql import SparkSession

spark = SparkSession \
    .builder \
    .appName("tf_idf_sample") \
    .master("local[*]") \
    .getOrCreate()

df1 = spark.createDataFrame([(0, "a a a b b c"), (0, "a b c"),
                             (1, "a c a a d")]).toDF("label", "sentence")

tokenizer = Tokenizer(inputCol="sentence", outputCol="words")

# Split each sentence into words
df2 = tokenizer.transform(df1)

hashingTF = HashingTF(inputCol="words",
                      outputCol="TF-Features",
                      numFeatures=20)
df3 = hashingTF.transform(df2)

df3.cache()

idf = IDF(inputCol="TF-Features", outputCol="Final-Features")
idfModel = idf.fit(df3)

rescaledData = idfModel.transform(df3)
rescaledData.select("words", "TF-Features", "Final-Features").show()

spark.stop()
def add_tfidf(df):
    idf = IDF(inputCol="tf_vector", outputCol="tfidf_vector")
    idf_model = idf.fit(df)
    df_tfidf = idf_model.transform(df)

    return df_tfidf
    def lda_optimal(self,
                    preprocess_file=DEFAULT_PREPROCESSING_OUTPUT,
                    cluster_df=CLUSTER_DF,
                    maxiter=MAXITER,
                    output_file_name=DEFAULT_OUTPUT_FILE,
                    max_term_tagging=m):

        filter_number_udf = udf(
            lambda row: [x for x in row if not self.is_digit(x)],
            ArrayType(StringType()))
        temp = sqlContext.read.parquet(preprocess_file)
        temp = temp.withColumn('no_number_vector_removed',
                               filter_number_udf(col('vector_no_stopw')))
        temp1 = temp.select(temp.paper_id,
                            explode(temp.no_number_vector_removed))
        temp2 = temp1.filter(temp1.col != "")
        temp3 = temp2.groupby("paper_id").agg(
            F.collect_list("col").alias("vector_removed"))
        inner_join = temp3.join(temp, ["paper_id"])
        windowSpec = Window.orderBy(F.col("paper_id"))
        df_final = inner_join.withColumn("id", F.row_number().over(windowSpec))
        df_txts = df_final.select("vector_removed", "id", "paper_id", "doi",
                                  "title", "authors", "abstract",
                                  "abstract_summary", "vector_no_stopw")
        df = sqlContext.read.format("com.databricks.spark.csv").option(
            "header",
            "true").option("inferschema",
                           "true").option("mode",
                                          "DROPMALFORMED").load("CLUSTER_DF")
        df_txts = df.join(df_txts, "paper_id" == "index")

        # TF
        cv = CountVectorizer(inputCol="vector_removed",
                             outputCol="raw_features",
                             vocabSize=5000,
                             minDF=5.0)
        cvmodel = cv.fit(df_txts)
        result_cv = cvmodel.transform(df_txts)
        # IDF
        idf = IDF(inputCol="raw_features", outputCol="features")
        idfModel = idf.fit(result_cv)
        result_tfidf = idfModel.transform(result_cv)

        from pyspark.sql import SparkSession
        from pyspark.sql.types import StructType, StructField, StringType

        spark = SparkSession.builder.appName(
            'SparkByExamples.com').getOrCreate()

        schema = StructType([
            StructField('cluster_id', StringType(), True),
            StructField('tagging', ArrayType(StringType()), True)
        ])

        topic_modeling = spark.createDataFrame(spark.sparkContext.emptyRDD(),
                                               schema)

        distinct_clusters = [
            row.cluster_id for row in
            result_tfidf.select("cluster_id").distinct().sort("cluster_id").collect()
        ]
        full_df = None
        for i in distinct_clusters:
            subset = result_tfidf.filter(result_tfidf.cluster_id == i)
            lda = LDA(k=1, maxIter=maxiter)
            ldaModel = lda.fit(subset)
            output = ldaModel.transform(subset)
            if full_df is None:
                full_df = output
            else:
                full_df = full_df.union(output)
            topics = ldaModel.describeTopics(maxTermsPerTopic=max_term_tagging)
            vocabArray = cvmodel.vocabulary
            ListOfIndexToWords = udf(lambda wl: [vocabArray[w] for w in wl],
                                     ArrayType(StringType()))
            FormatNumbers = udf(lambda nl: ["{:1.4f}".format(x) for x in nl],
                                ArrayType(StringType()))

            taggings = topics.select(
                ListOfIndexToWords(topics.termIndices).alias('words'))
            # collect the tag words so they can be appended as a plain row
            tagging_words = taggings.collect()[0]['words']
            temp = spark.createDataFrame([(str(i), tagging_words)],
                                         ['cluster_id', 'tagging'])
            topic_modeling = topic_modeling.union(temp)

        # output the taggings of each topic
        # a Spark DataFrame has no to_csv; convert to pandas (or use .write.csv) to export
        topic_modeling.toPandas().to_csv(output_file_name)

        return full_df
示例#39
File: analyzer.py  Project: GOVnKOD/KL_KP
def makeWord2VecModel():
    cursor = News.find({})
    text = ""
    for news in cursor:
        text += news['text']
    with open(os.path.join(os.getcwd(), 'word2Vec.txt'), 'w',
              encoding='utf-8') as inputFile:
        inputFile.writelines(text)
    spark = SparkSession.builder.appName("SimpleApplication").getOrCreate()

    # Load the file into an RDD line by line
    input_file = spark.sparkContext.textFile('word2Vec.txt')

    print(input_file.collect())
    prepared = input_file.map(lambda x: ([x]))
    df = prepared.toDF()
    prepared_df = df.selectExpr('_1 as text')

    # Split into tokens
    tokenizer = Tokenizer(inputCol='text', outputCol='words')
    words = tokenizer.transform(prepared_df)

    # Remove stop words
    stop_words = StopWordsRemover.loadDefaultStopWords('russian')
    remover = StopWordsRemover(inputCol='words',
                               outputCol='filtered',
                               stopWords=stop_words)
    filtered = remover.transform(words)

    # Print the stop-word list for Russian
    print(stop_words)

    # Show the filtered table
    filtered.show()

    # Show the 'words' column with tokens before stop-word removal
    words.select('words').show(truncate=False, vertical=True)

    # Show the 'filtered' column with tokens after stop-word removal
    filtered.select('filtered').show(truncate=False, vertical=True)

    # Compute TF values
    vectorizer = CountVectorizer(inputCol='filtered',
                                 outputCol='raw_features').fit(filtered)
    featurized_data = vectorizer.transform(filtered)
    featurized_data.cache()
    vocabulary = vectorizer.vocabulary

    # Show the table with term frequency values
    featurized_data.show()

    # Show the 'raw_features' column of featurized_data
    featurized_data.select('raw_features').show(truncate=False, vertical=True)

    # Print the list of terms in the vocabulary
    print(vocabulary)

    # Compute IDF values
    idf = IDF(inputCol='raw_features', outputCol='features')
    idf_model = idf.fit(featurized_data)
    rescaled_data = idf_model.transform(featurized_data)

    # Show the rescaled_data table
    rescaled_data.show()

    # Show the 'features' column of rescaled_data
    rescaled_data.select('features').show(truncate=False, vertical=True)

    # Build the Word2Vec model
    word2Vec = Word2Vec(vectorSize=3,
                        minCount=0,
                        inputCol='words',
                        outputCol='result')
    model = word2Vec.fit(words)
    w2v_df = model.transform(words)
    w2v_df.show()
    persons = []

    cPersons = db.Persones.find({})
    for secName in cPersons:
        persons.append(secName['sName'])

    # findSynonyms returns a DataFrame of (word, similarity); collect it so the rows unpack cleanly
    synonyms = model.findSynonyms('погибла', 2).collect()

    for word, cosine_distance in synonyms:
        print(str(word))

    spark.stop()
示例#40
	def trainModel(self):
		
		logger.info("Training the model...")		

		query = '''select page_id, max(page_title) as page_title from cooladata where date_range(all) and page_id is not null group by page_id;'''

		def SQLtoURL(query):
    
    			data = query.replace('\n', ' ').replace('\t',' ').replace('   ',' ').replace('  ',' ')
    			return data


		def QueryXXXXX(query, file = None):
   
    			session = Session()
    			response = session.post(data = {'tq': query,}, url = 'https://app.XXXXXX.com/api/v2/projects/115659/cql/', headers = {'Authorization': 'Token dtQvPVejNcSebX1EkU0AqB2TJRXznIgZiDvDu3HR'},)
    			return response.content

		table = json.loads(codecs.decode(QueryXXXXX(SQLtoURL(query)),'utf-8'))['table']
		title_list = [x['c'] for x in table['rows']]
		table_cols = [d['label'] for d in table['cols']]  
		def convert_row(row):
    			rowlist = [d['v'] for d in row]
    			return rowlist

		rd = self.sc.parallelize(title_list).map(convert_row)
		titleData = self.spark.createDataFrame(rd, table_cols)
		titleData = titleData.dropna()
		
		hebrew_stopwords = stop_words()
		def rmv(words):
    			for punc in punctuation:
        			words = words.replace(punc,"")
    			for hword in hebrew_stopwords:
        			words = words.replace(hword, " ")
    			return words

		self.spark.udf.register("rmv", rmv, StringType())
		titleData.registerTempTable("wordstable")
		cleanedSentenceData = self.spark.sql("select page_id, page_title, rmv(page_title) as cleanedSentence from wordstable")
		tokenizer = Tokenizer(inputCol="cleanedSentence", outputCol="words")
		wordsData = tokenizer.transform(cleanedSentenceData)

		cv = CountVectorizer(inputCol="words", outputCol="rawFeatures", minDF = 2.0)
		cvModel = cv.fit(wordsData)
		featurizedData = cvModel.transform(wordsData)

		idf = IDF(inputCol="rawFeatures", outputCol="features")
		idfModel = idf.fit(featurizedData)
		rescaledData = idfModel.transform(featurizedData)

		lda = LDA(k=100)
		ldaModel = lda.fit(rescaledData)
		postFactorizedData = ldaModel.transform(rescaledData)

		norm = Normalizer(inputCol = "topicDistribution", outputCol="normTopicDist")
		scaledFactorizedNormalizedData = norm.transform(postFactorizedData)
		
		self.model = scaledFactorizedNormalizedData
		
		logger.info("model is built!")
def review_to_words(raw_review):
    # 1. Remove HTML markup (function header reconstructed from the call further below)
    review_text = BeautifulSoup(raw_review).text
    #
    # 2. Remove non-letters        
    letters_only = re.sub("[^a-zA-Z]", " ", review_text) 
    #
    # 3. Convert to lower case, split into individual words
    words = letters_only.lower().split()                                            
    # 
    # 4. Remove stop words
    meaningful_words =  [w for w in words if not w in stops]   
    #
    # 5. Join the words back into one string separated by space, 
    # and return the result.
    return " ".join( meaningful_words)   

stops = set(stopwords.words("english")) 
lines = sc.textFile("s3://spark-project-data/labeledTrainData.tsv")
rows = lines.zipWithIndex().filter(lambda (row,index): index > 0).keys()
parts = rows.map(lambda l: l.split("\t"))

review = parts.map(lambda p: Row(id=p[0], label=float(p[1]), 
	review=review_to_words(p[2])))
schemeReview = sqlContext.createDataFrame(review)
tokenizer = Tokenizer(inputCol="review", outputCol="words")
wordsData = tokenizer.transform(schemeReview)
hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=300)
featurizedData = hashingTF.transform(wordsData)
idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)
selectData = rescaledData.select("label","features")
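# Hedged continuation (an assumption, not in the original snippet): with "label"/"features"
# prepared, a classifier such as LogisticRegression could be trained and evaluated like this.
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator

trainDF, testDF = selectData.randomSplit([0.8, 0.2], seed=42)
lrModel = LogisticRegression(maxIter=20, regParam=0.01).fit(trainDF)
lrPredictions = lrModel.transform(testDF)
print(BinaryClassificationEvaluator().evaluate(lrPredictions))  # areaUnderROC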
def main(sc, sqlContext):
    start = timer()

    stpwrds = stopwords.words('english')
    tbl_translate = dict.fromkeys(i for i in xrange(sys.maxunicode) if unicodedata.category(unichr(i)).startswith('S') or unicodedata.category(unichr(i)).startswith('P') or unicodedata.category(unichr(i)).startswith('N'))

    print '---Fetching products---'
    start_i = timer()
    productRDD = sc.parallelize(findProductsByCategory([]))
    print '####took %d seconds' % (timer()-start_i)

    print '---Building corpus---'
    start_i = timer()
    corpusRDD = (productRDD.map(lambda s: (s[0], word_tokenize(s[1].translate(tbl_translate).lower()), s[2], s[3]))
                           .map(lambda s: (s[0], [PorterStemmer().stem(x) for x in s[1] if x not in stpwrds], s[2], s[3] ))
                           .map(lambda s: (s[0], [x[0] for x in pos_tag(s[1]) if x[1] == 'NN' or x[1] == 'NNP'], s[2], s[3]))
                           .cache())
    print '####took %d seconds' % (timer()-start_i)

    print '---Fetching and persisting category and token data---'
    start_i = timer()
    tokens = corpusRDD.flatMap(lambda x: x[1]).distinct().collect()
    numTokens = len(tokens)
    category = productRDD.map(lambda x: x[2]).distinct().collect()
    categoryAndSubcategory = productRDD.map(lambda x: (x[2], x[3])).distinct().collect()
    insertTokensAndCategories(tokens, category, categoryAndSubcategory)
    print '####took %d seconds' % (timer()-start_i)

    print '---Computing TF-IDF for the products---'
    start_i = timer()
    wordsData = corpusRDD.map(lambda s: Row(label=s[0], words=s[1], category=s[2], subcategory=s[3]))
    #persist this so it does not have to be recomputed at prediction time
    wordsDataDF = sqlContext.createDataFrame(wordsData)   

    #persisting for prediction
    wordsDataForPrediction = corpusRDD.map(lambda s: Row(label=s[0], words=s[1], type=s[2]))
    #persist this so it does not have to be recomputed at prediction time
    wordsDataForPredictionDF = sqlContext.createDataFrame(wordsDataForPrediction)   

    if os.path.exists("/home/ubuntu/recsys-tcc-ml/parquet/wordsDataDF.parquet"):
        shutil.rmtree("/home/ubuntu/recsys-tcc-ml/parquet/wordsDataDF.parquet")

    wordsDataForPredictionDF.write.parquet("/home/ubuntu/recsys-tcc-ml/parquet/wordsDataDF.parquet") 

    hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=numTokens)
    idf = IDF(inputCol="rawFeatures", outputCol="features")

    featurizedData = hashingTF.transform(wordsDataDF)
    idfModel = idf.fit(featurizedData)
    rescaledData = idfModel.transform(featurizedData)

    #VSM = rescaledData.map(lambda t: LabeledPoint(categoryAndSubcategory.index((t.category, t.subcategory)), t.features))
    VSM = rescaledData.map(lambda t: LabeledPoint(category.index(t.category), t.features))

    VSMTrain, VSMTest = VSM.randomSplit([8, 2], seed=0L)
    print '####took %d seconds' % (timer()-start_i)


    print '--Building the Naive Bayes model---'
    start_i = timer()
    model = NaiveBayes.train(VSMTrain)

    if os.path.exists("/home/ubuntu/recsys-tcc-ml/models/naivebayes/modelo_categoria"):
        shutil.rmtree("/home/ubuntu/recsys-tcc-ml/models/naivebayes/modelo_categoria")

    model.save(sc, '/home/ubuntu/recsys-tcc-ml/models/naivebayes/modelo_categoria')
    print '####took %d seconds' % (timer()-start_i)

    print '---Testing the Naive Bayes model---'
    start_i = timer()
    # the model was trained with labels indexed into `category`, so map predictions back through `category`
    prediction = VSMTest.map(lambda p : (category[int(model.predict(p.features))], category[int(p.label)]))
    acuraccy = float(prediction.filter(lambda (x, v): x==v).count())/float(prediction.count())
    print 'accuracy of %f' % acuraccy
    print '####took %d seconds' % (timer()-start_i)

    print '---Fetching the posts---'

    start_i = timer()
    posts = list()
    wb = load_workbook(filename = '/home/ubuntu/recsys-tcc-ml/base_sentimentos.xlsx')
    sheet = wb['Menes']
    for row in sheet.iter_rows(row_offset=1):
        post = list()
        for cell in row:
            if cell.value is None:
                break
            post.append(1 if cell.value == 'Positive' or cell.value == 'Neutral' else 0 if cell.value == 'Negative' else removeAccents(cell.value))

        if len(post) > 0:            
            posts.append(tuple(post))

    print '####took %d seconds' % (timer()-start_i)

    print '---Building corpus---'
    start_i = timer()
    postsRDD = sc.parallelize(posts)
    postCorpusRDD = (postsRDD.map(lambda s: (s[1], word_tokenize(s[0].translate(tbl_translate).lower())))
                           .map(lambda s: (s[0], [PorterStemmer().stem(x) for x in s[1] if x not in stpwrds]))
                           .map(lambda s: (s[0], [x[0] for x in pos_tag(s[1]) if x[1] == 'NN' or x[1] == 'NNP']))
                           .cache())

    print '####took %d seconds' % (timer()-start_i)

    print '---Computing TF-IDF for the posts---'
    start_i = timer()
    wordsData = postCorpusRDD.map(lambda s: Row(label=s[0], words=s[1]))
    wordsDataDF = sqlContext.createDataFrame(wordsData)

    hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=numTokens)
    idf = IDF(inputCol="rawFeatures", outputCol="features")

    featurizedData = hashingTF.transform(wordsDataDF)
    idfModel = idf.fit(featurizedData)
    rescaledData = idfModel.transform(featurizedData)   

    VSM = rescaledData.map(lambda t: LabeledPoint(t.label, t.features))
    VSMTrain, VSMTest = VSM.randomSplit([8, 2], seed=0L)
    print '####took %d seconds' % (timer()-start_i)

    print '--Building the SVM model---'
    start_i = timer()
    model = SVMWithSGD.train(VSMTrain, iterations=100)
    
    if os.path.exists("/home/ubuntu/recsys-tcc-ml/models/svm"):
        shutil.rmtree("/home/ubuntu/recsys-tcc-ml/models/svm")

    model.save(sc, "/home/ubuntu/recsys-tcc-ml/models/svm")

    print '---Testing the SVM model---'
    start_i = timer()
    prediction = VSMTest.map(lambda p: (p.label, model.predict(p.features)))
    acuraccy = prediction.filter(lambda (v, p): v == p).count() / float(prediction.count())

    print 'accuracy of %f' % acuraccy

    print '####took %d seconds' % (timer()-start_i)

    print 'The whole process took %d seconds' % (timer()-start)
示例#44
train.filter(F.col('toxic') == 1).show(5)
print(
    '2============================================================================================'
)

tokenizer = Tokenizer(inputCol="comment_text", outputCol="words")
wordsData = tokenizer.transform(train)

hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures")
tf = hashingTF.transform(wordsData)

tf.select('rawFeatures').take(2)

idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(tf)
tfidf = idfModel.transform(tf)

tfidf.select("features").first()

REG = 0.1

lr = LogisticRegression(featuresCol="features", labelCol='toxic', regParam=REG)

print(
    '5============================================================================================'
)
tfidf.show(5)
print(
    '5============================================================================================'
)
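# Hedged continuation (an assumption, not shown in the original snippet): fit the
# LogisticRegression defined above on the TF-IDF features and inspect its predictions.
lr_model = lr.fit(tfidf)
lr_model.transform(tfidf).select('toxic', 'probability', 'prediction').show(5)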
示例#45
def main(inputs):

    amazon_schema = types.StructType([
        types.StructField('marketplace', types.StringType()),
        types.StructField('customer_id', types.IntegerType()),
        types.StructField('review_id', types.StringType()),
        types.StructField('product_id', types.StringType()),
        types.StructField('product_parent', types.LongType()),
        types.StructField('product_title', types.StringType()),
        types.StructField('product_category', types.StringType()),
        types.StructField('star_rating', types.IntegerType()),
        types.StructField('helpful_votes', types.IntegerType()),
        types.StructField('total_votes', types.IntegerType()),
        types.StructField('vine', types.StringType()),
        types.StructField('verified_purchase', types.StringType()),
        types.StructField('review_headline', types.StringType()),
        types.StructField('review_body', types.StringType()),
        types.StructField('review_date', types.DateType())
    ])

    input_df = spark.read.parquet(inputs)
    input_df = input_df.repartition(96)
    #input_df.show()
    #print("No of rows in input dataset:",inputs," is:",input_df.count())
    StopWords = stopwords.words("english")
    start_time = time.time()

    tokens = input_df.rdd.map(lambda x: x['review_headline'])\
    .filter(lambda x: x is not None)\
    .map( lambda document: document.strip().lower())\
    .map( lambda document: re.split(" ", document))\
    .map( lambda word: [x for x in word if x.isalpha()])\
    .map( lambda word: [x for x in word if len(x) > 3] )\
    .map( lambda word: [x for x in word if x not in StopWords])\
    .zipWithIndex()

    df_txts = spark.createDataFrame(tokens, ["list_of_words", 'index'])

    # TF
    cv = CountVectorizer(inputCol="list_of_words",
                         outputCol="raw_features",
                         vocabSize=5000,
                         minDF=10.0)
    cvmodel = cv.fit(df_txts)
    result_cv = cvmodel.transform(df_txts)

    # IDF
    idf = IDF(inputCol="raw_features", outputCol="features")
    idfModel = idf.fit(result_cv)
    result_tfidf = idfModel.transform(result_cv)

    #result_tfidf.show()

    num_topics = 10
    max_iterations = 100
    lda = LDA(k=num_topics, maxIter=max_iterations)
    lda_model = lda.fit(result_tfidf.select('index', 'features'))

    wordNumbers = 5
    #topicIndices = sc.parallelize(lda_model.describeTopics(maxTermsPerTopic = wordNumbers))

    topics = lda_model.describeTopics(maxTermsPerTopic=wordNumbers)
    topics.show(truncate=False)
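    # Hedged follow-up (an assumption, not in the original): map the term indices back to
    # words via the CountVectorizer vocabulary so the discovered topics are human-readable.
    vocab = cvmodel.vocabulary
    for row in topics.collect():
        print([vocab[idx] for idx in row.termIndices])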
示例#46
File: spark.py  Project: mosh98/spark
cv = CountVectorizer(inputCol='words_filtered', outputCol='BoW', minDF=2.0)
cv_model = cv.fit(train_filtered)
train_data = cv_model.transform(train_filtered)
dev_data = cv_model.transform(dev_filtered)
test_data = cv_model.transform(test_filtered)

# TODO: Print the vocabulary size (to STDOUT) after filtering out stopwords and very rare tokens
# Hint: Look at the parameters of CountVectorizer
# [FIX ME!] Write code below
print("length of all the vocabulary after stopwrods = ", len(cv_model.vocabulary))



# Create a TF-IDF representation of the data
idf = IDF(inputCol='BoW', outputCol='TFIDF')
idf_model = idf.fit(train_data)
train_tfidf = idf_model.transform(train_data)
dev_tfidf = idf_model.transform(dev_data)
test_tfidf = idf_model.transform(test_data)

# ----- PART III: MODEL SELECTION -----

# Provide information about class labels: needed for model fitting
# Only needs to be defined once for all models (but included in all pipelines)
label_indexer = StringIndexer(inputCol='class_label', outputCol='label')

# Create an evaluator for binary classification
# Only needs to be created once, can be reused for all evaluation
evaluator = BinaryClassificationEvaluator()

# Train a decision tree with default parameters (including maxDepth=5)
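# Hedged sketch (the original code for this step is not shown here): a pipeline with the
# label indexer and a DecisionTreeClassifier, evaluated on the dev split with the
# BinaryClassificationEvaluator defined above. Column names follow the snippet above.
from pyspark.ml import Pipeline
from pyspark.ml.classification import DecisionTreeClassifier

dt = DecisionTreeClassifier(featuresCol='TFIDF', labelCol='label', maxDepth=5)
dt_pipeline = Pipeline(stages=[label_indexer, dt])
dt_model = dt_pipeline.fit(train_tfidf)
dev_predictions = dt_model.transform(dev_tfidf)
print('Decision tree AUC:', evaluator.evaluate(dev_predictions))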
udf_cleansing_and_tokenizing = functions.udf(cleansing_and_tokenizing)
test_data = test_data.withColumn("tweet_cleansed", udf_cleansing_and_tokenizing(functions.col("tweet")))

from pyspark.ml.feature import Tokenizer

tokenizer = Tokenizer(inputCol="tweet_cleansed", outputCol="words")
test_data = tokenizer.transform(test_data)

from pyspark.ml.feature import HashingTF
hashingTF = HashingTF(inputCol="words", outputCol="term_freq")
test_data = hashingTF.transform(test_data)
test_data.show(5)

from pyspark.ml.feature import IDF 
idf = IDF(inputCol="term_freq", outputCol="tfidf")
idfModel = idf.fit(test_data)
test_data = idfModel.transform(test_data)
test_data.show(5)

from pyspark.ml.feature import StringIndexer
stringIndexer = StringIndexer(inputCol="label", outputCol="labelIndex")
model = stringIndexer.fit(test_data)
test_data = model.transform(test_data)
test_data.show(5)

predicted = test_data.select("tfidf", "labelIndex")
predicted.show(5)

model_folder = os.path.join(os.getcwd(), 'saved_models')
model_full_path = os.path.join(model_folder, "twitter_sentiment_spark")
if not os.path.exists(model_folder):
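    # Likely completion of the truncated line above (an assumption): create the folder so
    # the fitted artifacts can later be persisted under model_full_path.
    os.makedirs(model_folder)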
def main(sc, sqlContext):

    #start = timer()

    #print '---Fetching user, posts, tokens and categories from MongoDB---'
    #start_i = timer()
    user = findUserById(iduser)
    posts = findPosts(user) 
    
    tokens, category, categoryAndSubcategory = getTokensAndCategories()
    postsRDD = (sc.parallelize(posts).map(lambda s: (s[0], word_tokenize(s[1].lower()), s[2], s[3]))
                    .map(lambda p: (p[0], [x for x in p[1] if x in tokens] ,p[2], p[3]))
                    .cache())

    

    #print '####took %d seconds' % (timer() - start_i)

    #print '---Fetching products from MongoDB---'
    #start_i = timer()

    #print '####took %d seconds' % (timer() - start_i)

    #print '---Building corpusRDD---'
    #start_i = timer()
    stpwrds = stopwords.words('portuguese')
    corpusRDD = (postsRDD.map(lambda s: (s[0], [PorterStemmer().stem(x) for x in s[1] if x not in stpwrds], s[2], s[3]))
                         .filter(lambda x: len(x[1]) >= 20 or (x[2] == u'Post' and len(x[1])>0))
                         .cache())
    #print '####took %d seconds' % (timer() - start_i)

    #print '---Computing TF-IDF---'
    #start_i = timer()
    wordsData = corpusRDD.map(lambda s: Row(label=int(s[0]), words=s[1], type=s[2]))
    wordsDataDF = sqlContext.createDataFrame(wordsData).unionAll(sqlContext.read.parquet("/home/ubuntu/recsys-tcc-ml/parquet/wordsDataDF.parquet"))


    numTokens = len(tokens)
    hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=numTokens)
    idf = IDF(inputCol="rawFeatures", outputCol="features")

    featurizedData = hashingTF.transform(wordsDataDF)

    idfModel = idf.fit(featurizedData)
    tfIDF = idfModel.transform(featurizedData).cache()

    postTFIDF = (tfIDF
                    .filter(tfIDF.type==u'Post')
                    #.map(lambda s: Row(label=s[0], type=s[1], words=s[2], rawFeatures=s[3], features=s[4], sentiment=SVM.predict(s[4])))
                    .cache())

    #postTFIDF = postTFIDF.filter(lambda p: p.sentiment == 1)
    #print '####took %d seconds' % (timer() - start_i)

    #print '---Loading the saved models---'
    #start_i = timer()
    NB = NaiveBayesModel.load(sc, '/home/ubuntu/recsys-tcc-ml/models/naivebayes/modelo_categoria')
    SVM = SVMModel.load(sc, "/home/ubuntu/recsys-tcc-ml/models/svm")
    #print '####took %d seconds' % (timer() - start_i)

    #print '---Applying the models---'
    #start_i = timer()
    predictions = (postTFIDF
                        .map(lambda p: (NB.predict(p.features), p[0], SVM.predict(p.features)))
                        .filter(lambda p: p[2]==1)
                        .map(lambda p: (p[0], p[1]))
                        .groupByKey()
                        .mapValues(list)
                        .collect())

    #print '####took %d seconds' % (timer() - start_i)
    #print '---Computing similarities---'
    #start_i = timer()
    suggestions = []

    for prediction in predictions:
        category_to_use = category[int(prediction[0])]
        #print ' Computing similarities for category: {}'.format(category_to_use)
        tf = tfIDF.filter(tfIDF.type==category_to_use).cache()
        for post in prediction[1]:
            postVector = postTFIDF.filter(postTFIDF.label == post).map(lambda x: x.features).collect()[0]
            sim = (tf
                    .map(lambda x: (post, x.label, cossine(x.features, postVector)))
                    .filter(lambda x: x[2]>=threshold)
                    .collect())
            if len(sim) > 0:
                suggestions.append(sim)

    #print '####took %d seconds' % (timer() - start_i)

    if len(suggestions) > 0:
        #print '---Inserting recommendations into MongoDB---'
        #start_i = timer()
        insertSuggestions(suggestions, iduser, posts)
dfNorm = normalizerUni.transform(dfVect2)
dfNorm2 = normalizerBi.transform(dfNorm)
print "DataFrame(bi-gram): normalisé"
dfNorm2.select('words','normWords').show()
# La différence n'apparait pas dans la table puisqu'on n'a la place de visualiser que les indices des élements 
# non nuls et pas leur valeur
# On passe au TFIDF
# Evidemment en choisissant la bonne dataframe parmi celle du dessus, on peut appliquer ces calculs
# à n'importz quelle colonne (bigrammes, avec stop words ou sans...)
from pyspark.ml.feature import HashingTF
htf = HashingTF(inputCol='words',outputCol='wordsTF',numFeatures=10000)
dfTrainTF = htf.transform(dfTrainTokNoSw)
# INverse doc frequency
from pyspark.ml.feature import IDF
idf = IDF(inputCol=htf.getOutputCol(),outputCol="wordsTFIDF")
idfModel = idf.fit(dfTrainTF)
dfTrainTFIDF = idfModel.transform(dfTrainTF)
dfTrainTFIDF.select('review','wordsTF','wordsTFIDF').show()

# I know this step was useful to me once; here it does not seem to add much
from pyspark.ml.feature import StringIndexer
string_indexer = StringIndexer(inputCol='label', outputCol='target_indexed')
string_indexer_model = string_indexer.fit(dfTrainTFIDF)
dfTrainFinal = string_indexer_model.transform(dfTrainTFIDF)
dfTrainFinal.select('review','label','target_indexed').show()



#**********************************************************************
#-----------Training the model for prediction--------------------------
#**********************************************************************