示例#1
0
    selected_tags_df = tags_df.filter(tags_df.Tag.isin(
        tags_set.value)).na.drop(how='any')
    tags_questions_df = selected_tags_df.join(posts_df, "Id")
    training_df = tags_questions_df.select(['Tag', 'Body',
                                            'Id']).na.drop(how='any')
    logger.debug("successfully get training_df")

    # Tokenize post texts, then compute term frequency (TF) and inverse document frequency (IDF)
    logger.debug("Start to generate TFIDF features")
    tokenizer = Tokenizer(inputCol="Body", outputCol="Words")
    tokenized_words = tokenizer.transform(training_df.na.drop(how='any'))
    tokenizer.save(tokenizer_file)
    hashing_TF = HashingTF(inputCol="Words",
                           outputCol="Features",
                           numFeatures=20000)  #, numFeatures=200
    hashing_TF.save(hashing_tf_file)
    TFfeatures = hashing_TF.transform(tokenized_words.na.drop(how='any'))

    idf = IDF(inputCol="Features", outputCol="IDF_features")
    idfModel = idf.fit(TFfeatures.na.drop())
    idfModel.save(idf_model_file)
    TFIDFfeatures = idfModel.transform(TFfeatures.na.drop(how='any'))
    logger.debug("Get TFIDF features successfully")

    # for feature in TFIDFfeatures.select("IDF_features", "Tag").take(3):
    #     logger.info(feature)

    # register shutdown_hook
    atexit.register(shutdown_hook, spark_session=spark)

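    # Sample row with the IDF_features and Tag columns. NOTE(review): the vector size here is 200,
    # so this was presumably captured with numFeatures=200 rather than the current 20000 -- confirm.
    # Row(IDF_features=SparseVector(200, {7: 2.3773, 9: 2.1588, 32: 2.0067, 37: 1.7143, 49: 2.6727, 59: 2.9361, 114: 1.0654, 145: 2.9522, 167: 2.3751}), Tag=u'asp.net')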