Example #1
    def defineNB_model():
        VOCAB_SIZE = 20000
        MINDF = 3
        TRAINING_ITERS = 150

        tokenizer = pipeline_utils.TweetTokenizer(inputCol='text',
                                                  outputCol='words')
        stopword_remover = ml.feature.StopWordsRemover(
            inputCol=tokenizer.getOutputCol(),
            outputCol='filtered',
            stopWords=list(all_stopwords))
        stemmer = pipeline_utils.Stemmer(
            inputCol=stopword_remover.getOutputCol(),
            outputCol='cleaned_words')
        counter = ml.feature.CountVectorizer(inputCol=stemmer.getOutputCol(),
                                             outputCol='features',
                                             vocabSize=VOCAB_SIZE,
                                             minDF=MINDF)

        classifier = ml.classification.NaiveBayes(smoothing=1.0,
                                                  modelType='multinomial')
        pipeline = ml.Pipeline(
            stages=[tokenizer, stopword_remover, stemmer, counter, classifier])

        return pipeline, classifier
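
A sketch of how the returned pipeline might be used; train_df and test_df (DataFrames with 'text' and numeric 'label' columns) are assumed names, not from the source:

    from pyspark.ml.evaluation import MulticlassClassificationEvaluator

    pipeline, classifier = defineNB_model()
    model = pipeline.fit(train_df)           # train_df is assumed
    predictions = model.transform(test_df)   # test_df is assumed
    evaluator = MulticlassClassificationEvaluator(labelCol='label',
                                                  predictionCol='prediction',
                                                  metricName='accuracy')
    print('accuracy:', evaluator.evaluate(predictions))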
Example #2
def tfidf_review_text(df):
    with Timer("TF-IDF for reviewText"):
        df = df.select(["reviewText"]).dropna()

    with Timer("TF-IDF pipeline"):
        tokenizer = ml.feature.Tokenizer(inputCol="reviewText",
                                         outputCol="token")
        cv = ml.feature.CountVectorizer(inputCol="token", outputCol="counts")
        idf = ml.feature.IDF(inputCol="counts", outputCol="tfidf")
        pipeline = ml.Pipeline(stages=[tokenizer, cv, idf])
        model = pipeline.fit(df)
        df = model.transform(df)

    # Recover the vocabulary from the fitted CountVectorizerModel stage so
    # that feature indices can be mapped back to words.
    vectorizers = [s for s in model.stages
                   if isinstance(s, CountVectorizerModel)]
    vocab = vectorizers[0].vocabulary

    idx2word = {idx: word for idx, word in enumerate(vocab)}

    with Timer("Convert TF-IDF sparseVector to (word:value dict)"):
        my_udf_func = udf(lambda vector: sparse2dict(vector, idx2word),
                          types.StringType())
        df = df.select("reviewText", my_udf_func("tfidf").alias("tfidf"))
    return df
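
sparse2dict is called above but not defined in this example. A minimal sketch consistent with its call site (a TF-IDF SparseVector and an index-to-word map in, a string out; the rounding is an illustrative choice):

def sparse2dict(vector, idx2word):
    # Map each active index of the SparseVector back to its word.
    return str({idx2word[int(i)]: round(float(v), 4)
                for i, v in zip(vector.indices, vector.values)})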
Example #3
    def define_combo_model():
        VOCAB_SIZE = 10000
        MINDF = 3
        TRAINING_ITERS = 150

        tokenizer = pipeline_utils.TweetTokenizer(inputCol='text',
                                                  outputCol='words')
        stemmer = pipeline_utils.Stemmer(inputCol=tokenizer.getOutputCol(),
                                         outputCol='cleaned_words')

        stopword_remover = ml.feature.StopWordsRemover(
            inputCol=stemmer.getOutputCol(),
            outputCol='stopwords_removed',
            stopWords=list(all_stopwords))
        unigram_tfidf = TFIDF_pipeline('unigram',
                                       stopword_remover.getOutputCol(), 15000)

        ngrammer = ml.feature.NGram(n=3,
                                    inputCol=stemmer.getOutputCol(),
                                    outputCol='trigrams')
        ngram_tfidf = TFIDF_pipeline('trigram', ngrammer.getOutputCol(), 5000)

        assembler = ml.feature.VectorAssembler(
            inputCols=['unigram_features', 'trigram_features'],
            outputCol='features')

        regresser = ml.classification.LogisticRegression(
            maxIter=TRAINING_ITERS, featuresCol='features', labelCol='label')
        pipeline = ml.Pipeline(stages=[
            tokenizer, stemmer, stopword_remover, unigram_tfidf, ngrammer,
            ngram_tfidf, assembler, regresser
        ])

        return pipeline, regresser
Example #4
    def define_bigram_model():
        VOCAB_SIZE = 20000
        MINDF = 3
        TRAINING_ITERS = 150

        tokenizer = pipeline_utils.TweetTokenizer(inputCol='text',
                                                  outputCol='words')
        # stopword_remover = ml.feature.StopWordsRemover(
        #     inputCol=tokenizer.getOutputCol(), outputCol='filtered',
        #     stopWords=list(all_stopwords))
        stemmer = pipeline_utils.Stemmer(inputCol=tokenizer.getOutputCol(),
                                         outputCol='cleaned_words')
        ngrammer = ml.feature.NGram(n=2,  # bigrams, matching the model name
                                    inputCol=stemmer.getOutputCol(),
                                    outputCol='ngrams')
        counter = ml.feature.CountVectorizer(inputCol=ngrammer.getOutputCol(),
                                             outputCol='counts',
                                             vocabSize=VOCAB_SIZE,
                                             minDF=MINDF)
        normalizer = ml.feature.Normalizer(p=1.0,
                                           inputCol=counter.getOutputCol(),
                                           outputCol='tf_normalized')
        df_normalize = ml.feature.IDF(inputCol=normalizer.getOutputCol(),
                                      outputCol='features')
        regresser = ml.classification.LogisticRegression(
            maxIter=TRAINING_ITERS, featuresCol='features', labelCol='label')
        pipeline = ml.Pipeline(stages=[
            tokenizer, stemmer, ngrammer, counter, normalizer, df_normalize,
            regresser
        ])

        return pipeline, regresser
Example #5
def TFIDF_pipeline(prefix, inputCol, vocab_size, min_df=3):
    # Reusable TF-IDF sub-pipeline: raw counts -> L1 term-frequency
    # normalization -> IDF weighting.
    counter = ml.feature.CountVectorizer(inputCol=inputCol,
                                         outputCol=prefix + '_counts',
                                         vocabSize=vocab_size,
                                         minDF=min_df)
    normalizer = ml.feature.Normalizer(p=1.0,
                                       inputCol=counter.getOutputCol(),
                                       outputCol=prefix + '_tf_normalized')
    df_normalize = ml.feature.IDF(inputCol=normalizer.getOutputCol(),
                                  outputCol=prefix + '_features')
    return ml.Pipeline(stages=[counter, normalizer, df_normalize])
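
Because TFIDF_pipeline returns a plain ml.Pipeline, it can be nested as a single stage of a larger pipeline (as Examples #3 and #7 do) or fitted on its own. A standalone sketch, where tokens_df (a DataFrame with a tokenized 'words' column) is an assumed input:

tfidf = TFIDF_pipeline('unigram', 'words', vocab_size=10000)
features_df = tfidf.fit(tokens_df).transform(tokens_df)
features_df.select('unigram_features').show(5, truncate=False)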
Example #6
def run(sc, args):
    sc.setLogLevel('FATAL')
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument('year',
                            help='Year of prediction, in format YYYY.',
                            type=int)
    arg_parser.add_argument('month',
                            help='Month of prediction, in format MM.',
                            type=int)
    arg_parser.add_argument('day',
                            help='Day of prediction, in format DD.',
                            type=int)
    args = arg_parser.parse_args(args)

    ss = sql.SparkSession(sc)

    latlongrid = grid.LatLonGrid(
        lat_min=40.488320,
        lat_max=40.957189,
        lon_min=-74.290739,
        lon_max=-73.635679,
        # ~1000 m squares; a degree of longitude shrinks with latitude, so the
        # longitude delta is evaluated at the grid's midpoint latitude.
        lat_step=grid.get_lat_delta(1000),
        lon_step=grid.get_lon_delta(1000, (40.488320 + 40.957189) / 2.0))

    tweets_df = import_twitter_data(ss, 'tweets2.csv')

    prediction_date = datetime.date(args.year, args.month, args.day)
    NUM_DAYS_IN_HISTORY = 31
    history_cutoff = prediction_date - datetime.timedelta(
        days=NUM_DAYS_IN_HISTORY)
    filtered_tweets_df = filter_by_dates(ss, tweets_df, history_cutoff,
                                         prediction_date)

    tokens_df = group_by_grid_square_and_tokenize(ss, latlongrid,
                                                  filtered_tweets_df)

    hashing_tf = feature.HashingTF(numFeatures=(2 ** 18) - 1,  # ** (power), not ^ (XOR)
                                   inputCol='tokens',
                                   outputCol='token_frequencies')
    lda = (clustering.LDA()
           .setFeaturesCol('token_frequencies')
           .setK(10)
           .setTopicDistributionCol('topic_distribution'))
    topic_distribution_pipeline = ml.Pipeline(stages=[hashing_tf, lda])
    lda_model = topic_distribution_pipeline.fit(tokens_df)
    topic_distributions = (lda_model.transform(tokens_df)
                           .select('grid_square', 'topic_distribution'))

    complaints_df = load_filter_format_valid_complaints(
        ss, 'crime_complaints_with_header.csv')

    complaints_df.show()
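
Still inside run(), the fitted topics could be inspected by pulling the LDAModel out of the PipelineModel; a sketch (HashingTF is one-way, so this shows term indices and weights rather than words):

    fitted_lda = lda_model.stages[-1]   # the fitted LDAModel
    fitted_lda.describeTopics(maxTermsPerTopic=5).show(truncate=False)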
Example #7
    def LSA_XGBoost_model():

        VOCAB_SIZE = 20000
        MINDF = 3
        TRAINING_ITERS = 150

        tokenizer = pipeline_utils.TweetTokenizer(inputCol='text',
                                                  outputCol='words')
        stemmer = pipeline_utils.Stemmer(inputCol=tokenizer.getOutputCol(),
                                         outputCol='cleaned_words')

        stopword_remover = ml.feature.StopWordsRemover(
            inputCol=stemmer.getOutputCol(),
            outputCol='stopwords_removed',
            stopWords=list(all_stopwords))
        unigram_tfidf = TFIDF_pipeline('unigram',
                                       stopword_remover.getOutputCol(), 10000)

        ngrammer = ml.feature.NGram(n=3,  # trigrams, matching the naming below
                                    inputCol=stemmer.getOutputCol(),
                                    outputCol='trigrams')
        ngram_tfidf = TFIDF_pipeline('trigram', ngrammer.getOutputCol(), 5000)

        assembler = ml.feature.VectorAssembler(
            inputCols=['unigram_features', 'trigram_features'],
            outputCol='features')

        pca = ml.feature.PCA(inputCol=assembler.getOutputCol(),
                             k=250,
                             outputCol='lsa_features')

        scaler = ml.feature.StandardScaler(inputCol=pca.getOutputCol(),
                                           outputCol='scaled_features',
                                           withMean=True)

        # 'features' is already taken by the assembler's output, so the scaler
        # writes to its own column and the classifier reads it explicitly.
        # (Spark's gradient-boosted trees stand in for XGBoost here.)
        classifier = ml.classification.GBTClassifier(
            featuresCol=scaler.getOutputCol(),
            subsamplingRate=0.5,
            featureSubsetStrategy='auto')

        pipeline = ml.Pipeline(stages=[
            tokenizer, stemmer, stopword_remover, unigram_tfidf, ngrammer,
            ngram_tfidf, assembler, pca, scaler, classifier
        ])

        return pipeline, pca
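
Returning the PCA stage alongside the pipeline lets the caller check how much variance the k=250 components capture after fitting; a sketch, with train_df ('text' and 'label' columns) assumed:

    pipeline, pca = LSA_XGBoost_model()
    model = pipeline.fit(train_df)   # train_df is assumed
    pca_model = model.stages[-3]     # the fitted PCAModel
    print('variance captured:', pca_model.explainedVariance.toArray().sum())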
Example #8
    def define_unigram_model():
        VOCAB_SIZE = 20000
        MINDF = 3
        TRAINING_ITERS = 150

        tokenizer = pipeline_utils.TweetTokenizer(inputCol='text',
                                                  outputCol='words')
        stopword_remover = ml.feature.StopWordsRemover(
            inputCol=tokenizer.getOutputCol(),
            outputCol='filtered',
            stopWords=list(all_stopwords))
        stemmer = pipeline_utils.Stemmer(
            inputCol=stopword_remover.getOutputCol(),
            outputCol='cleaned_words')
        counter = ml.feature.CountVectorizer(inputCol=stemmer.getOutputCol(),
                                             outputCol='counts',
                                             vocabSize=VOCAB_SIZE,
                                             minDF=MINDF)
        normalizer = ml.feature.Normalizer(p=1.0,
                                           inputCol=counter.getOutputCol(),
                                           outputCol='tf_normalized')
        df_normalize = ml.feature.IDF(inputCol=normalizer.getOutputCol(),
                                      outputCol='features')
        regresser = ml.classification.LogisticRegression(
            maxIter=TRAINING_ITERS,
            regParam=0.01,
            elasticNetParam=0.5,
            featuresCol='features',
            labelCol='label')
        # regresser = ml.classification.MultilayerPerceptronClassifier(
        #     maxIter=TRAINING_ITERS, layers=[3, 2, 1], blockSize=64, seed=1234)
        pipeline = ml.Pipeline(stages=[
            tokenizer, stopword_remover, stemmer, counter, normalizer,
            df_normalize, regresser
        ])

        return pipeline, regresser
Example #9
def tfidf_review_text(df):
    with Timer("TF-IDF for reviewText"):
        df = df.select(["reviewText"]).dropna()

        with Timer("TF-IDF pipeline"):
            tokenizer = ml.feature.Tokenizer(inputCol="reviewText",
                                             outputCol="token")
            vectorizer = ml.feature.CountVectorizer(inputCol="token",
                                                    outputCol="counts")
            idf = ml.feature.IDF(inputCol="counts", outputCol="tfidf")
            pipeline = ml.Pipeline(stages=[tokenizer, vectorizer, idf])
            model = pipeline.fit(df)  # keep the Pipeline and its fitted model distinct
            df = model.transform(df)

        vocab = model.stages[1].vocabulary  # the fitted CountVectorizerModel
        print("Vectorizer vocab size:", len(vocab))
        idx2word = {idx: word for idx, word in enumerate(vocab)}

        with Timer("Convert TF-IDF sparseVector to str(word:value dict)"):
            my_udf = udf(lambda vec: sparse2dict(vec, idx2word),
                         types.StringType())
            df = df.select("reviewText", my_udf("tfidf").alias("tfidf_final"))
        show_df(df, 10)
        return df
Example #10
df_labeled = df_labeled.na.drop().drop("version_idx")
cols_for_ml = df_prepped01.drop("name").drop("version_idx").schema.names

# pipeline stages
# index the label
labelIndexer = mlf.StringIndexer(inputCol="Label", outputCol="Label_idx")
# vectorise the input
toVec = mlf.VectorAssembler(inputCols=cols_for_ml, outputCol="Features")
# classify
classifier = DecisionTreeClassifier(labelCol="Label_idx",
                                    featuresCol="Features",
                                    maxDepth=10,
                                    maxBins=200)

# create a pipeline of the stages and use it to train and test
pipeline = ml.Pipeline(stages=[labelIndexer, toVec, classifier])
train, test = df_labeled.randomSplit([0.7, 0.3], seed=12345)
model = pipeline.fit(train)
predicted = model.transform(test)
# print results
predicted.select("name", "Label_idx", "prediction", "rawPrediction",
                 "probability").show(30, False)


# function to evaluate predictions with a given metric
def evaluate(method, predicted):
    evaluator_acc = MulticlassClassificationEvaluator(
        labelCol="Label_idx", predictionCol="prediction", metricName=method)
    accuracy = evaluator_acc.evaluate(predicted)
    return accuracy
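
Usage of the helper, with metric names drawn from MulticlassClassificationEvaluator's supported set:

print('f1:', evaluate('f1', predicted))
print('accuracy:', evaluate('accuracy', predicted))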
Example #11
tokens_rdd = (tweets_df.rdd
              .map(row_to_gridsquare_tokens)
              .reduceByKey(operator.concat))

tokens_df_schema = types.StructType([
    types.StructField('grid_square', types.IntegerType()),
    types.StructField('tokens', types.ArrayType(types.StringType()))
])
tokens_df = ss.createDataFrame(tokens_rdd, schema=tokens_df_schema)

hashing_tf = feature.HashingTF(numFeatures=(2 ** 18) - 1,  # ** (power), not ^ (XOR)
                               inputCol='tokens',
                               outputCol='token_frequencies')
lda = (clustering.LDA()
       .setFeaturesCol('token_frequencies')
       .setK(10)
       .setTopicDistributionCol('topic_distribution'))
topic_distribution_pipeline = ml.Pipeline(stages=[hashing_tf, lda])
lda_model = topic_distribution_pipeline.fit(tokens_df)
topic_distributions = (lda_model.transform(tokens_df)
                       .select('grid_square', 'topic_distribution'))

# --------------------------------------------------------------------------------------------------
# PART 2: Get complaint counts per (grid square, date).
# --------------------------------------------------------------------------------------------------

complaints_df_schema = types.StructType([
    types.StructField('CMPLNT_NUM', types.IntegerType(), nullable=False),
    types.StructField('CMPLNT_FR_DT', types.StringType()),
    types.StructField('CMPLNT_FR_TM', types.StringType()),
    types.StructField('CMPLNT_TO_DT', types.StringType()),
    types.StructField('CMPLNT_TO_TM', types.StringType()),
    types.StructField('RPT_DT', types.StringType(), nullable=False),
Example #12
# Use a scaler to standardize the input features
scaler = smf.StandardScaler(withMean=False,
                            inputCol='features',
                            outputCol='scaledFeatures')

# Use PCA to reduce dimensionality of scaled vectors
reducer = smf.PCA(k=10,
                  inputCol=scaler.getOutputCol(),
                  outputCol='selectedFeatures')

# Use a classifier to generate the final predictions
classifier = smc.GBTClassifier(labelCol='label',
                               featuresCol=reducer.getOutputCol(),
                               predictionCol='predictedLabel')

# Combine all steps in a pipeline
pipeline = sm.Pipeline(stages=[scaler, reducer, classifier])

# Create an evaluator which will quantify model performance
# evaluator = sme.BinaryClassificationEvaluator(
#     labelCol='label',
#     rawPredictionCol='predictedLabel',
#     metricName='areaUnderROC'
# )
eval_f1 = sme.MulticlassClassificationEvaluator(labelCol='label',
                                                predictionCol='predictedLabel',
                                                metricName='f1')

# Set up a parameter grid for cross validation
param_grid = smt.ParamGridBuilder().addGrid(
    reducer.k,
    [10, 20, 50, 75]).addGrid(classifier.maxDepth,