Example #1
def tfidf_vectors(input_col: str, output_col: str, df: DataFrame, num_features=262144):
    """Calculate the tfidf vectors for the given input tokens"""
    # the inverse document frequency can be calculated using
    # only the term frequency; essentially it is a column-wise
    # operation over every term in the corpus
    idf = IDF(minDocFreq=0, inputCol=input_col, outputCol=output_col).fit(df)
    return idf.transform(df)
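
# A minimal usage sketch for tfidf_vectors (not part of the original snippet).
# It assumes the input column already holds term-frequency vectors, e.g. from
# HashingTF; the session, sample data, and column names below are illustrative.
from pyspark.sql import SparkSession
from pyspark.ml.feature import HashingTF, IDF  # IDF is also assumed by tfidf_vectors above

spark = SparkSession.builder.getOrCreate()
docs = spark.createDataFrame(
    [(["spark", "tfidf", "example"],), (["spark", "idf", "demo"],)],
    ["tokens"])
tf_df = HashingTF(inputCol="tokens", outputCol="tf").transform(docs)
tfidf_df = tfidf_vectors("tf", "tfidf", tf_df)
tfidf_df.show(truncate=False)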
Example #2
    def get_product_similarity(self):
        """
        Calculate the similarity between items/users
        """
        product_taxonomy = self.data.select(self.productCol,
                                            self.taxonomyCol).distinct()
        product_taxonomy = self.__data_manipulation(product_taxonomy)

        hashingTF = HashingTF(inputCol=self.taxonomyCol, outputCol="tf")
        tf = hashingTF.transform(product_taxonomy)

        idf = IDF(inputCol="tf", outputCol="feature").fit(tf)
        tfidf = idf.transform(tf)

        normalizer = Normalizer(inputCol="feature", outputCol="norm")
        norma_data = normalizer.transform(tfidf)

        col1 = "i." + self.productCol
        col2 = "j." + self.productCol

        dot_udf = udf(lambda x, y: float(x.dot(y)), DoubleType())
        result = norma_data.alias("i").crossJoin(norma_data.alias("j"))\
            .select(
                col(col1).alias("i"),
                col(col2).alias("j"),
                dot_udf("i.norm", "j.norm").alias("dot"))\
            .sort("i", "j")

        result = result.filter((result.i < result.j) & (result.dot > 0.5))

        return result
Example #3
    def __data_manipulation(self, col):
        """Group the taxonomy values per entry of the given column and build normalized TF-IDF vectors over them."""

        data = self.data.select(col, self.taxonomyCol).distinct()
        data = data.withColumn(self.taxonomyCol,
                               data[self.taxonomyCol].cast(StringType()))

        concat_list = udf(lambda lst: ", ".join(lst), StringType())
        data = data.groupby(col).agg(
            collect_list(self.taxonomyCol).alias(self.taxonomyCol))

        data = data.withColumn(self.taxonomyCol, concat_list(self.taxonomyCol))
        data = data.withColumn(
            self.taxonomyCol,
            split(regexp_replace(self.taxonomyCol, " ", ""), ','))

        hashingTF = HashingTF(inputCol=self.taxonomyCol, outputCol="tf")
        tf = hashingTF.transform(data)

        idf = IDF(inputCol="tf", outputCol="feature").fit(tf)
        tfidf = idf.transform(tf)

        normalizer = Normalizer(inputCol="feature", outputCol="norm")
        norma_data = normalizer.transform(tfidf)

        return norma_data
Example #4
def tf_idf(data_rdd):
    """
    Calculate term frequency-inverse document frequency (TF-IDF) to reflect the importance of words in a Tweet.
    :param data_rdd: input data rdd
    :return: transformed dataframe
    """
    data_rdd_df = data_rdd.toDF()
    hashing_tf = HashingTF(inputCol="words", outputCol="tf_features")
    tf_data = hashing_tf.transform(data_rdd_df)

    idf_data = IDF(inputCol="tf_features", outputCol="features").fit(tf_data)
    tf_idf_data = idf_data.transform(tf_data)
    return tf_idf_data.select(["label", "words", "features"])
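
# A hedged usage sketch (not from the original code): tf_idf expects an RDD of
# Rows with "label" and "words" fields, as implied by the final select. The
# sample data below is illustrative only; HashingTF and IDF are also assumed
# by the function above, so they are imported here.
from pyspark.sql import Row, SparkSession
from pyspark.ml.feature import HashingTF, IDF

spark = SparkSession.builder.getOrCreate()
sample_rdd = spark.sparkContext.parallelize([
    Row(label=1.0, words=["great", "tweet"]),
    Row(label=0.0, words=["terrible", "tweet"]),
])
tf_idf(sample_rdd).show(truncate=False)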
Example #5
#Performing Tokenization for the data
tokenizer = Tokenizer(inputCol="document", outputCol="words")
wordsData = tokenizer.transform(documentData)
wordsData.show()

"""# **2.a) Performing the task without NLP**"""

# applying tf on the words data
hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=200)
tf = hashingTF.transform(wordsData)
# alternatively, CountVectorizer can also be used to get term frequency vectors
# calculating the IDF
tf.cache()
idf = IDF(inputCol="rawFeatures", outputCol="features")
idf = idf.fit(tf)
tfidf = idf.transform(tf)
#displaying the results
tfidf.select("label", "features").show()
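
# As the comment above notes, CountVectorizer can replace HashingTF for the
# term-frequency step. A hedged sketch of that alternative; the cv* names are
# illustrative and not part of the original script.
from pyspark.ml.feature import CountVectorizer
cv_model = CountVectorizer(inputCol="words", outputCol="cvRawFeatures",
                           vocabSize=200).fit(wordsData)
cv_tf = cv_model.transform(wordsData)
cv_idf = IDF(inputCol="cvRawFeatures", outputCol="cvFeatures").fit(cv_tf)
cv_tfidf = cv_idf.transform(cv_tf)
cv_tfidf.select("label", "cvFeatures").show()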


print("TF-IDF without NLP:")
for each in tfidf.collect():
    print(each)
    print(each['rawFeatures'])
spark.stop()

"""# **2.b) Performing the task with Lemmatization**"""

import nltk
nltk.download('punkt')
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
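
# A hedged sketch of how the lemmatizer could feed back into the same TF-IDF
# flow. This is not from the original script: it assumes the SparkSession is
# still active (the spark.stop() above would have to move to the very end) and
# that the NLTK data is available where the UDF runs (e.g. local mode).
from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, StringType

lemmatize_udf = udf(lambda tokens: [lemmatizer.lemmatize(t) for t in tokens],
                    ArrayType(StringType()))
lemmaData = wordsData.withColumn("lemmas", lemmatize_udf("words"))
hashingTF_lemma = HashingTF(inputCol="lemmas", outputCol="rawFeatures", numFeatures=200)
tf_lemma = hashingTF_lemma.transform(lemmaData)
tfidf_lemma = IDF(inputCol="rawFeatures", outputCol="features").fit(tf_lemma).transform(tf_lemma)
tfidf_lemma.select("label", "features").show()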
Example #6
text_train = sc.textFile(input_file_train)

pure_text_train = text_train.filter(deleteFirstRow)
genre_and_sentences_after_flatmap = pure_text_train.flatMap(extractGenreAndSentencesForFlatmap)
genre_and_sentences_after_flatmap.persist()

# TFIDF
tfidf_dataFrame = genre_and_sentences_after_flatmap.toDF(["genre","sentence"])
tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
tfidf_words_data = tokenizer.transform(tfidf_dataFrame)

hashing_tf = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=512)
tfidf_featurized_data = hashing_tf.transform(tfidf_words_data)

idf_model = IDF(inputCol="rawFeatures", outputCol="features").fit(tfidf_featurized_data)
tfidf_rescaled_data = idf_model.transform(tfidf_featurized_data)
tfidf_genre_features = tfidf_rescaled_data.select("genre", "features")

# Confusion matrix for TFIDF
tfidf_kmeansmodel = KMeans().setK(5).setFeaturesCol('features').setPredictionCol('prediction').fit(tfidf_genre_features)
tfidf_predictions = tfidf_kmeansmodel.transform(tfidf_genre_features).select("prediction", "genre")
tfidf_res = tfidf_predictions.groupBy(['prediction', 'genre']).count().collect()
print("Confusion matrix for TFIDF:")
toPrint(tfidf_res)
print()

#######################################################################
## Vocabulary Exploration - Part B                                   ##
#######################################################################

# pretrained
Example #7
    input_rdd = sc.textFile(train_path).map(split_train)
    train_hive_info = hiveCtx.createDataFrame(input_rdd, ['label', 'text'])
    split = Tokenizer(inputCol="text", outputCol="words")
    wordsData = split.transform(train_hive_info)
    my_print('Tokenization finished.......')

    # Add the TF feature column
    hashingTF = HashingTF(inputCol="words",
                          outputCol="rawFeatures",
                          numFeatures=2**10)
    TF_data = hashingTF.transform(wordsData)
    my_print('TF feature construction finished.......')

    # Add the IDF feature column
    idf = IDF(inputCol="rawFeatures", outputCol="features").fit(TF_data)
    final_input_data = idf.transform(TF_data)
    my_print('IDF feature construction finished.......')

    train_rdd = final_input_data.select("label", "features") \
        .rdd.map(lambda row: LabeledPoint(row.label, row.features.toArray()))

    if model_name == 'LogisticRegression':
        model = LogisticRegressionWithLBFGS.train(train_rdd, numClasses=10)
        model.save(sc, model_path)

    elif model_name == 'NaiveBayes':
        model = NaiveBayes.train(train_rdd)
        model.save(sc, model_path)

    else:
        model = RandomForest.trainClassifier(train_rdd, 10, {}, 10, seed=42)
Example #8
class BM25Model(object):
    """
    Computes BM25 score.
    """
    def __init__(self, k=1.2, b=.75):
        self.k = k
        self.b = b
        self.tok = Tokenizer(inputCol='__input', outputCol='__tokens')
        self.vec = CountVectorizer(inputCol='__tokens', outputCol='__counts')
        self.idf = IDF(inputCol='__counts', outputCol='__idf')
        self.train_col = None
        self.udf = None
        self.is_fit = False

    def fit(self, df, train_col):
        """
        Fits the model on the input df.
            df: a pyspark dataframe.
            train_col (string): The name of the column containing the training documents.

        Returns: self, the fitted BM25Model.
        """
        self.train_col = train_col
        df_ = self.tok.transform(df.withColumnRenamed(train_col, '__input'))
        mean_dl = df_.select(F.mean(F.size(F.col('__tokens')))).collect()[0][0]
        self.vec = self.vec.fit(df_)
        df_ = self.vec.transform(df_)
        self.idf = self.idf.fit(df_)
        # This sets self.udf to a working BM25 UDF built from udf_template (defined elsewhere).
        exec(udf_template.format(mean_dl, self.k, self.b))
        self.is_fit = True
        return self

    def transform(self,
                  df,
                  score_col,
                  bm25_output_name='bm25',
                  tf_output_name=None,
                  ntf_output_name=None,
                  tfidf_output_name=None):
        """
        Computes the BM25 score, along with raw term frequency (tf),
            normalized term frequency (ntf), and tfidf.
            These three additional scores come "for free" with bm25
            but are only returned when their output names are provided.
        """
        if not self.is_fit:
            raise Exception(
                "You must fit the BM25 model with a call to .fit() first.")
        columns = df.columns
        df_ = self.tok.transform(df.withColumnRenamed(score_col, '__input'))
        df_ = self.vec.transform(df_)
        df_ = self.idf.transform(df_)
        df_ = (df_.withColumnRenamed(
            '__counts', '__query_counts').withColumnRenamed(
                '__input',
                score_col)).select(columns +
                                   [score_col, '__query_counts', '__idf'])
        df_ = self.tok.transform(
            df_.withColumnRenamed(self.train_col, '__input'))
        df_ = self.vec.transform(df_)
        df_ = df_.withColumnRenamed('__counts', '__item_counts')
        df_ = df_.withColumn(
            'bm25',
            self.udf(F.col('__query_counts'), F.col('__item_counts'),
                     F.col('__idf')))
        df_ = df_.withColumnRenamed('__input', self.train_col)
        computed_values = df_.withColumn(
            'more',
            F.explode(F.array(F.col('bm25')))).select(columns + ['bm25.*'])

        #this is logic for naming output column(s)
        final_selection = columns
        if bm25_output_name is not None:
            computed_values = computed_values.withColumnRenamed(
                'bm25', bm25_output_name)
            final_selection.append(bm25_output_name)
        if tf_output_name is not None:
            computed_values = computed_values.withColumnRenamed(
                'tf', tf_output_name)
            final_selection.append(tf_output_name)
        if ntf_output_name is not None:
            computed_values = computed_values.withColumnRenamed(
                'ntf', ntf_output_name)
            final_selection.append(ntf_output_name)
        if tfidf_output_name is not None:
            computed_values = computed_values.withColumnRenamed(
                'tfidf', tfidf_output_name)
            final_selection.append(tfidf_output_name)

        return computed_values.select(final_selection)
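
# A hedged usage sketch for BM25Model (not from the original code). The column
# names and sample data are illustrative, and it assumes the module-level
# udf_template referenced inside fit() is defined, since it is not shown here.
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
docs = spark.createDataFrame(
    [("spark computes bm25 scores", "bm25 ranking"),
     ("term frequency and idf", "idf weighting")],
    ["doc_text", "query_text"])
bm25 = BM25Model(k=1.2, b=0.75).fit(docs, train_col="doc_text")
scored = bm25.transform(docs, score_col="query_text", bm25_output_name="bm25_score")
scored.show()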
    
Example #9
## Creating the SparkContext, SQLContext, and input DataFrame
sc = SparkContext("local", "app")
sqc = SQLContext(sc)
df = sqc.createDataFrame(data, ['type', 'text'])

#NEW VARIABLE GENERATION
dataCleaned = df.rdd.map(lambda x: (1 if x['type'] == 'spam' else 0, tokenize(x['text'])))
dataClean = dataCleaned.map(lambda x: (float(x[0]), x[1]))
dfClean = sqc.createDataFrame(dataClean, ['label', 'words'])
dfClean.show(5)

hashingTF = HashingTF(inputCol="words", outputCol="rawtf-idf", numFeatures=1000)
tf = hashingTF.transform(dfClean)
idf = IDF(inputCol="rawtf-idf", outputCol="features").fit(tf)
dfFinal = idf.transform(tf)

# Fit on whole dataset to include all labels in index.
labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(dfFinal)
# Automatically identify categorical features, and index them.
# Set maxCategories so features with > 4 distinct values are treated as continuous.
featureIndexer = VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4).fit(dfFinal)

# Split the data into training and test sets (20% held out for testing)
(trainingData, testData) = dfFinal.randomSplit([0.8, 0.2])


# Train the model.
#rf = RandomForestClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures")
nb = NaiveBayes(smoothing = 1.0, labelCol="indexedLabel", featuresCol="indexedFeatures")
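
# A hedged continuation sketch (not from the original script): chain the fitted
# indexers and the Naive Bayes estimator into a Pipeline, fit on the training
# split, and report accuracy on the held-out data.
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

pipeline = Pipeline(stages=[labelIndexer, featureIndexer, nb])
model = pipeline.fit(trainingData)
predictions = model.transform(testData)
evaluator = MulticlassClassificationEvaluator(labelCol="indexedLabel",
                                              predictionCol="prediction",
                                              metricName="accuracy")
print("Test accuracy =", evaluator.evaluate(predictions))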
Example #10
#df0.show()
print('The number of jobs:',df0.count())
print('\nThe distinct job names: ', df1.job.unique())
print('\nThere are', len(df1.job.unique())-1, 'different kinds of jobs in the table.')

# split the desc field
tokenizer = Tokenizer(inputCol='desc_clean', outputCol='desc_words')
df = tokenizer.transform(df0)
#df.show()
#df.select('desc_words').show(10)

# compute TF-IDF
hashingTF = HashingTF(inputCol='desc_words', outputCol='desc_words_tf')
tf = hashingTF.transform(df).cache()
idf = IDF(inputCol='desc_words_tf', outputCol='desc_words_tfidf').fit(tf)
tfidf = idf.transform(tf).cache()
#print('tfidf for each job:', tfidf.select('desc_words_tfidf').show(10,truncate=False))

# data normalization
from pyspark.ml.feature import Normalizer
normalizer = Normalizer(inputCol="desc_words_tfidf", outputCol="norm")
tfidf = normalizer.transform(tfidf)
#tfidf.select("id", "norm").show(6)

# compute similarity between jobs and resume
import pyspark.sql.functions as psf 
from pyspark.sql.types import DoubleType
print('\nCompute the similarity between jobs and resume...')
dot_udf = psf.udf(lambda x,y: float(x.dot(y)), DoubleType()) # define dot-product function
tfidf = tfidf.alias("a1").join(tfidf.alias("a2"), psf.col("a1.id") == 0)\
        .select(
Example #11
tokenizer = Tokenizer(inputCol="text", outputCol="words")
wordsData = tokenizer.transform(sentenceData)
wordsData.show(5)

hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures")
featurizedData = hashingTF.transform(wordsData)
featurizedData.show(10)
featurizedData.printSchema()

featurizedData.cache()
#idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = IDF(inputCol="rawFeatures",
               outputCol="features").fit(featurizedData)
#idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)

dataset = rescaledData.select("features")
from pyspark.ml.clustering import KMeans
# Trains a k-means model.
kmeans = KMeans().setK(10).setSeed(1)
model = kmeans.fit(dataset)

# Evaluate clustering by computing Within Set Sum of Squared Errors.
wssse = model.computeCost(dataset)
print("Within Set Sum of Squared Errors = " + str(wssse))

# Shows the result.
centers = model.clusterCenters()
print("Cluster Centers: ")
for center in centers:
    print(center)
Example #12
df = spark.read.format("csv").option("inferschema", "true").option(
    "header", "true").option("delimiter", "\t").load("trainReviews.tsv")

tokenizer = Tokenizer(inputCol="text", outputCol="words")
wordsData = tokenizer.transform(df)
wordsData.show(5)

hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures")
tf = hashingTF.transform(wordsData)
tf.show(10)

tf.head().rawFeatures

idf = IDF(inputCol="rawFeatures", outputCol="features", minDocFreq=2).fit(tf)
tfidf = idf.transform(tf)

ml = LogisticRegression(featuresCol="features",
                        labelCol='category',
                        regParam=0.01)
mlModel = ml.fit(tfidf.limit(5000))
res_train = mlModel.transform(tfidf)
extract_prob = F.udf(lambda x: float(x[1]), T.FloatType())
res_train.withColumn("proba", extract_prob("probability")).select(
    "id", "proba", "prediction").show()

test_df = spark.read.format("csv").option("inferschema", "true").option(
    "header", "true").option("delimiter", "\t").load("testReviews.tsv")

tokenizer = Tokenizer(inputCol="text", outputCol="words")
wordsData = tokenizer.transform(test_df)
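
# A hedged continuation sketch (not part of the original snippet): apply the
# training-time HashingTF, fitted IDF model, and fitted classifier to the test
# reviews, mirroring the flow above. It assumes the test file has the same
# "id" and "text" columns as the training data.
test_tf = hashingTF.transform(wordsData)
test_tfidf = idf.transform(test_tf)
res_test = mlModel.transform(test_tfidf)
res_test.withColumn("proba", extract_prob("probability")).select(
    "id", "proba", "prediction").show()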