def normalize(self):
        from pyspark.ml.feature import Normalizer
        from pyspark.ml.linalg import Vectors, VectorUDT
        from pyspark.sql import functions as F

        df = self.session.createDataFrame([(0, [1.0, 0.5, -1.0]),
                                           (1, [2.0, 1.0, 1.0]),
                                           (2, [4.0, 10.0, 2.0])],
                                          ["id", "features"])

        # Convert the array column into an ML dense Vector (Normalizer requires Vector input)
        @F.udf(returnType=VectorUDT())
        def vectorize_from_array(a):
            return Vectors.dense(a)

        df = df.withColumn("features", vectorize_from_array(F.col("features")))
        # Normalize each Vector using $L^1$ norm.
        normalizer = Normalizer(inputCol="features",
                                outputCol="normFeatures",
                                p=1.0)
        l1NormData = normalizer.transform(df)
        print("Normalized using L^1 norm")
        l1NormData.show()

        # Normalize each Vector using $L^\infty$ norm.
        lInfNormData = normalizer.transform(df, {normalizer.p: float("inf")})
        print("Normalized using L^inf norm")
        lInfNormData.show()
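
# Standalone sanity check (plain Python, no Spark needed): recompute the L^1 normalization
# of the rows used above by hand, e.g. [1.0, 0.5, -1.0] -> [0.4, 0.2, -0.4].
rows = [[1.0, 0.5, -1.0], [2.0, 1.0, 1.0], [4.0, 10.0, 2.0]]
for r in rows:
    l1 = sum(abs(v) for v in r)   # L^1 norm = sum of absolute values
    print([v / l1 for v in r])
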
def get_sim(tfidf, threshold, save_dir):
    normalizer = Normalizer(inputCol='features', outputCol='norm')
    data = normalizer.transform(tfidf)
    dot_udf = udf(lambda x, y: float(x.dot(y)), DoubleType())
    sim_df = (
        data.alias('i').join(data.alias('j'),
                             col('i.id') < col('j.id')).select(
                                 col('i.id').alias('i'),
                                 col('j.id').alias('j'),
                                 dot_udf('i.norm',
                                         'j.norm').alias('similarity'))
        # .sort('i', 'j')
    )
    sim_df_filtered = sim_df.filter(col('similarity').between(threshold, 1.0))

    edges = [(row.i, row.j, row.similarity)
             for row in sim_df_filtered.collect()]
    print('Edges: {}'.format(len(edges)))
    vertices = set()
    for e in edges:
        vertices.add(e[0])
        vertices.add(e[1])
    vertices = [(v, ) for v in list(vertices)]
    doc_sim = {'edges': edges, 'vertices': vertices}

    pkl.dump(doc_sim, open(os.path.join(save_dir, 'doc_sim.pkl'), 'wb'))
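
# A minimal usage sketch for get_sim(), assuming an active SparkSession named `spark`
# and the module-level imports the function relies on (Normalizer, udf, col, DoubleType,
# os, pickle as pkl). The input only needs an `id` column and a `features` vector column.
from pyspark.ml.feature import HashingTF, IDF, Tokenizer

docs = spark.createDataFrame(
    [(0, "spark ml normalizer example"),
     (1, "normalizer scales vectors to unit norm"),
     (2, "an unrelated document about cooking")],
    ["id", "text"])
tokens = Tokenizer(inputCol="text", outputCol="words").transform(docs)
hashed = HashingTF(inputCol="words", outputCol="raw", numFeatures=1 << 12).transform(tokens)
tfidf = IDF(inputCol="raw", outputCol="features").fit(hashed).transform(hashed)

# Keep pairs with cosine similarity >= 0.2 and write doc_sim.pkl to /tmp.
get_sim(tfidf, threshold=0.2, save_dir="/tmp")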
Example #3
def fit_kmeans(spark, products_df):
    step = 0

    step += 1
    tokenizer = Tokenizer(inputCol="title", outputCol=str(step) + "_tokenizer")

    step += 1
    stopwords = StopWordsRemover(inputCol=tokenizer.getOutputCol(), outputCol=str(step) + "_stopwords")

    step += 1
    tf = HashingTF(inputCol=stopwords.getOutputCol(), outputCol=str(step) + "_tf", numFeatures=16)

    step += 1
    idf = IDF(inputCol=tf.getOutputCol(), outputCol=str(step) + "_idf")

    step += 1
    normalizer = Normalizer(inputCol=idf.getOutputCol(), outputCol=str(step) + "_normalizer")

    step += 1
    kmeans = KMeans(featuresCol=normalizer.getOutputCol(), predictionCol=str(step) + "_kmeans", k=2, seed=20)

    kmeans_pipeline = Pipeline(stages=[tokenizer, stopwords, tf, idf, normalizer, kmeans])

    model = kmeans_pipeline.fit(products_df)
    words_prediction = model.transform(products_df)
    model.save("./kmeans")  # the whole machine learning instance is saved in a folder
    return model, words_prediction
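
# A minimal usage sketch for fit_kmeans(), assuming an active SparkSession `spark` and the
# pyspark.ml imports used above. The prediction column ends up named "6_kmeans" because of
# the step-numbering scheme (tokenizer=1, ..., kmeans=6); model.save() will fail if the
# ./kmeans directory already exists.
products = spark.createDataFrame(
    [("red running shoes",), ("blue running shoes",), ("stainless steel kettle",)],
    ["title"])
kmeans_model, clustered = fit_kmeans(spark, products)
clustered.select("title", "6_kmeans").show(truncate=False)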
Example #4
def main():
    spark = SparkSession.builder \
        .appName("Spark CV-job ad matching") \
        .config("spark.some.config.option", "some-value") \
        .master("local[*]") \
        .getOrCreate()

    VECTOR_SIZE = 50

    df_jobs = spark.read.json("alljobs4rdd/alljobs.jsonl").filter("description is not NULL").cache()
    df_jobs.registerTempTable("jobs")
    df_cvs = spark.read.json("allcvs4rdd/allcvs.jsonl").cache()
    df_cvs.registerTempTable("cvs")
    df_categories = spark.read.json("allcategories4rdd/allcategories.jsonl").cache()
    df_categories.registerTempTable("categories")

    joined = spark.sql("SELECT description AS text, jobId AS id, 'job' AS type FROM jobs UNION ALL \
               SELECT description AS text, cvid AS id, 'cv' AS type FROM cvs UNION ALL \
               SELECT skillText AS text, id AS id, 'categories' AS type FROM categories")

    tokenizer = Tokenizer(inputCol="text", outputCol="words")
    tokenized = tokenizer.transform(joined)

    remover = StopWordsRemover(inputCol="words", outputCol="filtered")
    removed = remover.transform(tokenized)

    word2Vec = Word2Vec(vectorSize=VECTOR_SIZE, minCount=0, inputCol="filtered", outputCol="vectors")
    model = word2Vec.fit(removed)
    resultDF = model.transform(removed)

    normalizer = Normalizer(inputCol="vectors", outputCol="result", p=2)
    l1NormData = normalizer.transform(resultDF)

    l1NormData.registerTempTable("resultTable")
    jobs = spark.sql("SELECT result AS jobsVec, id AS jobId FROM resultTable WHERE type = 'job'")
    cvs = spark.sql("SELECT result AS cvsVec, id AS cvid FROM resultTable WHERE type = 'cv'")
    categories = spark.sql("SELECT result AS categoriesVec, cat.id, cat.skillName, category FROM resultTable AS rt\
    LEFT JOIN categories AS cat ON rt.id = cat.id WHERE type = 'categories'")

    #Calculate job-cv similarity START
    crossJoined_job_cv = jobs.crossJoin(cvs)
    calculated_job_cv = crossJoined_job_cv.rdd.map(lambda x: (x.jobId, x.cvid, calculate_distance(x.jobsVec, x.cvsVec)))\
    .toDF(["jobid", "cvid", "distance"]).orderBy(asc("jobid")).coalesce(2)
    calculated_job_cv.write.csv('Calculated/word2vec2/job-cv')
    #Calculate job-cv similarity END

    #Calculate cv-category similarity START
    crossJoined_cv_cat = cvs.crossJoin(categories)
    calculated_cv_cat = crossJoined_cv_cat.rdd.map(lambda x: (x.cvid, x.id, x.skillName, x.category, calculate_distance(x.cvsVec, x.categoriesVec)))\
    .toDF(["cvid", "category_id", "skillName", "category", "distance"]).orderBy(asc("cvid"), asc("distance")).coalesce(2)
    calculated_cv_cat.write.csv('Calculated/word2vec2/cv-category')
    #Calculate cv-category similarity END

    #Job-category START
    crossJoined_job_cat = jobs.select("jobId", "jobsVec").crossJoin(categories.select("id", "skillName", "category", "categoriesVec"))
    calculatedDF_job_cat = crossJoined_job_cat.rdd\
    .map(lambda x: (x.jobId, x.id, x.skillName, x.category, calculate_distance(x.jobsVec, x.categoriesVec)))\
    .toDF(["jobid", "catid", "skillName", "category", "distance"])
    ordered_job_cat = calculatedDF_job_cat.orderBy( asc("distance")).coalesce(2)
    ordered_job_cat.write.csv('Calculated/word2vec2/job-category')
Example #5
def create_tfidf_model(sentenceDataFrame, ngrams=1, minDocFreq=0):

    tokenized = Tokenizer(inputCol="text",
                          outputCol="words").transoform(sentenceDataFrame)

    ngramDataFrame = NGram(n=ngrams, inputCol="words",
                           outputCol="ngrams").transform(tokenized)

    countVect = CountVectorizer(inputCol="ngrams", outputCol="rawFeatures")

    countVectModel = countVect.fit(ngramDataFrame)

    featurizedData = countVectModel.transform(ngramDataFrame)

    idf = IDF(minDocFreq=minDocFreq,
              inputCol="rawFeatures",
              outputCol="features")
    idfModel = idf.fit(featurizedData)
    rescaledData = idfModel.transform(featurizedData)

    rescaledData.select("label", "features")

    normalizer = Normalizer(inputCol="features", outputCol='scores')
    X = normalizer.transform(rescaledData)

    return X
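
# A minimal usage sketch for create_tfidf_model(), assuming an active SparkSession `spark`.
# Note that the function's select("label", "features") call requires the input DataFrame
# to carry a `label` column in addition to `text`.
sentences = spark.createDataFrame(
    [(0, "spark makes feature engineering straightforward"),
     (1, "tfidf gives rare words a higher weight")],
    ["label", "text"])
scored = create_tfidf_model(sentences, ngrams=1, minDocFreq=0)
scored.select("ngrams", "scores").show(truncate=False)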
Example #6
    def test_model_normalizer_2(self):
        data = self.spark.createDataFrame([(0, Vectors.dense(1.0, 0.5, -1.0)),
                                           (1, Vectors.dense(2.0, 1.0, 1.0)),
                                           (2, Vectors.dense(4.0, 10.0, 2.0))
                                           ]).toDF("id", "features")
        model = Normalizer(inputCol='features',
                           outputCol='norm_feature',
                           p=2.0)

        model_onnx = convert_sparkml(model, 'Sparkml Normalizer',
                                     [('features', FloatTensorType([1, 3]))])
        self.assertTrue(model_onnx is not None)

        # run the model
        predicted = model.transform(data)
        expected = predicted.toPandas().norm_feature.apply(
            lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
        data_np = data.toPandas().features.apply(
            lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
        paths = save_data_models(data_np,
                                 expected,
                                 model,
                                 model_onnx,
                                 basename="SparkmlNormalizer")
        onnx_model_path = paths[3]
        output, output_shapes = run_onnx_model(['norm_feature'], data_np,
                                               onnx_model_path)
        compare_results(expected, output, decimal=5)
Example #7
    def get_product_similarity(self):
        """
        Calculate the similarity between items/users
        """
        product_taxonomy = self.data.select(self.productCol,
                                            self.taxonomyCol).distinct()
        product_taxonomy = self.__data_manipulation(product_taxonomy)

        hashingTF = HashingTF(inputCol=self.taxonomyCol, outputCol="tf")
        tf = hashingTF.transform(product_taxonomy)

        idf = IDF(inputCol="tf", outputCol="feature").fit(tf)
        tfidf = idf.transform(tf)

        normalizer = Normalizer(inputCol="feature", outputCol="norm")
        norma_data = normalizer.transform(tfidf)

        col1 = "i." + self.productCol
        col2 = "j." + self.productCol

        dot_udf = udf(lambda x, y: float(x.dot(y)), DoubleType())
        result = norma_data.alias("i").crossJoin(norma_data.alias("j"))\
            .select(
                col(col1).alias("i"),
                col(col2).alias("j"),
                dot_udf("i.norm", "j.norm").alias("dot"))\
            .sort("i", "j")

        result = result.filter((result.i < result.j) & (result.dot > 0.5))

        return result
Example #8
    def __data_manipulation(self, col):

        data = self.data.select(col, self.taxonomyCol).distinct()
        data = data.withColumn(self.taxonomyCol,
                               data[self.taxonomyCol].cast(StringType()))

        concat_list = udf(lambda lst: ", ".join(lst), StringType())
        data = data.groupby(col).agg(
            collect_list(self.taxonomyCol).alias(self.taxonomyCol))

        data = data.withColumn(self.taxonomyCol, concat_list(self.taxonomyCol))
        data = data.withColumn(
            self.taxonomyCol,
            split(regexp_replace(self.taxonomyCol, " ", ""), ','))

        hashingTF = HashingTF(inputCol=self.taxonomyCol, outputCol="tf")
        tf = hashingTF.transform(data)

        idf = IDF(inputCol="tf", outputCol="feature").fit(tf)
        tfidf = idf.transform(tf)

        normalizer = Normalizer(inputCol="feature", outputCol="norm")
        norma_data = normalizer.transform(tfidf)

        return norma_data
Example #9
def execute():
    data = spark.read.csv('/Users/brillap/downloads/kc_house_data.csv', header=True, inferSchema=True)
    # data = spark.read.csv('hdfs://hadoop-master:9000/user/root/kc_house_data.csv', header=True, inferSchema=True)
    assembler = VectorAssembler() \
        .setInputCols(["bedrooms", "bathrooms", "sqft_living", "sqft_lot", "floors"]) \
        .setOutputCol("features") \
        .transform(data)
    # assembler.show()

    normalizer = Normalizer() \
        .setInputCol("features") \
        .setOutputCol("normFeatures") \
        .setP(2.0) \
        .transform(assembler)
    # normalizer.show()

    linear_regression = LinearRegression() \
        .setLabelCol("price") \
        .setFeaturesCol("normFeatures") \
        .setMaxIter(10) \
        .setRegParam(1.0) \
        .setElasticNetParam(1.0)

    # Note: `normalizer` above holds the transformed DataFrame because .transform() was chained.
    result_array = normalizer.randomSplit([0.7, 0.3])
    lr_model = linear_regression.fit(result_array[0])

    training_summary = lr_model.summary
    print("RMSE: %f" % training_summary.rootMeanSquaredError)

    predicted_data = lr_model.transform(result_array[1]).select("features", "normFeatures", "price", "prediction")
    predicted_data.show()
Example #10
def normalize(dataFrame, inputColNames, p_norm=2.0):
    if type(p_norm) is str:
        if p_norm.lower() == "inf":
            p_norm = float('inf')
        else:
            raise ValueError("The p_norm has to be float or 'inf'.")
    if type(inputColNames) is list:
        outputColName = "normalized features"
        assembler = VectorAssembler(inputCols=inputColNames, \
                                    outputCol="features")
        assembledDF = assembler.transform(dataFrame)
        normalizer=Normalizer(inputCol="features", \
                              outputCol=outputColName, \
                              p = p_norm)
        normalizedDF = normalizer.transform(assembledDF)
        colList = ""
        for inputColName in inputColNames:
            colList += " '" + inputColName + "' "
        if(p_norm == float('inf')):
            print ("Successfully assembled the column {0:s} to a feature vector and normalized using L^inf norm and create two new columns 'features' and 'normalized features'.".format(colList))
        else:
            print ("Successfully assembled the column {0:s} to a feature vector and normalized using L^{1:f} norm and create two new columns 'features' and 'normalized features'.".format(colList, p_norm))
        return normalizedDF
    else:
        raise ValueError("The inputColNames has to be a list of columns to generate a feature vector and then do normalization.")
Example #11
 def getNormalizer(self, dataFrameFeatures, outputColName):
     # Define a Normalizer to produce normalized features
     normalized = Normalizer(inputCol="features",
                             outputCol=outputColName,
                             p=2.0)
     # Get the normalized features
     normData = normalized.transform(dataFrameFeatures)
     return normData
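
# A minimal usage sketch, assuming an active SparkSession `spark` and an object `obj` whose
# class defines getNormalizer() as above. Under the L^2 norm, [3.0, 4.0] becomes [0.6, 0.8].
from pyspark.ml.linalg import Vectors
feats = spark.createDataFrame([(0, Vectors.dense(3.0, 4.0))], ["id", "features"])
# obj.getNormalizer(feats, "normFeatures").show()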
Example #12
def clustering(input_df, input_col_name, n):
    """ KMeans and PCA """
    input_df = input_df.select('state','categories','stars',input_col_name)
    norm = Normalizer(inputCol=input_col_name, outputCol="features", p=1.0)
    df = norm.transform(input_df)
    kmeans = KMeans(k=n, seed=2)
    KMmodel = kmeans.fit(df)
    predicted = KMmodel.transform(df).cache()
    pca = PCA(k=2, inputCol='features', outputCol="pc")
    df =  pca.fit(dfsample).transform(dfsample).cache()
    return df
def get_feature_vector(input_df):
    assembler_1 = VectorAssembler(inputCols=["Open", "Close"],
                                  outputCol="stock_features")
    scaler = Normalizer(inputCol="stock_features",
                        outputCol="scaled_stock_features")
    assembled_df = assembler_1.transform(input_df)
    scaled_stock = scaler.transform(assembled_df).drop('stock_features')
    assembler_2 = VectorAssembler(
        inputCols=["scaled_stock_features", "Sentiment"], outputCol="features")
    final_df = assembler_2.transform(scaled_stock)
    return final_df.drop('scaled_stock_features')
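
# A minimal usage sketch for get_feature_vector(), assuming an active SparkSession `spark`
# and the column names the function expects (Open, Close, Sentiment). The Normalizer here
# uses its default p=2.0, so the stock features are scaled to unit L^2 norm per row.
stock = spark.createDataFrame([(101.0, 103.0, 0.4), (99.5, 98.0, -0.2)],
                              ["Open", "Close", "Sentiment"])
get_feature_vector(stock).select("features").show(truncate=False)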
Example #14
def termFrequency(table):

    #calculates the term frequency of attributes
    hashingTF = HashingTF(inputCol='key_words', outputCol='hashing')
    tf = hashingTF.transform(table)
    tf.cache()

    #normalises the term frequency data
    normalizer = Normalizer(inputCol='hashing', outputCol='norm')
    term = normalizer.transform(tf)

    return term
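
# A minimal usage sketch for termFrequency(), assuming an active SparkSession `spark` and a
# `key_words` column that already contains tokenized text.
table = spark.createDataFrame([(1, ["spark", "ml", "tfidf"]),
                               (2, ["normalizer", "unit", "norm"])],
                              ["id", "key_words"])
termFrequency(table).select("key_words", "norm").show(truncate=False)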
Example #15
 def normalizedDF(self):
     from pyspark.ml.feature import Normalizer
     assembler = VectorAssembler(inputCols=varNames, outputCol="features")
     normalizer = Normalizer(inputCol="features",
                             outputCol="normFeatures",
                             p=2)  # p is the order of the norm
     pipeline = Pipeline(stages=[assembler, normalizer])
     # fit() returns a PipelineModel; transform() produces the normalized DataFrame
     self.df_norm = pipeline.fit(vecdf).transform(vecdf)
     # Normalize each Vector using the L^inf norm (applied to the assembled features)
     lInfNormData = normalizer.transform(assembler.transform(vecdf),
                                         {normalizer.p: float("inf")})
     return (0)
Example #16
def getNormalizerTest(dataFrameFeatures, outputColName):

    print("inside normalizer test")
    print(dataFrameFeatures)
    # Define a Normalizer to produce normalized features
    normalized = Normalizer(inputCol="features",
                            outputCol=outputColName,
                            p=2.0)
    # Get the normalized features
    normData = normalized.transform(dataFrameFeatures)
    #print normData.show()
    return normData
Example #17
def transform_input(input_text):
    ''' '''
    lines = [(input_text, )]
    df = spark.createDataFrame(lines, ['text'])

    def removePunctuation(text):
        text = text.lower().strip()
        text = re.sub('[^0-9a-zA-Z ]', '', text)
        return text

    remove_punt_udf = udf(removePunctuation, StringType())

    tokenizer = Tokenizer(inputCol='text_noPunct', outputCol='token_text')
    df_new = df.withColumn('text_noPunct', remove_punt_udf('text'))
    df_new = tokenizer.transform(df_new)

    def remove_blank_token(text):
        text = list(filter(lambda x: x != '', text))
        return text

    remove_blank_token_udf = udf(remove_blank_token, ArrayType(StringType()))
    df_new = df_new.withColumn('token_text',
                               remove_blank_token_udf('token_text'))

    sw_remover = StopWordsRemover(inputCol='token_text',
                                  outputCol='stop_token')
    normalizer = Normalizer(inputCol='w2v', outputCol='w2v_norm')

    pipe = PipelineModel(stages=(sw_remover, w2v_model, normalizer))
    df_final = pipe.transform(df_new)

    return df_final
Example #18
def normalizer(df, input_cols, p=2.0):
    """
    Transforms a dataset of Vector rows, normalizing each Vector to have unit norm. It takes parameter p, which
    specifies the p-norm used for normalization. (p=2) by default.
    :param df: Dataframe to be transformed
    :param input_cols: Columns to be normalized.
    :param p:  p-norm used for normalization.
    :return: Dataframe with normalized columns.
    """

    # Validate that input_cols is a string or a list
    if not is_(input_cols, [str, list]):
        RaiseIt.type_error(input_cols, [str, list])

    if is_str(input_cols):
        input_cols = [input_cols]

    # Validate that p is numeric
    if not is_(p, [float, int]):
        RaiseIt.type_error(p, [float, int])

    df = df.cols.cast(input_cols, "vector")

    normal = [
        Normalizer(inputCol=column, outputCol=column + "_normalized", p=p)
        for column in list(set(input_cols))
    ]

    pipeline = Pipeline(stages=normal)

    df = pipeline.fit(df).transform(df)

    return df
Example #19
def tfidf_top_tokens(df, token_cols, min_freq=1):
    output = df
    for c in token_cols:
        pre = c
        cv = CountVectorizer(inputCol=pre,
                             outputCol=pre + '_rawFeatures',
                             minDF=min_freq)
        idf = IDF(inputCol=pre + "_rawFeatures",
                  outputCol=pre + "_features",
                  minDocFreq=min_freq)
        normalizer = Normalizer(p=2.0,
                                inputCol=pre + "_features",
                                outputCol=pre + '_tfidf')
        stages = [cv, idf, normalizer]
        pipeline = Pipeline(stages=stages)
        model = pipeline.fit(output)
        output = model.transform(output)\
          .drop(pre+'_rawFeatures', pre+'_features')

        cvModel = model.stages[0]
        vocab = spark.sparkContext.broadcast(cvModel.vocabulary)
        output = output.withColumn(
            pre + '_top_tokens',
            top_kw_from_tfidf(vocab, n=5)(f.col(pre + "_tfidf")))

    return output
Example #20
def normalizer(df, input_cols, p=2.0):
    """
    Transforms a dataset of Vector rows, normalizing each Vector to have unit norm. It takes parameter p, which
    specifies the p-norm used for normalization. (p=2) by default.
    :param df: Dataframe to be transformed
    :param input_cols: Columns to be normalized.
    :param p:  p-norm used for normalization.
    :return: Dataframe with normalized columns.
    """

    # Validate that input_cols is a string or a list
    if not is_(input_cols, [str, list]):
        RaiseIt.type_error(input_cols, [str, list])

    if is_str(input_cols):
        input_cols = [input_cols]

    # Validate that p is numeric
    if not is_(p, [float, int]):
        RaiseIt.type_error(p, [float, int])

    df = df.cols.cast(input_cols, "vector")

    # TODO https://developer.ibm.com/code/2018/04/10/improve-performance-ml-pipelines-wide-dataframes-apache-spark-2-3/
    normal = [
        Normalizer(inputCol=col_name,
                   outputCol=name_col(col_name, "normalized"),
                   p=p) for col_name in list(set(input_cols))
    ]

    pipeline = Pipeline(stages=normal)

    df = pipeline.fit(df).transform(df)

    return df
Example #21
def normalize(dataFrame, inputColNames, p_norm=2.0):
    if type(p_norm) is str:
        if p_norm.lower() == "inf":
            p_norm = float('inf')
        else:
            raise ValueError("The p_norm has to be float or 'inf'.")
    if type(inputColNames) is list:
        outputColName = "normalized features"
        assembledDF = getAssembledDataFrame(dataFrame, inputColNames)
        normalizer=Normalizer(inputCol="features", \
                              outputCol=outputColName, \
                              p = p_norm)
        normalizedDF = normalizer.transform(assembledDF).drop("features")
        return normalizedDF
    else:
        raise ValueError(
            "The inputColNames has to be a list of columns to generate a feature vector and then do normalization."
        )
Example #22
def train_and_save_model_df(sc_local):
    trainingData = sc_local.textFile(FILE_TRAINING_DATA) \
        .flatMap(lambda line: parse_apache_log_line(line))
    data = trainingData.toDF()

    indexers = [
        StringIndexer(inputCol=c,
                      outputCol="{0}_indexed".format(c),
                      handleInvalid="keep") for c in ['endpoint', 'method']
    ]
    encoders = [
        OneHotEncoder(inputCol=indexer.getOutputCol(),
                      outputCol="{0}_encoded".format(indexer.getOutputCol()))
        for indexer in indexers
    ]
    assembler = VectorAssembler(
        inputCols=['response_code'] +
        [encoder.getOutputCol() for encoder in encoders],
        outputCol='features')
    pipeline = Pipeline(stages=indexers + encoders + [assembler])
    transform_model = pipeline.fit(data)
    output = transform_model.transform(data)

    remove_existing_model(TRANSFORM_MODEL_LOCATION)
    transform_model.save(TRANSFORM_MODEL_LOCATION)

    normalizer = Normalizer(inputCol="features",
                            outputCol="normFeatures",
                            p=1.0)
    output = normalizer.transform(output)

    kmeans = pyspark.ml.clustering.KMeans().setK(2).setSeed(1)
    model = kmeans.fit(output)

    remove_existing_model(MODEL_LOCATION)
    model.save(MODEL_LOCATION)

    predictions = model.transform(output)
    evaluator = ClusteringEvaluator()
    silhouette = evaluator.evaluate(predictions)

    print('Silhouette: ', silhouette)
    costs = model.computeCost(output)
    print('Costs: ', costs)
Example #23
    def term_frequency(self):
        # TODO: save vocabulary to firebase
        beers = self.beer_reviews
        cv = CountVectorizer(inputCol='lemmatized_tokens',
                             outputCol='features_tf',
                             vocabSize=7500)
        # cv_model = cv.fit(self.beer_reviews)
        # self.beer_reviews = cv_model.transform(self.beer_reviews)
        cv_model = cv.fit(beers)
        # self.beer_reviews = cv_model.transform(beers)
        beers = cv_model.transform(beers)
        self.vocabulary = {
            idx: val.encode('utf-8')
            for idx, val in enumerate(cv_model.vocabulary)
        }

        normalizer = Normalizer(inputCol='features_tf',
                                outputCol='features_normalized')
        # self.beer_reviews = normalizer.transform(self.beer_reviews)
        self.beer_reviews = normalizer.transform(beers)
Example #24
def execute():
    input_data = spark.read.csv('hdfs://hadoop-master:9000/kc_house_data.csv',
                                header=True,
                                inferSchema=True)
    # input_data = spark.read.csv('/Users/krithikab/Desktop/PRACT/SparkML/kc_house_data.csv', header=True, inferSchema=True)

    data = input_data\
        .filter(input_data.price > 0)\
        .withColumn("age", 2020-input_data.yr_built)\
        .drop_duplicates()

    assembler = VectorAssembler() \
        .setInputCols(
        ["bedrooms", "bathrooms", "sqft_living", "floors", "condition", "sqft_lot",
         "waterfront", "view", "grade", "sqft_above", "sqft_basement", "age",
         "zipcode", "lat", "long", "sqft_living15", "sqft_lot15"]) \
        .setOutputCol("features") \
        .transform(data)

    normalizer = Normalizer() \
        .setInputCol("features") \
        .setOutputCol("normFeatures") \
        .transform(assembler)

    linear_regression = LinearRegression() \
        .setLabelCol("price") \
        .setFeaturesCol("normFeatures") \
        .setMaxIter(10) \
        .setRegParam(1.0) \
        .setElasticNetParam(1.0)

    # Note: `normalizer` above holds the transformed DataFrame because .transform() was chained.
    result_array = normalizer.randomSplit([0.7, 0.3])
    lr_model = linear_regression.fit(result_array[0])

    predicted_data = lr_model.transform(result_array[1]).select(
        "features", "normFeatures", "price", "prediction")
    # predicted_data.select("price", "prediction").write.csv("result.csv")
    predicted_data.select(
        "price",
        "prediction").write.csv("hdfs://hadoop-master:9000/prediction.csv")
Example #25
def predict(team_a,team_b):
    col=['player_name','Usg%','Per','time']
    dataA=[]
    dataB=[]
    for player in team_a:
        playerInfo = team_a[player]
        name = playerInfo[0]
        time = int(playerInfo[1])
        Usgp = pd_df.loc[pd_df['player_name']==name,'Usg%'].values[0]
        Per = pd_df.loc[pd_df['player_name']==name,'Per'].values[0]
        dataA.append((name,Usgp,Per,time))
    #print(dataA)
    for player in team_b:
        playerInfo = team_b[player]
        name = playerInfo[0]
        #print(name)
        time = int(playerInfo[1])
        Usgp = pd_df.loc[pd_df['player_name']==name,'Usg%'].values[0]
        Per = pd_df.loc[pd_df['player_name']==name,'Per'].values[0]
        dataB.append((name,Usgp,Per,time))
    #print(dataB)
    col=['player_name','Usg%','Per','time']
    pd_tmp = pandas.DataFrame(dataA,columns=col)
    df_teamA = sqlContext.createDataFrame(pd_tmp)
    pd_tmp=pandas.DataFrame(dataB,columns=col)
    df_teamB = sqlContext.createDataFrame(pd_tmp)
    #df_teamA.show()
    #df_teamB.show()
    vector = VectorAssembler(inputCols=['Usg%','Per','time'],outputCol='features')
    normalizer = Normalizer(p=2.0, inputCol="features", outputCol="norm_test")
    pipeline = Pipeline(stages=[vector,normalizer])
    pipeline_fit = pipeline.fit(df_teamA)
    df_A = pipeline_fit.transform(df_teamA)
    #df_A.show()
    pipeline_fit = pipeline.fit(df_teamB)
    df_B = pipeline_fit.transform(df_teamB)
    #df_B.show()
    model = cl.RandomForestClassificationModel.load('Model_v3_0')
    predictions_A = model.transform(df_A)
    #predictions_A.show()
    predictions_B = model.transform(df_B)
    #predictions_B.show()
    percentageA = 100 / predictions_A.count()
    percentageB = 100 / predictions_B.count()
    a = predictions_A.where(predictions_A['prediction']==1).count()
    b = predictions_B.where(predictions_B['prediction']==0).count()
    percentage = (a* percentageA + b*percentageB)/2
    return percentage
Example #26
def normalizer(df, input_cols, p=2.0):
    """
    Transforms a dataset of Vector rows, normalizing each Vector to have unit norm. It takes parameter p, which
    specifies the p-norm used for normalization. (p=2) by default.
    :param df: Dataframe to be transformed
    :param input_cols: Columns to be normalized.
    :param p:  p-norm used for normalization.
    :return: Dataframe with normalized columns.
    """

    # Check if columns argument must be a string or list datatype:

    assert isinstance(input_cols, (str, list)), \
        "Error: %s argument must be a string or a list." % "input_cols"

    if isinstance(input_cols, str):
        input_cols = [input_cols]

    assert isinstance(
        p, (float, int)), "Error: p argument must be a numeric value."

    # Convert ArrayType() column to DenseVector
    def arr_to_vec(arr_column):
        """
        :param arr_column: Column name
        :return: Returns DenseVector by converting an ArrayType() column
        """
        return DenseVector(arr_column)

    # User-Defined function
    # TODO: use apply() to use Pyarrow
    udf_arr_to_vec = F.udf(arr_to_vec, VectorUDT())

    # Convert any column that is not already a Vector type into a DenseVector column
    for col in input_cols:
        if not isinstance(df.schema[col].dataType, VectorUDT):
            df = df.withColumn(col, udf_arr_to_vec(df[col]))

    normal = [
        Normalizer(inputCol=column, outputCol=column + "_normalized", p=p)
        for column in list(set(input_cols))
    ]

    pipeline = Pipeline(stages=normal)

    df = pipeline.fit(df).transform(df)

    return df
Example #27
def get_feature_eng_stages(categoricalColumns, label="has_heart_disease"):

  stages = [] # stages in our Pipeline
  for categoricalCol in categoricalColumns:
      # Category Indexing with StringIndexer
      stringIndexer = StringIndexer(inputCol=categoricalCol, outputCol=categoricalCol + "Index")
      # Use OneHotEncoder to convert categorical variables into binary SparseVectors
      encoder = OneHotEncoderEstimator(inputCols=[stringIndexer.getOutputCol()], outputCols=[categoricalCol + "classVec"])
      # Add stages.  These are not run here, but will run all at once later on.
      stages += [stringIndexer, encoder]
      
  label_stringIdx = StringIndexer(inputCol = label, outputCol="label")
  stages += [label_stringIdx]
  numericCols = ["age", "is_smoker"]
  assemblerInputs = [c + "classVec" for c in categoricalColumns] + numericCols
  assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="raw_features")
  normalizer = Normalizer(inputCol="raw_features", outputCol="features", p=1.0)
  stages += [assembler,normalizer]
  
  return(stages)
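
# A minimal usage sketch: the stages returned above are meant to be combined with an
# estimator in a Pipeline. The column names below ("sex", "chest_pain_type") and the
# DataFrame `heart_df` are placeholders, not part of the original snippet.
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression

stages = get_feature_eng_stages(["sex", "chest_pain_type"])
lr = LogisticRegression(featuresCol="features", labelCol="label")
pipeline = Pipeline(stages=stages + [lr])
# model = pipeline.fit(heart_df)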
Example #28
df = spark.read.csv('file:///home/zfar/Sentiment Analysis Dataset.csv',
                    header=True)

df = df.select(df['ItemID'], df['SentimentText'], df['label'])

training = df.selectExpr("cast(itemID as int) id", "SentimentText",
                         "cast(label as int) label")

tokenizer = Tokenizer(inputCol="SentimentText", outputCol="words")
remover = StopWordsRemover(inputCol=tokenizer.getOutputCol(),
                           outputCol="filtered")
ngrams = NGram(n=2, inputCol=remover.getOutputCol(), outputCol="ngrams")
hashingTF = HashingTF(inputCol=ngrams.getOutputCol(), outputCol="rawfeatures")
idf = IDF(inputCol=hashingTF.getOutputCol(), outputCol="idffeatures")
normalizer = Normalizer(inputCol=idf.getOutputCol(),
                        outputCol="features",
                        p=1.0)

#lr = LogisticRegression(maxIter=10, regParam=0.001)
nb = NaiveBayes(smoothing=1.0)
pipeline = Pipeline(
    stages=[tokenizer, remover, ngrams, hashingTF, idf, normalizer, nb])
model = pipeline.fit(training)
"""
paramGrid = ParamGridBuilder().addGrid(hashingTF.numFeatures, [10, 100, 1000]).addGrid(lr.regParam, [0.1, 0.01]).build()

crossval = CrossValidator(estimator=pipeline,
                          estimatorParamMaps=paramGrid,
                          evaluator=BinaryClassificationEvaluator(),
                          numFolds=2)
"""
Example #29
from pyspark.ml.feature import Normalizer
from pyspark.ml.linalg import Vectors
# $example off$
from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("NormalizerExample")\
        .getOrCreate()

    # $example on$
    dataFrame = spark.createDataFrame([
        (0, Vectors.dense([1.0, 0.5, -1.0]),),
        (1, Vectors.dense([2.0, 1.0, 1.0]),),
        (2, Vectors.dense([4.0, 10.0, 2.0]),)
    ], ["id", "features"])

    # Normalize each Vector using $L^1$ norm.
    normalizer = Normalizer(inputCol="features", outputCol="normFeatures", p=1.0)
    l1NormData = normalizer.transform(dataFrame)
    print("Normalized using L^1 norm")
    l1NormData.show()

    # Normalize each Vector using $L^\infty$ norm.
    lInfNormData = normalizer.transform(dataFrame, {normalizer.p: float("inf")})
    print("Normalized using L^inf norm")
    lInfNormData.show()
    # $example off$

    spark.stop()
from pyspark.ml.feature import Normalizer
from pyspark.ml.linalg import Vectors

dataFrame = spark.createDataFrame([(
    0,
    Vectors.dense([1.0, 0.5, -1.0]),
), (
    1,
    Vectors.dense([2.0, 1.0, 1.0]),
), (
    2,
    Vectors.dense([4.0, 10.0, 2.0]),
)], ["id", "features"])

# Normalize each Vector using $L^1$ norm.
normalizer = Normalizer(inputCol="features", outputCol="normFeatures", p=1.0)
l1NormData = normalizer.transform(dataFrame)
print("Normalized using L^1 norm")
l1NormData.show()

# Normalize each Vector using $L^\infty$ norm.
lInfNormData = normalizer.transform(dataFrame, {normalizer.p: float("inf")})
print("Normalized using L^inf norm")
lInfNormData.show()
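
# Sanity check for the L^inf case (plain Python): each row is divided by its maximum
# absolute value, e.g. [2.0, 1.0, 1.0] -> [1.0, 0.5, 0.5] and [4.0, 10.0, 2.0] -> [0.4, 1.0, 0.2].
rows = [[1.0, 0.5, -1.0], [2.0, 1.0, 1.0], [4.0, 10.0, 2.0]]
for r in rows:
    linf = max(abs(v) for v in r)   # L^inf norm = max absolute value
    print([v / linf for v in r])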

# COMMAND ----------

###MinMaxScaler (0, 1)
from pyspark.ml.feature import MinMaxScaler
from pyspark.ml.linalg import Vectors
Example #31
df_energy.createOrReplaceTempView('df_energy')

# In[6]:

df_join = spark.sql("""
select * from df inner join df_energy on df.class=df_energy.class
""")
df_join.show()

# In[7]:

from pyspark.ml.feature import VectorAssembler, Normalizer

vectorAssembler = VectorAssembler(inputCols=["x", "y", "z"],
                                  outputCol="features")
normalizer = Normalizer(inputCol="features", outputCol="features_norm", p=1.0)

# In[8]:

from pyspark.ml.regression import LinearRegression

lr = LinearRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)

# In[9]:

from pyspark.ml import Pipeline
pipeline = Pipeline(stages=[vectorAssembler, normalizer, lr])

# In[10]:

model = pipeline.fit(df_join)
# COMMAND ----------

from pyspark.ml.feature import ElementwiseProduct
from pyspark.ml.linalg import Vectors
scaleUpVec = Vectors.dense(10.0, 15.0, 20.0)
scalingUp = ElementwiseProduct()\
  .setScalingVec(scaleUpVec)\
  .setInputCol("features")
scalingUp.transform(scaleDF).show()


# COMMAND ----------

from pyspark.ml.feature import Normalizer
manhattanDistance = Normalizer().setP(1).setInputCol("features")
manhattanDistance.transform(scaleDF).show()


# COMMAND ----------

from pyspark.ml.feature import StringIndexer
lblIndxr = StringIndexer().setInputCol("lab").setOutputCol("labelInd")
idxRes = lblIndxr.fit(simpleDF).transform(simpleDF)
idxRes.show()


# COMMAND ----------

valIndexer = StringIndexer().setInputCol("value1").setOutputCol("valueInd")
valIndexer.fit(simpleDF).transform(simpleDF).show()
    tags_users_df=sqlContext.createDataFrame(tags_users)
    print(tags_users_df.take(2))
    #
    #
    # print('Indexing strings')
    cVec = CountVectorizer(inputCol='tags', outputCol="tag_features",minDF=10.)
    model=cVec.fit(tags_users_df)
    td=model.transform(tags_users_df)

    with open('/home/erlenda/data/konsum/countvec_vocabulary.pkl',mode='wb') as ff:
        pkl.dump(model.vocabulary,ff)



    normalizer=Normalizer(p=1.,inputCol='tag_features',outputCol='tags_normalized')
    tdNorm=normalizer.transform(td)
    print(tdNorm.take(5))

    tdNorm.write.save('/home/erlenda/data/konsum/tag_profiler_parquet')

    samples=tdNorm.filter(tdNorm.posts_with_tags>10).take(10)
    #pprint(samples)




    # stringIndexer = StringIndexer(inputCol="tags", outputCol="indexed_tags")
    # model=stringIndexer.fit(tags_users_df)
    # td=model.transform(tags_users_df)
    # print('Retrieving indices')
def build_model(df_ml):
    '''
    Function builds machine learning model to predict churn
    
    INPUT:
        df_ml - dataset which contains user features to predict customer churn
        
    OUTPUT:
        model - model which predicts customer churn
    '''

    # split into train, test and validation sets (60% - 20% - 20%)
    df_ml = df_ml.withColumnRenamed("churn", "label")

    train, test_valid = df_ml.randomSplit([0.6, 0.4], seed=42)
    test, validation = test_valid.randomSplit([0.5, 0.5], seed=42)

    # index and encode categorical features gender, level and state

    stringIndexerGender = StringIndexer(inputCol="gender",
                                        outputCol="genderIndex",
                                        handleInvalid='skip')
    stringIndexerLevel = StringIndexer(inputCol="last_level",
                                       outputCol="levelIndex",
                                       handleInvalid='skip')
    stringIndexerState = StringIndexer(inputCol="last_state",
                                       outputCol="stateIndex",
                                       handleInvalid='skip')

    encoder = OneHotEncoderEstimator(
        inputCols=["genderIndex", "levelIndex", "stateIndex"],
        outputCols=["genderVec", "levelVec", "stateVec"],
        handleInvalid='keep')

    # create vector for features
    features = [
        'genderVec', 'levelVec', 'stateVec', 'days_active', 'avg_songs',
        'avg_events', 'thumbs_up', 'thumbs_down', 'addfriend'
    ]
    assembler = VectorAssembler(inputCols=features, outputCol="rawFeatures")

    # normalize features
    normalizer = Normalizer(inputCol="rawFeatures",
                            outputCol="features",
                            p=1.0)

    # initialize random forest classifier with tuned hyperparameters
    rf = RandomForestClassifier(labelCol="label",
                                featuresCol="features",
                                numTrees=100,
                                impurity='gini',
                                maxDepth=5,
                                featureSubsetStrategy='sqrt')

    # assemble pipeline
    pipeline = Pipeline(stages=[
        stringIndexerGender, stringIndexerLevel, stringIndexerState, encoder,
        assembler, normalizer, rf
    ])

    # fit model
    model = pipeline.fit(train)

    # predict churn
    pred_train = model.transform(train)
    pred_test = model.transform(test)
    pred_valid = model.transform(validation)

    # evaluate results
    predictionAndLabels = pred_train.rdd.map(
        lambda lp: (float(lp.prediction), float(lp.label)))

    # Instantiate metrics object
    metrics = MulticlassMetrics(predictionAndLabels)

    # print F1-score
    print("F1 score on train dataset is %s" % metrics.fMeasure())

    predictionAndLabels = pred_test.rdd.map(
        lambda lp: (float(lp.prediction), float(lp.label)))

    # Instantiate metrics object
    metrics = MulticlassMetrics(predictionAndLabels)

    # F1 score
    print("F1 score on test dataset is %s" % metrics.fMeasure())

    predictionAndLabels = pred_valid.rdd.map(
        lambda lp: (float(lp.prediction), float(lp.label)))

    # Instantiate metrics object
    metrics = MulticlassMetrics(predictionAndLabels)

    # F1 score
    print("F1 score on validation dataset is %s" % metrics.fMeasure())

    return model
Example #35
	return wordbag
	

documents = sqlContext.createDataFrame(
    sc.pickleFile('merged_file/part-00000').map(
        lambda x: [x['eval_id'], x['no'], create_wordbag(x), x['professor'],
                   x['lec_code'][:4], x['lec_code'][5], x['eval_total'], x['eval_id']]),
    ['eval_id', 'no', 'words', 'prof_name', 'department', 'grade', 'eval_total', 'eval_id'])

#users = sqlContext.createDataFrame(sc.pickleFile('merged_file').map(lambda x : (x['mb_no'],x['lec_code'][:4])),['user','department']).orderBy('department')
#for u in users.select('department','user').take(10000):
#	print u
'''
professors = documents.select('prof_name').distinct()
department = documents.select('department').distinct()
#grade	1/2/3/4
eval_total = documents.select('eval_total').distinct() # 1/2/3/4/5

for e in eval_total.collect():
	print e
'''



htf = HashingTF(inputCol= 'words',outputCol = 'rawFeatures')
featured = htf.transform(documents)
idf = IDF(inputCol = 'rawFeatures',outputCol = 'idf')
idfModel = idf.fit(featured)
tf_idf = idfModel.transform(featured)
normalizer = Normalizer(inputCol = 'idf', outputCol = 'idf_norm', p = 2.0)
normData = normalizer.transform(tf_idf)

normData.rdd.saveAsPickleFile('idf_normalized')

for i in range(len(pred)):
    if pred[i] == list(Y_test)[i]:
        corr += 1
print('Accuracy: ' + str(corr * 1.0 / len(pred)))

# Build a BP (multilayer perceptron) neural network model on Spark
from pyspark.sql import Row
from pyspark.ml.feature import Normalizer
from pyspark.ml.linalg import Vectors
lines = sc.textFile("hdfs:///lushun/a.txt")
parts = lines.map(lambda l: l.split(" "))
# The Normalizer needs an ML Vector column, and the classifier expects a numeric "label" column
df = parts.map(lambda p: Row(features=Vectors.dense([float(x) for x in p[:-1]]),
                             label=int(p[-1])))
df = spark.createDataFrame(df)
df.createOrReplaceTempView("df")
normalizer = Normalizer(inputCol="features", outputCol="normFeatures", p=1.0)
l1NormData = normalizer.transform(df)
l1NormData.createOrReplaceTempView("l1NormData")
l1NormData = spark.sql("SELECT label, normFeatures FROM l1NormData")
l1NormData.show()
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
splits = l1NormData.randomSplit([0.7, 0.3])
train = splits[0]
test = splits[1]
layers = [36300, 200, 200, 6]
trainer = MultilayerPerceptronClassifier(maxIter=100, layers=layers, seed=1234,
                                         featuresCol="normFeatures")
model = trainer.fit(train)
# compute accuracy on the test set
result = model.transform(test)
predictionAndLabels = result.select("prediction", "label")
evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
Example #37
	def trainModel(self):
		
		logger.info("Training the model...")		

		query = '''select page_id, max(page_title) as page_title from cooladata where date_range(all) and page_id is not null group by page_id;'''

		def SQLtoURL(query):
    
    			data = query.replace('\n', ' ').replace('\t',' ').replace('   ',' ').replace('  ',' ')
    			return data


		def QueryXXXXX(query, file = None):
   
    			session = Session()
    			response = session.post(data = {'tq': query,}, url = 'https://app.XXXXXX.com/api/v2/projects/115659/cql/', headers = {'Authorization': 'Token dtQvPVejNcSebX1EkU0AqB2TJRXznIgZiDvDu3HR'},)
    			return response.content

		table = json.loads(codecs.decode(QueryXXXXX(SQLtoURL(query)), 'utf-8'))['table']
		title_list = [x['c'] for x in table['rows']]
		table_cols = [d['label'] for d in table['cols']]  
		def convert_row(row):
    			rowlist = [d['v'] for d in row]
    			return rowlist

		rd = self.sc.parallelize(title_list).map(convert_row)
		titleData = self.spark.createDataFrame(rd, table_cols)
		titleData = titleData.dropna()
		
		hebrew_stopwords = stop_words()
		def rmv(words):
    			for punc in punctuation:
        			words = words.replace(punc,"")
    			for hword in hebrew_stopwords:
        			words = words.replace(hword, " ")
    			return words

		self.spark.udf.register("rmv", rmv, StringType())
		titleData.registerTempTable("wordstable")
		cleanedSentenceData = self.spark.sql("select page_id, page_title, rmv(page_title) as cleanedSentence from wordstable")
		tokenizer = Tokenizer(inputCol="cleanedSentence", outputCol="words")
		wordsData = tokenizer.transform(cleanedSentenceData)

		cv = CountVectorizer(inputCol="words", outputCol="rawFeatures", minDF = 2.0)
		cvModel = cv.fit(wordsData)
		featurizedData = cvModel.transform(wordsData)

		idf = IDF(inputCol="rawFeatures", outputCol="features")
		idfModel = idf.fit(featurizedData)
		rescaledData = idfModel.transform(featurizedData)

		lda = LDA(k=100)
		ldaModel = lda.fit(rescaledData)
		postFactorizedData = ldaModel.transform(rescaledData)

		norm = Normalizer(inputCol = "topicDistribution", outputCol="normTopicDist")
		scaledFactorizedNormalizedData = norm.transform(postFactorizedData)
		
		self.model = scaledFactorizedNormalizedData
		
		logger.info("model is built!")
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler 
vectorassembler = VectorAssembler(
							inputCols = ['x','y', 'z'],
							outputCol = 'features'
							)
features_vectorized = vectorassembler.transform(encoded)
features_vectorized.show()


# In[12]:


from pyspark.ml.feature import Normalizer
normalizer = Normalizer(inputCol = 'features', outputCol='features_norm', p=1.0)
normalized_data = normalizer.transform(features_vectorized)
normalized_data.show()


# In[17]:


from pyspark.ml import Pipeline
pipeline = Pipeline(stages = [
					indexer, encoder, vectorassembler, normalizer
					])
					
					
model = pipeline.fit(df)					
prediction = model.transform(df)
Example #39
# Instead, we create a second DataFrame in which we add the column we want.
dfVect = dfBigram.withColumn("words", udfVectorizeUni("words"))
# Here the words have indeed been replaced by their sparse vectors
print "DataFrame(1-gram): the words have been replaced by sparse vectors"
dfVect.show()


udfVectorizeBi=UserDefinedFunction(lambda x : vectorizeBi(x),VectorUDT())
dfVect2 = dfVect.withColumn("bigrams", udfVectorizeBi("bigrams"))
print "DataFrame(bi-gram): On a bien remplacé ici du coup les mots par les vecteurs sparse"
dfVect2.show()

# For language-processing tasks, it is customary to L2-normalize
# the feature vectors: apparently this is what works best.
from pyspark.ml.feature import Normalizer
normalizerUni = Normalizer(inputCol='words',outputCol='normWords',p=2.0)
normalizerBi = Normalizer(inputCol="bigrams",outputCol='normBigrams',p=2.0)
dfNorm = normalizerUni.transform(dfVect2)
dfNorm2 = normalizerBi.transform(dfNorm)
print "DataFrame(bi-gram): normalisé"
dfNorm2.select('words','normWords').show()
# The difference does not show up in the table, since there is only room to display the indices
# of the non-zero elements, not their values
# Moving on to TF-IDF
# Of course, by choosing the right DataFrame among those above, these computations can be applied
# to any column (bigrams, with or without stop words, ...)
from pyspark.ml.feature import HashingTF
htf = HashingTF(inputCol='words',outputCol='wordsTF',numFeatures=10000)
dfTrainTF = htf.transform(dfTrainTokNoSw)
# INverse doc frequency
from pyspark.ml.feature import IDF