Example #1
def hash_rating(author_subreddit_rating_rdd, sc):
    sql_context = SQLContext(sc)

    author_sub_schema = StructType([
        StructField("author", StringType(), True),
        StructField("subreddit", StringType(), True),
        StructField("rating", LongType(), True)
    ])
    asr_df = sql_context.createDataFrame(author_subreddit_rating_rdd, author_sub_schema)

    author_rdd = author_subreddit_rating_rdd.map(lambda (a, s, r): a)
    aid_rdd = author_rdd.distinct().zipWithUniqueId().cache()
    author_id_schema = StructType([
        StructField("author", StringType(), True),
        StructField("author_id", LongType(), True)
    ])
    aid_df = sql_context.createDataFrame(aid_rdd, author_id_schema)
    aid_s_r_df = aid_df.join(asr_df, on='author').drop('author').cache()

    subreddit_rdd = author_subreddit_rating_rdd.map(lambda (a, s, r): s)
    sid_rdd = subreddit_rdd.distinct().zipWithUniqueId().cache()
    subreddit_id_schema = StructType([
        StructField("subreddit", StringType(), True),
        StructField("subreddit_id", LongType(), True)
    ])
    sid_df = sql_context.createDataFrame(sid_rdd, subreddit_id_schema)
    aid_sid_r_df = sid_df.join(aid_s_r_df, on='subreddit').drop('subreddit').cache()
    row_aid_sid_r_rdd = aid_sid_r_df.rdd
    aid_sid_r_rdd = row_aid_sid_r_rdd.map(lambda row: (row.author_id, row.subreddit_id, row.rating))

    return aid_rdd, sid_rdd, aid_sid_r_rdd
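Example #1 assumes the pyspark.sql.types imports (StructType, StructField, StringType, LongType) and SQLContext are already in scope, and that the caller passes an RDD of (author, subreddit, rating) tuples. A minimal, hypothetical driver sketch (the sample tuples and app name are made up):

from pyspark import SparkConf, SparkContext

conf = SparkConf().setAppName("hash_rating_demo").setMaster("local[2]")
sc = SparkContext(conf=conf)

# made-up sample data: (author, subreddit, rating) tuples
sample_rdd = sc.parallelize([
    ("alice", "python", 10),
    ("bob", "python", 3),
    ("alice", "spark", 7),
])

aid_rdd, sid_rdd, aid_sid_r_rdd = hash_rating(sample_rdd, sc)
print(aid_sid_r_rdd.collect())  # [(author_id, subreddit_id, rating), ...]
sc.stop()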
Example #2
def mat_fact_test(city, rank=5, iter_=20, lambda_ = .01):
    pitt_train = pd.read_csv('/mnt/sda5/Desktop/search/Fin_project/dataset/final/'+city+'_train.csv',encoding = 'iso8859_15')
    pitt_test = pd.read_csv('/mnt/sda5/Desktop/search/Fin_project/dataset/final/'+city+'_test.csv',encoding = 'iso8859_15')
    pitt_test = get_test(pitt_train,pitt_test)
    pitt_test = resample(pitt_test,n_samples=int(np.ceil(0.8 * pitt_test.shape[0])))
    pitt_test = pitt_test.reset_index()
    pitt_test = pitt_test.drop('index', axis = 1)
    le_user_id = preprocessing.LabelEncoder()
    le_user_id = le_user_id.fit(pitt_train.user_id)
    user_id_enc = le_user_id.transform(pitt_train.user_id)
    pitt_train['user_id_enc'] = user_id_enc
    pitt_test['user_id_enc'] = le_user_id.transform(pitt_test.user_id)
    le_business_id = preprocessing.LabelEncoder()
    le_business_id = le_business_id.fit(pitt_train.business_id)
    business_id_enc = le_business_id.transform(pitt_train.business_id)
    pitt_train['business_id_enc'] = business_id_enc
    pitt_test['business_id_enc'] = le_business_id.transform(pitt_test.business_id)
    sqlCtx = SQLContext(sc)
    pitt_train_sp = sqlCtx.createDataFrame(pitt_train[['user_id_enc','business_id_enc','stars_review']])
    pitt_train_sp = pitt_train_sp.withColumn("stars_review", pitt_train_sp["stars_review"].cast("double"))
    pitt_test_sp = sqlCtx.createDataFrame(pitt_test[['user_id_enc','business_id_enc','stars_review']])
    pitt_test_sp = pitt_test_sp.withColumn("stars_review", pitt_test_sp["stars_review"].cast("double"))
    model = ALS.train(pitt_train_sp, rank, seed=0, iterations=iter_,lambda_=lambda_)
    prediction=model.predictAll(pitt_train_sp.rdd.map(lambda line: (line[0],line[1]))).map(lambda d: ((d[0],d[1]),d[2]))
    true_and_pred = pitt_train_sp.rdd.map(lambda d: ((d[0],d[1]),d[2])).join(prediction).map(lambda r:(r[0],r[1][0], r[1][1]))
    true_and_pred = true_and_pred.map(lambda line: (line[0], line[1], 5 if line[2] >= 5 else line[2]))  # clip predictions at 5
    error = math.sqrt(true_and_pred.map(lambda r: (math.fabs(r[1]-r[2]))**1).mean())
    print('Training: ',error)
    prediction=model.predictAll(pitt_test_sp.rdd.map(lambda line: (line[0],line[1]))).map(lambda d: ((d[0],d[1]),d[2]))
    true_and_pred = pitt_test_sp.rdd.map(lambda d: ((d[0],d[1]),d[2])).join(prediction).map(lambda r:(r[0],r[1][0], r[1][1]))
    true_and_pred = true_and_pred.map(lambda line: (line[0], line[1], 5 if line[2] >= 5 else line[2]))  # clip predictions at 5
    error = math.sqrt(true_and_pred.map(lambda r: (math.fabs(r[1]-r[2]))**1).mean())
    print('Test: ',error)
Example #3
def pyspark():
    conf = SparkConf().setAppName("PySparkApp").setMaster("local")
    #conf = SparkConf()
    sc = SparkContext(conf=conf)

    #spark = SparkSession.builder.appName("WordCount").master("local").config(conf = conf).getOrCreate()
    sqlCtx = SQLContext(sc)

    df1 = get_features()
    sdf = sqlCtx.createDataFrame(df1)

    ops1 = "(price_from + price_to)/2"
    data = sdf.withColumn("MedianPrice", expr(ops1))

    tmp = data.withColumn('final_price',
                          coalesce(data['Price123'], data['MedianPrice']))

    finaldata = tmp.drop("price", "disFeature")

    state = {
        "VIC": "Victoria",
        "WA": "Western Australia",
        "ACT": "Australian Capital Territory",
        "NT": "Northern Territory",
        "NSW": "New South Wales",
        "TAS": "Tasmania"
    }

    stateDataP = pd.DataFrame(list(state.items()),
                              columns=["State", "StateName"])

    stateDataD = sqlCtx.createDataFrame(stateDataP)

    data1 = finaldata.join(stateDataD, on=['State'], how='inner')

    finaldataPD = data1.toPandas()
    #dataPD["StateName"].unique()

    sc.stop()

    finaldataPD['price_to'] = finaldataPD['price_to'].astype(str).astype(float)

    finaldataPD['Price123'] = finaldataPD['Price123'].astype(str).astype(float)

    finaldataPD['beds'] = finaldataPD['beds'].astype(str).astype(int)

    finaldataPD['baths'] = finaldataPD['baths'].astype(str).astype(int)

    finaldataPD['parking'] = finaldataPD['parking'].astype(str).astype(int)

    df123 = finaldataPD.copy()

    df123 = df123.replace({pd.np.nan: None})

    #print(df123)

    return df123
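Example #3 depends on a get_features() helper that is not shown. A hypothetical stub that returns a pandas frame with the columns the function touches (all values below are made up):

import pandas as pd

def get_features():
    # made-up rows covering the columns used above
    return pd.DataFrame([
        {"price_from": 400000.0, "price_to": 500000.0, "Price123": None, "price": "450k",
         "disFeature": "x", "State": "VIC", "beds": 3, "baths": 2, "parking": 1},
        {"price_from": 300000.0, "price_to": 350000.0, "Price123": 320000.0, "price": "320k",
         "disFeature": "y", "State": "NSW", "beds": 2, "baths": 1, "parking": 0},
    ])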
Example #4
    def run(self):
        startTime = time.time()
        conf = SparkConf() \
            .setAppName("Community_Detection_Based_on_GraphFrames") \
            .set("spark.executor.memory", "4g")\
            .set("spark.driver.host", "localhost")
        sc = SparkContext(conf=conf)

        inputData = sc.textFile(self.input_path)

        # Drop the header
        header = inputData.first()
        inputData = inputData.filter(lambda line: line != header)

        # read and split data into tuples
        Standard_RDD = inputData.map(self.readAndSplit)

        UserAndItems = Standard_RDD.groupByKey().map(lambda x:
                                                     (x[0], set(list(x[1]))))
        self.global_User_items = UserAndItems.collectAsMap()

        edge_RDD = UserAndItems.flatMap(
            self.generate_edges).filter(lambda x: len(x) > 0)

        vertex_RDD = edge_RDD.flatMap(lambda x: [x[0], x[1]]).distinct().map(
            lambda x: (x, ))

        sqlContext = SQLContext(sc)

        vertices = sqlContext.createDataFrame(vertex_RDD.collect(), [
            "id",
        ])

        edges = sqlContext.createDataFrame(edge_RDD.collect(), [
            "src",
            "dst",
        ])

        g = GraphFrame(vertices, edges)

        result = g.labelPropagation(maxIter=5)

        verticeRDD = sc.parallelize(result.collect())
        community_list = verticeRDD.map(lambda x: (str(x.label), x.id))\
                                   .groupByKey()\
                                   .map(lambda x: sorted(list(x[1])))\
                                   .sortBy(lambda x: (len(x), x[0]))\
                                   .collect()
        with open(self.output_path, 'w') as f:
            for line in community_list:
                for each in line[:-1]:
                    f.write(each + ', ')
                f.write(line[-1] + '\n')
        print("Finish time:", time.time() - startTime)
Example #5
def mat_fact_val(city, rank=5, iter_=20, lambda_=.01):
    toronto_train = pd.read_csv('D:/Study/TermProject/yelp-dataset/data/' +
                                city + '_train.csv',
                                encoding='iso8859_15')
    toronto_val = pd.read_csv('D:/Study/TermProject/yelp-dataset/data/' +
                              city + '_val.csv',
                              encoding='iso8859_15')
    toronto_val = get_test(toronto_train, toronto_val)
    le_user_id = preprocessing.LabelEncoder()
    le_user_id = le_user_id.fit(toronto_train.user_id)
    user_id_enc = le_user_id.transform(toronto_train.user_id)
    toronto_train['user_id_enc'] = user_id_enc
    toronto_val['user_id_enc'] = le_user_id.transform(toronto_val.user_id)
    le_business_id = preprocessing.LabelEncoder()
    le_business_id = le_business_id.fit(toronto_train.business_id)
    business_id_enc = le_business_id.transform(toronto_train.business_id)
    toronto_train['business_id_enc'] = business_id_enc
    toronto_val['business_id_enc'] = le_business_id.transform(
        toronto_val.business_id)
    sqlCtx = SQLContext(sc)
    toronto_train_sp = sqlCtx.createDataFrame(
        toronto_train[['user_id_enc', 'business_id_enc', 'stars_review']])
    toronto_train_sp = toronto_train_sp.withColumn(
        "stars_review", toronto_train_sp["stars_review"].cast("double"))
    toronto_val_sp = sqlCtx.createDataFrame(
        toronto_val[['user_id_enc', 'business_id_enc', 'stars_review']])
    toronto_val_sp = toronto_val_sp.withColumn(
        "stars_review", toronto_val_sp["stars_review"].cast("double"))
    model = ALS.train(toronto_train_sp,
                      rank,
                      seed=0,
                      iterations=iter_,
                      lambda_=lambda_)
    prediction = model.predictAll(
        toronto_train_sp.rdd.map(lambda line: (line[0], line[1]))).map(
            lambda d: ((d[0], d[1]), d[2]))
    true_and_pred = toronto_train_sp.rdd.map(lambda d: ((d[0], d[1]), d[
        2])).join(prediction).map(lambda r: (r[0], r[1][0], r[1][1]))
    true_and_pred = true_and_pred.map(
        lambda line: (line[0], line[1], 5 if line[2] >= 5 else line[2]))  # clip predictions at 5
    error = math.sqrt(
        true_and_pred.map(lambda r: (math.fabs(r[1] - r[2]))**1).mean())
    print('Training: ', error)
    prediction = model.predictAll(
        toronto_val_sp.rdd.map(lambda line: (line[0], line[1]))).map(
            lambda d: ((d[0], d[1]), d[2]))
    true_and_pred = toronto_val_sp.rdd.map(lambda d: ((d[0], d[1]), d[
        2])).join(prediction).map(lambda r: (r[0], r[1][0], r[1][1]))
    true_and_pred = true_and_pred.map(
        lambda line: (line[0], line[1], 5 if line[2] >= 5 else line[2]))  # clip predictions at 5
    error = math.sqrt(
        true_and_pred.map(lambda r: (math.fabs(r[1] - r[2]))**1).mean())
    print('Validation: ', error)
Example #6
def query2(sc, file_in_name, file_out_name):

    rdd_file_data = sc.textFile(file_in_name)

    data_header = rdd_file_data \
        .filter(lambda l: "datetime" in l)

    cites = weather.gen_city_keys(sc)

    header_position = run2.get_position(data_header)

    data = rdd_file_data \
        .subtract(data_header) \
        .flatMap(lambda line: generate_tuple(header_position, line, cites))

    sqlc = SQLContext(sc)

    df = sqlc.createDataFrame(data)
    #df.show()
    df.createOrReplaceTempView("dati")

    query1 = "SELECT country, year, month, " \
             "cast(min(value) as decimal (10,2)) as my_min, " \
             "cast(max(value) as decimal (10,2)) my_max, " \
             "cast(avg(value) as decimal (10,2)) as my_avg," \
             "cast(stddev(value) as decimal (10,2)) as my_std " \
             "FROM dati " \
             "GROUP BY country, year, month"
    df2 = sqlc.sql(query1).orderBy('dati.country', 'dati.year', 'dati.month')
    df2.show()
    '''
            Save data in HDFS
    '''
    df2.coalesce(1).write.format("json").save(file_out_name)
Example #7
def get_spark_runtime_validator(context, df):
    from pyspark import SparkContext, SQLContext

    sc = SparkContext.getOrCreate()
    sqlCtx = SQLContext(sc)
    sdf = sqlCtx.createDataFrame(df)
    batch_request = BatchRequest(
        datasource_name="my_spark_datasource",
        data_connector_name="my_data_connector",
        batch_data=sdf,
        data_asset_name="IN_MEMORY_DATA_ASSET",
        partition_request={
            "batch_identifiers": {
                "an_example_key": "a",
                "another_example_key": "b",
            }
        },
    )

    expectation_suite = context.create_expectation_suite(
        "my_suite", overwrite_existing=True)

    validator = context.get_validator(batch_request=batch_request,
                                      expectation_suite=expectation_suite)

    return validator
Example #8
def run():
    conf = SparkConf().setAppName("word count") \
      .setMaster("local[2]")

    context = SparkContext(conf=conf)
    sqlContext = SQLContext(context)
    context.setLogLevel('ERROR')

    data = [('Song', 25), ('Trump', 22), ('Yong', 20), ('Obama', 26)]
    rdd = context.parallelize(data,
                              2).map(lambda x: Row(name=x[0], age=int(x[1])))
    people = sqlContext.createDataFrame(rdd).cache()

    people.printSchema()
    old_guy = people.orderBy('age', ascending=False).take(1)
    print(old_guy)

    same_old_guy = [
        Row(name=x['name'], age=x['age'], other=1) for x in old_guy
    ]
    print(same_old_guy)

    total = people.groupBy().sum('age').collect()[0][0]
    print('Total age is {}'.format(total))

    people.createTempView('people_table')
    new_people = sqlContext.sql(
        'select name, age from people_table order by age desc limit 1')
    new_people.show()
Example #9
def test_udf(spark_context, spark_session):
    sql_sc = SQLContext(spark_context)
    spark_session.conf.set("spark.sql.execution.arrow.enabled", "true")

    df = sql_sc.createDataFrame([(1, "John Doe", 21)],
                                ("id", "name", "age"))
    # df.printSchema()
    #
    # df.show()

    slen = pandas_udf(lambda s: s.str.len(), IntegerType())
    # below is similar to @
    # upper = pandas_udf(to_upper, StringType())
    # addOne = pandas_udf(add_one, IntegerType(), PandasUDFType.SCALAR)
    # this works
    df.select("name").show()

    # this doesn't work, Caused by: java.io.EOFException
    # 	at java.io.DataInputStream.readInt(DataInputStream.java:392)
    # seems related to slen output int
    # df.select(slen("name").alias("slen(name)")).show()
    # TODO this hit same error
    # df.select(to_upper("name")).show()

    print(df.select(slen("name").alias("slen(name)"),
              to_upper("name"), add_one("age")).count())
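test_udf refers to to_upper and add_one helpers defined elsewhere in the module. A hypothetical sketch of what they could look like as scalar pandas UDFs, mirroring the commented-out lines above:

from pyspark.sql.functions import pandas_udf, PandasUDFType
from pyspark.sql.types import IntegerType, StringType

@pandas_udf(StringType(), PandasUDFType.SCALAR)
def to_upper(s):
    # s is a pandas.Series of strings
    return s.str.upper()

@pandas_udf(IntegerType(), PandasUDFType.SCALAR)
def add_one(x):
    # x is a pandas.Series of integers
    return x + 1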
Example #10
def main():
	conf = SparkConf().setAppName('ingest logs')
	sc = SparkContext(conf=conf)
	sqlContext = SQLContext(sc)

	inputs = sys.argv[1] 
	output = sys.argv[2]

	#Reading the input file, and then matching the pattern
	file_data = sc.textFile(inputs)
	linere = re.compile("^(\\S+) - - \\[(\\S+) [+-]\\d+\\] \"[A-Z]+ (\\S+) HTTP/\\d\\.\\d\" \\d+ (\\d+)$")
	
	#Mapping the data after fetching the required values out of the Nasa Web server logs file
	KeyValue = file_data.map(lambda line : linere.split(line)).filter(lambda x : len(x)==6).map(lambda y : (y[1],(dt.datetime.strptime(y[2], '%d/%b/%Y:%H:%M:%S')),y[3],y[-2])).cache()
	
	#Mapping the KeyValue RDD as the required format of 4 columns
	Nasa = KeyValue.map(lambda p: {"host": p[0], "datetime": p[1], "path": p[2], "bytes": long(p[3])})
	
	#Converting Nasa to DataFrame and then registering it as Table
	schemaNasa = sqlContext.createDataFrame(Nasa)
	schemaNasa.registerTempTable("NasaLogs")

	#Writing the data into a parquet file
	schemaNasa.write.format('parquet').save(output)
	
	#Reading the data from Parquet file and then Registering it in Table Format
	parquetdata = sqlContext.read.parquet(output)
	parquetdata.registerTempTable("parquetTable")

	#Firing SQL query to count the total number of bytes transferred using SUM(bytes)
	totalbytes = sqlContext.sql("""
    	SELECT SUM(bytes)
    	FROM parquetTable
	""")
	totalbytes.show()
Example #11
    def astype(cls, df, out_type, **kwargs):
        """
        @param::out_type: the type of the output dataframe, as a string
        return the converted dataframe or None if not feasible
        """
        # handle edge cases
        if not isinstance(df, DataFrame):
            raise Exception(
                '> PandasConverter astype(): input dataframe must be instance of pyspark dataframe class.'
            )
        if out_type is None:
            raise ValueError(
                '> PandasConverter astype(): dataframe out_type parameter can not be none.'
            )
        if not cls.is_capable('pandas', out_type):
            raise Exception(
                '> PandasConverter astype(): convert to type: %s not supported.'
                % (out_type))

        # get pyspark context
        sc = SparkContext.getOrCreate()
        sqlcontext = SQLContext(sc)

        # convert to target type
        if out_type.lower() == 'pyspark':  # explicitly intended
            try:
                return sqlcontext.createDataFrame(df)
            except Exception as e:
                print(
                    '> PandasConverter astype(): convert to pyspark dataframe failed: %s'
                    % (e))
        if out_type.lower() == 'pandas':  # explicitly intended
            return df
        return None
Example #12
    def readFromCsv(self, spark):
        print("Reading from CSV")

        sqlContext = SQLContext(spark.sparkContext)

        schema = StructType([])
        df = sqlContext.createDataFrame(spark.sparkContext.emptyRDD(), schema)
        print("First SparkContext:")
        print("APP Name: {}".format(spark.sparkContext.appName))
        print("Master :" + spark.sparkContext.master)
        messageLogger = ml.MessageLogger(const.getProjectName(__file__),
                                         "Reading from file.....")
        try:
            messageLogger.logInfo("Reading from CSV file.")
            df = spark.read.csv(const.csv_file_project_1,
                                inferSchema=True,
                                header=True)
            messageLogger.logInfo("File reading finished successfully.")
        except Exception as e:
            messageLogger.logError(
                "unable to read the file, exception occurred: " +
                str(e.__class__) + "occurred.")
        if df.count() > 0:
            messageLogger.logInfo("Number of records in file: " +
                                  str(df.count()))

        # Display Data Frame Results
        #         processedData = pf.ProcessedData()
        #         processedData.processOutput("hellodcddd")
        # df.select('*').show()  # 100, False)

        # Data Frame Filter Statements
        # df.filter(df['eq_site_limit'] == 0).select('*').show()
        df.filter((df['eq_site_limit'] == 0)
                  & (df['hu_site_limit'] > 20000)).select('*').show()
Example #13
def Transfer_to_DB(spark, df):
    #Create PySpark DataFrame Schema
    r_schema = StructType([StructField('id',IntegerType(),True)\
                        ,StructField('h1',DoubleType(),True)\
                        ,StructField('h2',DoubleType(),True)\
                        ,StructField('h3',DoubleType(),True)\
                        ,StructField('h4',DoubleType(),True)\
                        ,StructField('h5',DoubleType(),True)\
                        ,StructField('h6',DoubleType(),True)\
                        ,StructField('h7',DoubleType(),True)\
                        ,StructField('h8',DoubleType(),True)\
                        ,StructField('h9',DoubleType(),True)\
                        ,StructField('h10',DoubleType(),True)\
                        ,StructField('h11',DoubleType(),True)\
                        ,StructField('h12',DoubleType(),True)\
                        ,StructField('services',StringType(),True)])

    sqlContext = SQLContext(spark)
    #Create Spark DataFrame from Pandas
    df_record = sqlContext.createDataFrame(df, r_schema)
    #Important to order columns in the same order as the target database
    df_record  = df_record.select("id","h1","h2","h3","h4","h5","h6",\
                                  "h7","h8","h9","h10","h11","h12","services")
    df_record.show()
    properties, url = DB_connection()
    df_record.write.jdbc(url=url,
                         table='patient_records',
                         mode='append',
                         properties=properties)
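Transfer_to_DB calls a DB_connection() helper that is not shown. A hypothetical sketch of what it might return for a PostgreSQL target (host, database, and credentials are placeholders):

def DB_connection():
    # placeholder credentials; df.write.jdbc expects a JDBC url plus a properties dict
    properties = {
        "user": "spark_user",
        "password": "change_me",
        "driver": "org.postgresql.Driver",
    }
    url = "jdbc:postgresql://localhost:5432/patients_db"
    return properties, url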
Example #14
def evaluate(sc, models, test):
    sqlc = SQLContext(sc)
    results_schema = StructType([
        StructField("classifier", StringType()),
        StructField("auc", FloatType())
    ])
    results = sqlc.createDataFrame(sc.emptyRDD(), schema=results_schema)
    for classifier, model in models.items():
        bce = BinaryClassificationEvaluator(labelCol="class")
        auc = bce.evaluate(model.transform(test))

        evaluation = sc.parallelize([(classifier, auc)])
        evaluation = sqlc.createDataFrame(evaluation, schema=results_schema)

        results = results.union(evaluation)

    results.coalesce(1).write.csv("test-metrics", header=True)
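evaluate() expects models to map a classifier name to an already-fitted pyspark.ml model whose transform() output carries the "class" label column. A minimal, hypothetical way to build such a dict (the train and test DataFrames and their column names are assumptions):

from pyspark.ml.classification import LogisticRegression, RandomForestClassifier

# train/test are assumed DataFrames with a "features" vector column and a "class" label
models = {
    "logistic_regression": LogisticRegression(featuresCol="features", labelCol="class").fit(train),
    "random_forest": RandomForestClassifier(featuresCol="features", labelCol="class").fit(train),
}
evaluate(sc, models, test)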
Example #15
def group_by(spark_context, spark_session):
    sql_sc = SQLContext(spark_context)
    spark_session.conf.set("spark.sql.execution.arrow.enabled", "true")

    df = sql_sc.createDataFrame([(1, 1.0), (1, 2.0), (2, 3.0), (2, 5.0), (2, 10.0)],
                                ("id", "v"))
    # TODO count return Py4JJavaError: An error occurred while calling o71.count.
    df.groupby("id").apply(normalize).count()
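group_by applies a normalize function that is not shown. A hypothetical grouped-map pandas UDF that fits the call above (the output schema matches the example's two columns):

from pyspark.sql.functions import pandas_udf, PandasUDFType

@pandas_udf("id long, v double", PandasUDFType.GROUPED_MAP)
def normalize(pdf):
    # pdf is the pandas DataFrame for one id group; center and scale the v column
    v = pdf.v
    return pdf.assign(v=(v - v.mean()) / v.std())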
Example #16
    def main(self):
        stop_words = []
        # prod
        dataframe = self.read_dataframe(self.path, self.days_list).persist()

        # read approved user list
        # df = self.spark.read.csv(
        #     "hdfs:///ssymmetry_db/raw_db/sina_user_tag/sina_user_tag_item/weibo_uid_with_user_tag.csv")\
        #     .select("uid", "user_tag")

        # local test
        # dataframe = self.spark.read.json("sina_weibo_fans_data_2017-11-09-10-18.json")

        blog_rdd = self.read_blog_data(dataframe).fillna(" ").rdd

        def preprocess_data(x):
            uid = x["uid"]
            blog_content = x["blog_content"]
            forward_content = x["forward_content"]
            if forward_content.rfind(u"*****") > 0:
                forward_content = forward_content.split(u"*****")[1]
            return (uid, blog_content + forward_content)

        data = blog_rdd.map(preprocess_data).reduceByKey(
            lambda x, y: x + y).map(
                lambda x: [" ".join(jieba.cut(x[1])).split(" ")])

        sql_context = SQLContext(sparkContext=self.spark.sparkContext)
        word_df = sql_context.createDataFrame(data, ["values"])

        w2vec = Word2Vec(vectorSize=128, inputCol="values")
        model = w2vec.fit(word_df)

        def creat_dictionary(model):
            w_df = model.getVectors()
            w_df.show()
            data = w_df.rdd.collect()
            w2index = {}
            w2vec = {}
            i = 1
            for row in data:
                word = row.word
                vector = row.vector
                w2index[word] = i
                w2vec[word] = vector
                i += 1
            return w2index, w2vec

        # write the word2vec word vectors out to a pickle file
        index_dict, word_vectors = creat_dictionary(model)
        # out = open("w2vec.pkl", "wb")
        out = open("/udisk2/hxk/w2vec/w2vec.pkl", "wb")
        pickle.dump(index_dict, out)  # index dictionary
        pickle.dump(word_vectors, out)  # word-vector dictionary
        out.close()

        # test
        model.findSynonyms("你", 3).show()
Example #17
 def _get_train_data(self):
     sql_context = SQLContext(self.sc)
     l = [
         (1, Vectors.dense([1, 2, 3]), 1.0),
         (2, Vectors.dense([1, 2, 3]), 0.0),
         (3, Vectors.dense([1, 2, 3]), 1.0),
         (4, Vectors.dense([1, 2, 3]), 0.0),
     ]
     return sql_context.createDataFrame(l, ['id', 'features', 'label'])
Example #18
def process_json(filename, sparkcontext):

    sqlContext = SQLContext(sparkcontext)
    df = sqlContext.read.json(filename).select("title")
    output_list = get_counts(df)
    columns = ["token", "count"]
    output_df = sqlContext.createDataFrame(output_list, columns)
    output_df.write.mode('overwrite').parquet(
        filename.replace(".json", ".parquet"))
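process_json relies on a get_counts helper that is not shown. A hypothetical version that tokenizes the title column and returns (token, count) pairs as a plain Python list, which createDataFrame accepts:

def get_counts(df):
    # df has a single "title" string column; returns [(token, count), ...]
    counts = (df.rdd
                .flatMap(lambda row: (row.title or "").lower().split())
                .map(lambda token: (token, 1))
                .reduceByKey(lambda a, b: a + b))
    return counts.collect()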
Example #19
 def _get_data(self):
     sql_context = SQLContext(self.sc)
     l = [
         (
             "I dont know why people think this is such a bad movie.",
             Vectors.sparse(3, {0: 1.0, 1: 1.0, 2: 1.0}),
         ),
     ]
     return sql_context.createDataFrame(l, ['text', 'features'])
Example #20
def main():
    sc = SparkContext()
    sqlctx = SQLContext(sc)

    lines = sc.textFile("alerts.csv").map(lambda l: l.split(","))
    alerts_rdd = lines.map(lambda l: Row(ts=l[1], name=l[0]))
    df = sqlctx.createDataFrame(alerts_rdd)

    aa = AssociationAlgorithm(df, 'ts', 'name', sqlctx)
    aa.execute_algorithm()
Example #21
def main():
    sc = SparkContext(appName='TextSimillarity')
    sqlcont = SQLContext(sc)
    rdd = sc.textFile("test.csv")
    header = rdd.first()
    newrdd = rdd.filter(lambda x: x != header)\
                .map(lambda x: x.split(','))\
                .map(lambda x: Row(description_x=x[1], description_y=x[2]))
    new_df = sqlcont.createDataFrame(newrdd)
    calculate_simillarity(new_df)
Example #22
def main(stock_list, seq_len, result_table):
    os.environ[
        'PYSPARK_PYTHON'] = '/Users/lex/miniconda2/envs/pysparkenv2/bin/python'
    os.environ[
        'PYSPARK_DRIVER_PYTHON'] = '/Users/lex/miniconda2/envs/pysparkenv2/bin/python'

    fields = [
        StructField('open', FloatType(), True),
        StructField('high', FloatType(), True),
        StructField('low', FloatType(), True),
        StructField('close', FloatType(), True),
        StructField('volume', FloatType(), True),
        StructField('date', StringType(), True),
        StructField('ticker', StringType(), True),
    ]
    schema = StructType(fields)
    stock_list = stock_list.split(',')

    stock_data = pd.DataFrame()
    print('Predicting %s stocks' % len(stock_list))
    for x in stock_list:
        df = web.DataReader(x, 'morningstar', pd.datetime(2013, 4, 13),
                            pd.datetime(2018, 4, 13))
        stock_data = stock_data.append(df)
        print('Have the data')

    len_comb = len(stock_list)

    seq_len = int(seq_len)

    lstm = LstmStockTrainer(stock_data, seq_len)

    keys = stock_list

    conf = SparkConf().setAppName('portfolio_chooser')
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)
    keys = sc.parallelize(keys)
    task_rdd = keys.map(lambda stock: lstm.predict_stocks(stock)) \
                .repartition(len_comb)
    result_rdd = task_rdd \
        .flatMap(lambda r: r.values) \
        .map(lambda r: tuple(r))

    result_df = sqlContext.createDataFrame(result_rdd, schema)
    # replacing all pandas NaNs to null
    cols = [
        func.when(~func.col(x).isin("NaN"), func.col(x)).alias(x)
        for x in result_df.columns
    ]
    result_df = result_df.select(*cols)
    result_df.show(5)
    util.write_small_df(result_df, result_table)
    return result_df
Example #23
def check_fit_params(sc, models):
    sqlc = SQLContext(sc)
    results_schema = StructType([
        StructField("classifier", StringType()),
        StructField("params", StringType()),
        StructField("auc", FloatType())
    ])
    results = sqlc.createDataFrame(sc.emptyRDD(), schema=results_schema)
    for classifier, model in models.items():
        for i, combination in enumerate(model.getEstimatorParamMaps()):
            params = ["%s: %s" % (p.name, str(v))
                      for p, v in combination.items()]

            param_results = sc.parallelize(
                [(classifier, "-".join(params), model.avgMetrics[i])])
            param_results = sqlc.createDataFrame(
                param_results, schema=results_schema)

            results = results.union(param_results)

    results.coalesce(1).write.csv("fit-metrics", header=True)
Example #24
def main():

    sc = SparkContext("local", "Query 1")

    rawWeather, weatherHeader, cities = run.getRDDFromCSV(
        sc, Constants.WEATHER_DESCRIPTION_FILE)

    weatherDescription = rawWeather \
        .subtract(weatherHeader) \
        .filter(lambda l: re.search('^\d{4}-03|^\d{4}-04|^\d{4}-05', l))  # month filter

    daysOfMonth = weatherDescription \
        .flatMap(lambda line: generateTuple(line, cities))

    sqlc = SQLContext(sc)

    df = sqlc.createDataFrame(daysOfMonth)
    df.show()
    df.createOrReplaceTempView("dati")
    query1 = "SELECT city, year, month, day, sum(sunny) as n_sunny_h FROM dati GROUP BY city, year, month, day"
    df2 = sqlc.sql(query1)
    df2.show()

    # apply the sunny-day rule (75%)
    df2.createOrReplaceTempView("dati")
    query2 = "SELECT city, year, month, day FROM dati where n_sunny_h >13"
    df3 = sqlc.sql(query2)
    df3.show()

    # at least 15 clear days per month
    df3.createOrReplaceTempView("dati")
    query2 = "SELECT city, year, month, count(*) AS n_day FROM dati GROUP BY year,city, month"
    df4 = sqlc.sql(query2)
    df4.show()

    # filter by number of days
    df4.createOrReplaceTempView("dati")
    query2 = "SELECT city, year, month, n_day FROM dati where n_day>=15"
    df5 = sqlc.sql(query2)
    df5.show()

    # filter: number of months = 3
    df5.createOrReplaceTempView("dati")
    query2 = "SELECT city, year, count(*) AS n_month FROM dati GROUP BY city, year "
    df6 = sqlc.sql(query2)
    df6.show()

    df6.createOrReplaceTempView("dati")
    query2 = "SELECT city, year FROM dati WHERE n_month = 3 "
    df7 = sqlc.sql(query2)
    df7.show()
Example #25
def func2(rdd):
    '''
        Building on what we covered earlier, a quick review:
        create a DataFrame from the RDD,
        register the DataFrame as a table,
        run SQL against it,
        and save the returned DataFrame in various ways.
    '''
    sqlContext = SQLContext(rdd.context)
    newrdd = rdd.map(lambda line: [line])
    df = sqlContext.createDataFrame(newrdd, StructType([StructField(name="content", dataType=StringType())]))
    df.createOrReplaceTempView("data")
    sqlContext.sql("select content from data").show()
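func2 takes a plain RDD and derives the SQLContext from rdd.context, the usual pattern inside a streaming foreachRDD callback. A minimal, hypothetical batch-mode call (app name and sample lines are made up):

from pyspark import SparkContext

sc = SparkContext("local[2]", "func2_demo")
lines = sc.parallelize(["hello", "world"])  # made-up content lines
func2(lines)  # registers the "data" view and shows its rows
sc.stop()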
Example #26
        def prediction_wrapper(net):
            def prediction_map_func(row):
                cols_map = {}
                for col in column_names:
                    cols_map[col] = row[col]
                bmu, bmu_idx = find_bmu(row['features'], net)
                cols_map["bmu"] = Vectors.dense(bmu[0])
                cols_map["bmu_idx"] = Vectors.dense(bmu_idx)
                return Row(**cols_map)

            rdd_prediction = df.rdd.map(lambda row: prediction_map_func(row))
            # getting existing sparkContext
            sc = SparkContext.getOrCreate()
            sqlContext = SQLContext(sc)
            return sqlContext.createDataFrame(rdd_prediction)
Example #27
def naive_bayes_classify(comment_preprocessed):
    sc = SparkContext(appName="Classification")
    sql_context = SQLContext(sc)
    data = sql_context.createDataFrame(comment_preprocessed)

    train, test = data.randomSplit([0.7, 0.3], 1234)
    nb = NaiveBayes(smoothing=1.0, modelType="multinomial")
    model = nb.fit(train)

    predictions = model.transform(test)
    evaluate_classification(predictions)

    time.sleep(1)
    # predict_comment(sql_context, model)
    compare_classification_with_tool(sql_context, model)
Example #28
def get_rdd_from_df(df):
    """
    takes a pandas df and returns a spark RDD
    """
    from pyspark import SparkContext, SQLContext
    from pyspark.mllib.linalg import Vectors
    sc = SparkContext.getOrCreate()
    from warnings import warn
    warn("get_rdd_from_df creates a spark context, it is recommended"
         " that you use SparkContext.getOrCreate() to prevent multiple context"
         " creation")
    sqlContext = SQLContext(sc)
    spark_df = sqlContext.createDataFrame(df)
    rdd = spark_df.rdd.map(
        lambda data: Vectors.dense([float(x) for x in data]))
    return rdd
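A short, hypothetical usage of get_rdd_from_df with a small all-numeric pandas frame (column names and values are made up):

import pandas as pd

# every column must be castable to float, since each row becomes a dense vector
pdf = pd.DataFrame({"x": [1.0, 2.0, 3.0], "y": [4.0, 5.0, 6.0]})
rdd = get_rdd_from_df(pdf)
print(rdd.take(2))  # e.g. [DenseVector([1.0, 4.0]), DenseVector([2.0, 5.0])]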
Example #29
def main(account_name, account_key):
    sc = SparkContext()
    sqlContext = SQLContext(sc)

    patient_records_container = 'patientrecords'
    glucose_levels_container = 'glucoselevelsaggs'
    preds_container = 'predictions'

    blob_service = BlobService(account_name=account_name, account_key=account_key)
    blob_service.create_container(preds_container)
    
    day_to_predict = get_most_recent_date(blob_service, glucose_levels_container)
    df = get_df_from_blob(blob_service, glucose_levels_container, patient_records_container, day_to_predict)
    
    project_path = 'wasb://model@{}.blob.core.windows.net/{}'
    si_pipe_model = PipelineModel.read().load(path=project_path.format(account_name, 'si_pipe_model'))
    oh_pipe_model = PipelineModel.read().load(path=project_path.format(account_name, 'oh_pipe_model'))
    model = RandomForestClassificationModel.read().load(path=project_path.format(account_name, 'model'))
    
    df_spark = sqlContext.createDataFrame(df)
    df_preds = si_pipe_model.transform(df_spark)
    df_preds = oh_pipe_model.transform(df_preds)
    
    num_var_names = ['time_in_hospital', 'num_lab_procedures', 'num_procedures', 'num_medications', 'number_outpatient',
                     'number_emergency', 'number_inpatient', 'diag_1', 'diag_2', 'diag_3', 'number_diagnoses', 'glucose_min',
                     'glucose_max', 'glucose_mean', 'glucose_var']
    cat_var_names = ['race', 'gender', 'age', 'weight', 'admission_type_id', 'discharge_disposition_id',
                     'admission_source_id', 'payer_code', 'medical_specialty', 'max_glu_serum', 'A1Cresult', 'metformin',
                     'repaglinide', 'nateglinide', 'chlorpropamide', 'glimepiride', 'acetohexamide', 'glipizide', 'glyburide',
                     'tolbutamide', 'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone', 'tolazamide',
                     'insulin', 'glyburide-metformin', 'glipizide-metformin', 'glimepiride-pioglitazone',
                     'metformin-rosiglitazone', 'metformin-pioglitazone', 'change', 'diabetesMed', 'diag_1_missing',
                     'diag_2_missing', 'diag_3_missing', 'race_missing', 'weight_missing', 'payer_code_missing',
                     'medical_specialty_missing']
    va = VectorAssembler(inputCols=(num_var_names + [c + "__encoded__" for c in cat_var_names]), outputCol='features')
    df_preds = va.transform(df_preds).select('features')
    
    df_preds = model.transform(df_preds)
    df_preds_pandas = df_preds.toPandas()
    df_preds_pandas = pd.concat([df[['patient_nbr', 'discharge_date']],
                                 df_preds_pandas['probability'].map(lambda x: x[1])], axis=1)
    
    # Save the predictions
    blob_service.put_block_blob_from_text(blob_name='-'.join(str(day_to_predict).split('/')) + '.csv',
                                          container_name=preds_container,
                                          text=df_preds_pandas.to_csv(index=False))
    return
Example #30
def simple_test_dataframe(sc: SparkContext):
    py_data = read_data("data/energy_agg_test.json")
    rdd = sc.parallelize(py_data)
    sqlContext = SQLContext(sc)

    schema = StructType([
        StructField('energy', DoubleType(), nullable=False),
        StructField('energy', DoubleType(), nullable=False),
        StructField('energy', DoubleType(), nullable=False),
        StructField('energy', DoubleType(), nullable=False),
        StructField('energy', DoubleType(), nullable=False),
        StructField('energy', DoubleType(), nullable=False),
        StructField('energy', DoubleType(), nullable=False)
    ])

    df = sqlContext.createDataFrame(py_data)
    print_type_value(df)
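simple_test_dataframe relies on read_data and print_type_value helpers that are not shown. Hypothetical minimal versions, assuming the input file holds one JSON object per line:

import json

def read_data(path):
    # assumption: one JSON object per line, e.g. {"energy": 1.5, ...}
    with open(path) as f:
        return [json.loads(line) for line in f if line.strip()]

def print_type_value(df):
    # dump the schema and rows of the resulting Spark DataFrame
    df.printSchema()
    df.show(truncate=False)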
Example #31
def get_result(function, param=None):
    pandas_dataframe = get_requireddataframe_fromcsv(
        'Latest_women_shoes.csv', ['id', 'brand', 'colors', 'dateAdded'])
    conf = SparkConf().setAppName('Women Catalog')
    sc = SparkContext(conf=conf)
    # df2 = sqlContext.read.format('com.databricks.spark.csv').options(header='true', inferschema='true').load('sample.csv')
    # used a pandas dataframe because the file could not be located with the approach above.
    sqlContext = SQLContext(sc)
    spark_dataframe = sqlContext.createDataFrame(pandas_dataframe)
    #data=spark_dataframe.select("*").toPandas()

    result_spark_dataframe = getattr(sys.modules[__name__],
                                     function)(spark_dataframe, param)

    result_python_dataframe = result_spark_dataframe.toPandas()
    result_dict = result_python_dataframe.to_dict('records')
    sc.stop()
    return result_dict
Example #32
def multilayer_perceptron_classify(comment_preprocessed):
    sc = SparkContext(appName="Classification")
    sql_context = SQLContext(sc)
    data = sql_context.createDataFrame(comment_preprocessed)

    train, test = data.randomSplit([0.7, 0.3], 1234)
    layers = [len(comment_preprocessed[0].features), 11, 2]
    # sqrt(2000) = 45, sqrt(4000) = 63, log(2000, 2) = 11
    trainer = MultilayerPerceptronClassifier(maxIter=100,
                                             layers=layers,
                                             blockSize=128,
                                             seed=1234)
    model = trainer.fit(train)
    predictions = model.transform(test)
    evaluate_classification(predictions)

    time.sleep(1)
    # predict_comment(sql_context, model)
    compare_classification_with_tool(sql_context, model)
Example #33
def sample_function(sc: SparkContext):
    schema = StructType([StructField("odd_numbers", IntegerType(), True)])

    print(" Odds number sample")
    big_list = range(10)
    rdd = sc.parallelize(big_list, 2)
    odds = rdd.filter(lambda x: x % 2 != 0)
    odds.foreach(my_print)
    sql_context = SQLContext(sc)
    odd_numbers = sql_context.createDataFrame(odds.map(lambda _: Row(_)),
                                              schema)
    odd_numbers.printSchema()
    odd_numbers.show(truncate=False)
    print("odd_numbers count:" + str(odd_numbers.count()))

    odd_numbers.createOrReplaceTempView("odd_numbers_table")
    sql_context.sql("select * from odd_numbers_table limit 2;").show()

    return (odd_numbers)
Example #34
    fields = [
        StructField('logintype', StringType(), True),
        StructField('logtype', StringType(), True),
        StructField('hosid', StringType(), True),
        StructField('suppid', StringType(), True),
        StructField('logtime', LongType(), True),
        StructField('usermac', StringType(), True)
    ]
    schema = StructType(fields)

    rdd1 = rdd.map(convert_logtype).filter(lambda tup: tup != None)
    # rdd1.foreach(printx)
    # sc.stop()

    ret_df = sqlContext.createDataFrame(rdd1, schema)
    ret_df.registerTempTable("loginflowlog_overall")
    _sql = "SELECT count(usermac) pv,count(distinct usermac) uv,logtype " \
           "from loginflowlog_overall " \
           "group by logtype"
    rs_df = sqlContext.sql(_sql)

    service = LoginflowlogMysqlService()
    ret_overall_list = service.getRetOverall(rs_df.collect(), day)
    _sql_delete = "delete from login_flow_global_count where date ='%s'" % day
    _sql_insert = "insert into login_flow_global_count(date," \
                  "prelogin_num,prelogin_pnum,login_num,login_pnum," \
                  "login_click_num,login_click_pnum,forward_num,forward_pnum," \
                  "preArrive_num,preArrive_pnum,arrive_num,arrive_pnum) " \
                  "values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
    service.write_mysql(ret_overall_list, _sql_delete, _sql_insert)
Example #35
from pyspark.sql import SQLContext,Row
#from pyspark.sql import Functions as F

dataDir = "/home/rsk/Documents/Spark"

userData = sc.textFile(dataDir+"/ml-100k/u.user").map(lambda x : x.split("|"))
movieData = sc.textFile(dataDir+"/ml-100k/u.item").map(lambda x : x.split("|"))
ratingData = sc.textFile(dataDir+"/ml-100k/u.data").map(lambda x : x.split("\t"))

#%%

ratingDataDF = ratingData.map(lambda x : Row(userID = int(x[0]),
                        movieID = int(x[1]),
                        rating=float(x[2]),
                        timestamp = int(x[3])))
ratingDataDF = sqlContext.createDataFrame(ratingDataDF)

userDataDF = userData.map(lambda x : Row(userID=int(x[0]),
                                        age = int(x[1]),
                                        gender = x[2],
                                        occupation = x[3],
                                        zipcode = x[4]))
userDataDF = sqlContext.createDataFrame(userDataDF)

movieDataDF = movieData.map(lambda x : Row(movieID = int(x[0]),
                                            movieTitle = x[1],
                                            releaseDate = x[2],
                                            videoReleaseDate = x[3],
                                            IMDBurl = x[4],
                                            unknown= int(x[5]),
                                            action = int(x[6]),
Example #36
# setup
#

import numpy as np

# create random data
n = 52
prices = [float(list(5 + abs(np.random.randn(1)) * 100)[0]) 
	for i in range(n)]
dates = [datetime(year=np.random.randint(2000, 2016), 
	month=np.random.randint(1, 12), 
	day=np.random.randint(1, 28)).date() for i in range(n)]
groups = [np.random.randint(1, 100) for i in range(n)]
data = [{"price": price, "date": _date, "group": group} 
	for price, _date, group in zip(prices, dates, groups)]
df = sqlContext.createDataFrame(data)

print('df initial')
df.show()

# convert to rdd of dicts
rdd = df.rdd
rdd = rdd.map(lambda x: x.asDict())

#
# get deciles
#

total_num_rows = rdd.count()
column_to_decile = 'price'
Example #37
    #(u'2015-48_6C25B958F2CC_175', u'2015120120')
    #rdd1.foreach(my_print)
    #(u'2015-50_7014A62FA5B0_0', [u'22',u'23'])
    rdd1_2 = rdd1_1.groupByKey().mapValues(list).sortByKey().map(times_count_first)
    #(u'2015-48_903C920CAE97_655', [u'15_1'])
    #rdd1_2.foreach(my_print)

    rdd2_1 = df.rdd.map(convert_kv_last)
    rdd2_2 = rdd2_1.groupByKey().mapValues(list).sortByKey().map(times_count_last)

    rdd3 = rdd1_2.join(rdd2_2).map(convert_rets).values().flatMap(list)
    #(u'2015', u'48', u'A09347EC9FBB', u'189', u'13', u'1', u'14', u'1')
    rdd3.foreach(my_print)
    logger.info(rdd3.count())
    fields = [
        StructField('year', StringType(), True),
        StructField('week', StringType(), True),
        StructField('mac', StringType(), True),
        StructField('hosid', StringType(), True),
        StructField('firstTime', StringType(), True),
        StructField('firstCount', LongType(), True),
        StructField('lastTime', StringType(), True),
        StructField('lastCount', LongType(), True)
    ]
    schema = StructType(fields)

    df1 =  sqlContext.createDataFrame(rdd3,schema)
    df1.coalesce(2).write.parquet(output,'overwrite')


    sc.stop()
Example #38
def main():
    conf = SparkConf().setAppName("climate")
    sc = SparkContext(conf=conf)

    sqlContext = SQLContext(sc)
    climateSchema = StructType(
        [
            StructField("station", StringType(), False),
            StructField("date", IntegerType(), False),
            StructField("element", StringType(), False),
            StructField("value", IntegerType(), True),
            StructField("mflag", StringType(), True),
            StructField("qflag", StringType(), True),
            StructField("sflag", StringType(), True),
            StructField("obstime", StringType(), True),
        ]
    )
    info = sqlContext.read.format("com.databricks.spark.csv").options(header="false").schema(climateSchema).load(inputs)
    info.registerTempTable("info")
    stationinfo = sqlContext.sql("SELECT station, date, element, value, FLOOR(date/10000) as yy FROM info ")
    stationinfo.registerTempTable("stationinfo")
    stationinfo.cache()

    prcpTable = sqlContext.sql("SELECT station, date, value as prcp, yy FROM stationinfo WHERE element='PRCP' ")
    prcpTable.registerTempTable("prcpTable")
    prcpTable.cache()
    # prcpTable.show()

    # create 3 tables that hold the monthly average of min, max temperature and prcp
    yearlyprcp = sqlContext.sql(
        "SELECT station, yy, ROUND(Avg(prcp),0) as avg_prcp FROM prcpTable GROUP BY station, yy "
    )
    yearlyprcp.registerTempTable("prcpMean")
    # yearlyprcp.show()

    # get information about stations from stations.txt

    def getdata(line):
        line = line.split("  ")
        values = [x.strip() for x in line]
        return values

    stations = sc.textFile(input2)
    stations = stations.map(getdata)
    stations = stations.map(lambda (a, b, c): Row(station=a, latitude=float(b), longitude=float(c))).cache()
    stationDF = sqlContext.createDataFrame(stations)
    stationDF.registerTempTable("StationTable")
    stationDF.cache()

    # param = sqlContext.sql("SELECT MAX(latitude) as max_lat, Min(latitude) as min_lat, MAX(longitude) as max_long, Min(longitude) as min_long FROM StationTable")
    # param.show()

    # Join to station file to add latitude and longitude and stationID
    result = (
        stationDF.join(yearlyprcp)
        .where(stationDF.station == yearlyprcp.station)
        .select(yearlyprcp.avg_prcp, yearlyprcp.station, yearlyprcp.yy, stationDF.latitude, stationDF.longitude)
    )

    # save into parquet file
    result.write.format("parquet").save(output)
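The climate job above reads its paths from module-level inputs, input2 and output variables that are not shown. A hypothetical way to wire them from the command line:

import sys

# hypothetical wiring of the module-level paths used by main() above
inputs = sys.argv[1]   # climate observations CSV read via spark-csv
input2 = sys.argv[2]   # stations file with "station  latitude  longitude" rows
output = sys.argv[3]   # destination directory for the parquet result

if __name__ == "__main__":
    main()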
Example #39
if __name__ == "__main__":
    file_path = os.path.abspath("../doc/book.txt")
    print file_path

    conf = SparkConf().setAppName("schema_test").setMaster("local")
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)

    lines = sc.textFile(file_path)
    # split each line
    parts = lines.map(lambda lines: lines.split(","))

    # map the fields onto the table structure (define the schema)
    book = parts.map(lambda book: Row(name=book[0], author=book[1], price=float(book[2]), publish=book[3]))

    # convert to a schema DataFrame and register it as a table
    schemaPeople = sqlContext.createDataFrame(book)
    schemaPeople.registerTempTable("book")

    # define the SQL statement (books priced between 50 and 60)
    book = sqlContext.sql("SELECT * FROM book WHERE price > 50.0 AND price < 60 OR name LIKE '%Spark%'")

    # map over the query results
    bookMap = book.map(lambda books: (books.name, books.author, books.price, books.publish))

    for book in bookMap.collect():
        print "|Name: " + book[0], "|Author: " + book[1], "|Price: " + str(book[2]), "|Publish: " + book[3] + "|"

    sc.stop()
Example #40
    download_flow(*) upload_flow(*) os browser ratio
    batch_no user_type supp_id
    '''
    user_login = parts.map(lambda p: (p[1].strip(), p[2].strip(),p[17].strip(),p[3].strip(),p[16].strip(),
                                  p[4].strip(),p[5].strip(),p[6].strip(),p[7].strip(),p[8].strip(),
                                  p[9].strip(),p[10].strip(),p[11].strip(),p[12].strip(),p[13].strip(),
                                  p[14].strip(),p[15].strip()))
    schema_string = "id gw_id supp_id user_id user_type " \
                   "user_name login_time logout_time mac ip " \
                   "user_agent download_flow upload_flow os browser " \
                   "ratio batch_no"

    fields = [StructField(field_name, StringType(), True) for field_name in schema_string.split(' ')]
    schema = StructType(fields)

    df = sql_context.createDataFrame(user_login, schema)
    df.registerTempTable("tb_user_login_info")

    #_sql="select distinct mac,gw_id,'%s' as day,'1' as flag from tb_user_login_info" % date
    _sql="select distinct user_name,gw_id,'%s' as day,'1' as flag from tb_user_login_info" % date
    rs = sql_context.sql(_sql)
    re_rdd = rs.map(lambda r: (r.user_name + "_" + r.gw_id, r.day + sep + r.flag))\
        .reduceByKey(lambda v1, v2: v1, 1)

    list =[]
    for t in re_rdd.collect():
        line = t[0]+sep+t[1]
        list.append(line)

    BaseService()._write_file(list, output)
Example #41
# -*- coding: utf-8 -*-
__author__ = 'wxmimperio'

from pyspark import SparkContext, SparkConf
from pyspark import SQLContext, Row
import os

if __name__ == "__main__":
    file_path = os.path.abspath("../doc")

    conf = SparkConf().setMaster("local[2]").setAppName("schema_merging")
    sc = SparkContext(conf=conf)

    sqlContext = SQLContext(sc)

    # create the DataFrames
    df1 = sqlContext.createDataFrame(sc.parallelize(range(1, 6)).map(lambda i: Row(single=i, double=i * 2)))
    df1.write.parquet(file_path + "/result/key=1")

    df2 = sqlContext.createDataFrame(sc.parallelize(range(6, 11)).map(lambda i: Row(single=i, triple=i * 3)))
    df2.write.parquet(file_path + "/result/key=2")

    df3 = sqlContext.read.option("mergeSchema", "true").parquet(file_path + "/result")
    df3.printSchema()

    # print df3.collect()
    for row in df3.collect():
        print "single=" + str(row[0]), "triple=" + str(row[1]), "double=" + str(row[2]), "key=" + str(row[3])

    sc.stop()
Example #42
# Load and parse the data
# line format: (station, latitude, longitude,)


def parsePoint(line):
    return LabeledPoint(line[0], line[1:])

# read data from station file
def getdata(line):
    line = line.split('  ')
    values = [x.strip() for x in line]
    return values
stations = sc.textFile(input)
stations = stations.map(getdata)
stations = stations.map(lambda (a,b,c): (float(hash(a)), int(year), float(b), float(c))).cache()
stationsDF = sqlContext.createDataFrame(stations)

# create dataset to fit into model
parseData = stations.map(parsePoint)

# load the model
sameModel = LinearRegressionModel.load(sc, myModelPath)

# run the model
stationidAndPreds = parseData.map(lambda p : (p.label,  float(sameModel.predict(p.features))))
stationidAndPredsDF = sqlContext.createDataFrame(stationidAndPreds)

# the result returns a predicted value for each station (stationId) in the given year
# joining the stations rdd with stationidAndPreds to find the latitude and longitude of each station
result = stationsDF.join(stationidAndPredsDF).where(stationidAndPredsDF[0]==stationsDF[0]).select(stationidAndPredsDF[1], stationsDF[2], stationsDF[3])
Example #43
            .setAppName("adhoscount")
            .set("spark.kryoserializer.buffer.mb", "256")
            .set("spark.sql.parquet.binaryAsString","true")
            )
    sc = SparkContext(conf = conf)
    sqlContext = SQLContext(sc)

    _adloadDF=sqlContext.read.parquet(adLoadFiles)
    _adloadRdd=_adloadDF.rdd.map(lambda x:(x.guuid,x.hosid)).groupByKey().map(fetchOne)

    fields = [
        StructField('guuid', StringType(), True),
        StructField('hosid', StringType(), True),
        ]
    schema = StructType(fields)
    schemaDest = sqlContext.createDataFrame(_adloadRdd, schema)
    schemaDest.registerTempTable("ghid")

    _adloadDF.registerAsTable("adload")
    sqlContext.read.parquet(adPlayFiles).registerAsTable("adplay")
    sqlContext.read.parquet(adClickFiles).registerAsTable("adclick")

    '''
    _adLoadDF=sqlContext.createDataFrame([
        {'uid': '1', 'adid': 'a','guuid':'aa','guuidctime':1,'url':'','referer':'','hosid':'132','gwid':'','ua':'','ip':'','createtime':1450823568766},
        {'uid': '2', 'adid': 'b','guuid':'aa','guuidctime':1,'url':'','referer':'','hosid':'132','gwid':'','ua':'','ip':'','createtime':1450823569766},
        {'uid': '3', 'adid': 'c','guuid':'aa','guuidctime':1,'url':'','referer':'','hosid':'132','gwid':'','ua':'','ip':'','createtime':1450823550766},
        {'uid': '4', 'adid': 'd','guuid':'bb','guuidctime':1,'url':'','referer':'','hosid':'133','gwid':'','ua':'','ip':'','createtime':1450823268766},
    ]).registerAsTable("adload")
    _adPlayDF=sqlContext.createDataFrame([
        {'uid': '1', 'adid': 'a','guuid':'aa','createtime':1450823568766},
Example #44
"""
from pyspark import SparkContext, SQLContext
# $example on$
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import HashingTF, Tokenizer
# $example off$

if __name__ == "__main__":

    sc = SparkContext(appName="PipelineExample")
    sqlContext = SQLContext(sc)

    # $example on$
    # Prepare training documents from a list of (id, text, label) tuples.
    training = sqlContext.createDataFrame([
        (0L, "a b c d e spark", 1.0),
        (1L, "b d", 0.0),
        (2L, "spark f g h", 1.0),
        (3L, "hadoop mapreduce", 0.0)], ["id", "text", "label"])

    # Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr.
    tokenizer = Tokenizer(inputCol="text", outputCol="words")
    hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")
    lr = LogisticRegression(maxIter=10, regParam=0.01)
    pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])

    # Fit the pipeline to training documents.
    model = pipeline.fit(training)

    # Prepare test documents, which are unlabeled (id, text) tuples.
    test = sqlContext.createDataFrame([
Example #45
from pyspark import SparkContext, SQLContext
# $example on$
from pyspark.mllib.linalg import Vectors
from pyspark.ml.classification import LogisticRegression
# $example off$

if __name__ == "__main__":

    sc = SparkContext(appName="EstimatorTransformerParamExample")
    sqlContext = SQLContext(sc)

    # $example on$
    # Prepare training data from a list of (label, features) tuples.
    training = sqlContext.createDataFrame([
        (1.0, Vectors.dense([0.0, 1.1, 0.1])),
        (0.0, Vectors.dense([2.0, 1.0, -1.0])),
        (0.0, Vectors.dense([2.0, 1.3, 1.0])),
        (1.0, Vectors.dense([0.0, 1.2, -0.5]))], ["label", "features"])

    # Create a LogisticRegression instance. This instance is an Estimator.
    lr = LogisticRegression(maxIter=10, regParam=0.01)
    # Print out the parameters, documentation, and any default values.
    print "LogisticRegression parameters:\n" + lr.explainParams() + "\n"

    # Learn a LogisticRegression model. This uses the parameters stored in lr.
    model1 = lr.fit(training)

    # Since model1 is a Model (i.e., a transformer produced by an Estimator),
    # we can view the parameters it used during fit().
    # This prints the parameter (name: value) pairs, where names are unique IDs for this
    # LogisticRegression instance.
Example #46
    sc = SparkContext(conf = conf)
    sqlContext = SQLContext(sc)

    df = sqlContext.read.parquet(logFile)
    destDF=df.select('logintype','logtype','hosid','suppid','logtime','usermac','gwid').map(lambda x:trimf(x))
    fields = [
        StructField('logintype', StringType(), True),
        StructField('logtype', StringType(), True),
        StructField('hosid', StringType(), True),
        StructField('suppid', StringType(), True),
        StructField('logtime', LongType(), True),
        StructField('usermac', StringType(), True),
        StructField('gwid', StringType(), True)
        ]
    schema = StructType(fields)
    schemaDest = sqlContext.createDataFrame(destDF, schema)
    schemaDest.registerTempTable("loginflowlog")

    sqlContext.registerFunction("todatestr", lambda x:longTime2str(x),StringType())
    sqlContext.registerFunction("trimx", lambda x:trimx(x),StringType())
    midDF = sqlContext.sql("select count(1) userlogintimes,count(distinct(usermac)) userlogincount,hosid,gwid,todatestr(logtime) day from loginflowlog "
                           "where logtype like '5-%-arrive' and gwid!='' group by hosid,gwid,todatestr(logtime)")

    hosiddayList=midDF.rdd.map(lambda x:(x[2],x[3],x[4],x[2],x[3],x[4])).collect()
    resultList=midDF.rdd.collect()

    dao=MysqlDao()
    dao.insertMany('INSERT INTO `bblink_data`.`bblink_data_hos_subject` (`hosid`,`gwid`,`day`)VALUES(%s,%s,%s) ON DUPLICATE KEY UPDATE hosid=%s,gwid=%s,day=%s',hosiddayList)


    dao.insertMany("update `bblink_data`.`bblink_data_hos_subject` set userlogintimes=%s,userlogincount=%s where hosid=%s and gwid=%s and day=%s",resultList);
Example #47
def applyModel(fileName, loadModelName, outlierPercentile = 100):

    sc = SparkContext( 'local', 'pyspark')
    sqlContext = SQLContext(sc)

    #########
    # load data
    #########

    data = sc.textFile(fileName)
    #extract header and remove it
    header = data.first()
    data = data.filter(lambda x:x !=header).cache()
    header = header.split('\t')
    #parse data
    data = data.map(lambda x : x.split('\t'))

    #########
    # prepare features
    #########

    df = sqlContext.createDataFrame(data, header)
    df = (df.withColumn("ADLOADINGTIME",func.regexp_replace('ADLOADINGTIME', 'null', '0').cast('float'))
         .withColumn("TIMESTAMP",func.regexp_replace('TIMESTAMP', 'null', '0').cast('int'))
         .withColumn("GEOIP_LAT",func.regexp_replace('GEOIP_LAT', 'null', '0').cast('int'))
          .withColumn("GEOIP_LNG",func.regexp_replace('GEOIP_LNG', 'null', '0').cast('int'))
          .withColumn("HOSTWINDOWHEIGHT",func.regexp_replace('HOSTWINDOWHEIGHT', 'null', '0').cast('int'))
          .withColumn("HOSTWINDOWWIDTH",func.regexp_replace('HOSTWINDOWWIDTH', 'null', '0').cast('int'))
          .withColumn("TOPMOSTREACHABLEWINDOWHEIGHT",func.regexp_replace('TOPMOSTREACHABLEWINDOWHEIGHT', 'null', '0').cast('int'))
          .withColumn("TOPMOSTREACHABLEWINDOWWIDTH",func.regexp_replace('TOPMOSTREACHABLEWINDOWWIDTH', 'null', '0').cast('int'))
         )
    thr = np.percentile(df.select("ADLOADINGTIME").rdd.collect(), outlierPercentile)
    df = df.filter(func.col('ADLOADINGTIME') < thr)
    df = df.withColumn("TOPMOSTREACHABLEWINDOWAREA", func.col("TOPMOSTREACHABLEWINDOWHEIGHT")*func.col("TOPMOSTREACHABLEWINDOWWIDTH"))
    df = df.withColumn("INTENDENTISACTUALDEVICETYPE", (func.col("ACTUALDEVICETYPE")==func.col("INTENDEDDEVICETYPE")).cast('int'))
    df = df.withColumn("COMBINEDID", 
            func.concat(
                func.col('ACCOUNTID'), 
                func.col('CAMPAIGNID'), 
                func.col('CREATIVEID'), 
                func.col('SDK')) )

    #df = df.withColumn("COMBINEDID", func.regexp_replace("COMBINEDID", '^$', 'NA'))

    df = df.withColumn("COMBINEDEXTERNALID", 
            func.concat( 
                func.regexp_replace('EXTERNALADSERVER', 'null', ''), 
                func.regexp_replace('EXTERNALPLACEMENTID', 'null', ''), 
                func.regexp_replace('EXTERNALSITEID', 'null', ''), 
                func.regexp_replace('EXTERNALSUPPLIERID', 'null', '') ))

    #df = df.withColumn("COMBINEDEXTERNALID", func.regexp_replace("COMBINEDEXTERNALID", '^$', 'NA'))

    df = df.withColumn("PLATFORMCOMBINED", 
            func.concat( 
                func.regexp_replace('PLATFORM', 'null', ''), 
                func.regexp_replace('PLATFORMVERSION', 'null', '') ))

    #df = df.withColumn("PLATFORMCOMBINED", func.regexp_replace("PLATFORMCOMBINED", '^$', 'NA'))

    df = df.withColumn("UA_OSCOMB", 
            func.concat( 
                func.regexp_replace('UA_OS', 'null', ''), 
                func.regexp_replace('UA_OSVERSION', 'null', '') ))

    #df = df.withColumn("UA_OSCOMB", func.regexp_replace("UA_OSCOMB", '^$', 'NA'))
    df = df.withColumn("FILESJSON_SIZE", 
                func.regexp_replace('FILESJSON', '[^,\d]', '') )

    df = df.withColumn("FILESJSON_SIZE", 
                func.regexp_replace('FILESJSON_SIZE', '^,', '') )

    df = df.withColumn("FILESJSON_SIZE", 
                func.regexp_replace('FILESJSON_SIZE', ',,', ',') )

    udf = func.udf(lambda x: int(np.fromstring(x,dtype=int, sep=',').sum()), IntegerType())
    df = df.withColumn("FILESJSON_SIZE", udf("FILESJSON_SIZE"))

    print('Loaded and prepared %d entries' % df.count())

    #########
    # keep only needed features
    #########   

    features = ['ADLOADINGTIME',
     'PLACEMENTID',
     'TIMESTAMP',
     'CREATIVETYPE',
     'UA_HARDWARETYPE',
     'UA_VENDOR',
     'UA_MODEL',
     'UA_BROWSER',
     'UA_BROWSERVERSION',
     'FILESJSON',
     'ERRORSJSON',
     'TOPMOSTREACHABLEWINDOWAREA',
     'FILESJSON_SIZE',
     'COMBINEDID',
     'COMBINEDEXTERNALID',
     'PLATFORMCOMBINED',
     'UA_OSCOMB',
     'SDK',
     'EXTERNALADSERVER'
       ]

    df = df.select(features)

    #########
    # Convert categorical features to numerical
    #########   


    featuresCat = [
     'PLACEMENTID',
     'CREATIVETYPE',
     'UA_HARDWARETYPE',
     'UA_VENDOR',
     'UA_MODEL',
     'UA_BROWSER',
     'UA_BROWSERVERSION',
     'FILESJSON',
     'ERRORSJSON',
     'COMBINEDID',
     'COMBINEDEXTERNALID',
     'PLATFORMCOMBINED',
     'UA_OSCOMB',
     'SDK',
     'EXTERNALADSERVER'
       ]

    for i in range(len(featuresCat)):

        indexer = StringIndexer(inputCol=featuresCat[i], outputCol='_'+featuresCat[i]).setHandleInvalid("skip").fit(df)
        df = indexer.transform(df).drop(featuresCat[i])
        writer = indexer._call_java("write")
        writer.overwrite().save("indexer_" + featuresCat[i])    

    featuresCat = [ '_' + featuresCat[i] for i in range(len(featuresCat))]    

    features = featuresCat[:]
    features.append('TIMESTAMP')    
    features.append('FILESJSON_SIZE')
    features.append('TOPMOSTREACHABLEWINDOWAREA')


    #########
    # Assemble features
    #########   


    assembler = VectorAssembler(
        inputCols=features,
        outputCol="features")

    df = assembler.transform(df)

    #########
    # Convert to labeled point
    #########   


    lp = (df.select(func.col("ADLOADINGTIME").alias("label"), func.col("features"))
      .map(lambda row: LabeledPoint(row.label, row.features)))
    lp.cache()


    #########
    # Load trained model
    #########
    
    model = RandomForestModel.load(sc, loadModelName)
    
    print('Model loaded!')
    
    predictions = model.predict(lp.map(lambda x: x.features)).collect()
    
    return predictions
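A hypothetical call to the function above; the file name, model name and percentile are placeholders:

predictions = applyModel('ad_events.tsv', 'rf_model', outlierPercentile=95)
print('Scored %d rows' % len(predictions))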
Example #48
        StructField('mac', StringType(), True),
        StructField('hosid', StringType(), True),
        StructField('loginPage', IntegerType(), False),
        StructField('forwardPage', IntegerType(), False),
        StructField('arrivePage', IntegerType(), False)
    ]
    schema = StructType(fields)

    # compute pages
    rdd1 = rdd.map(convert_logtime)\
        .map(convert_kv)\
        .groupByKey().mapValues(list).map(convert_set)
    #(u'20151201_74:AD:B7:78:03:86_119', set([u'1-prelogin', u'2-mobile-login']))

    rdd1_2 = rdd1.map(convert_visitpage)
    df1 =  sqlContext.createDataFrame(rdd1_2,schema)
    #.registerTempTable("mid_uservisitpage_day")
    _output = output+"/mid_uservisitpage_day/dat=%s" % day
    df1.coalesce(2).write.parquet(_output,'overwrite')


    # compute times
    rdd2 = rdd.map(convert_kv2).groupByKey().mapValues(list).map(convert_sort)
    # (u'20151201_38AA3C3DBC12_127', ['2015120119', '2015120121'])
    rdd2_2 = rdd2.map(convert_days)
    #rdd2_2.foreach(my_print)
    fields = [
        StructField('day', StringType(), True),
        StructField('mac', StringType(), True),
        StructField('hosid', StringType(), True),
        StructField('firstTime', StringType(), True),
Example #49
    nn_gridsearch.debug('-'*40)
    nn_gridsearch.debug('Execution time: %s' % str(datetime.now()))

    # with open('~/.aws/credentials.json') as f:
    #     CREDENTIALS = json.load(f)

    sc = set_spark_context()

    conn = S3Connection()
    sqc = SQLContext(sc)
    sm = SparkModel(sc, conn, rdd_path='rdd.pkl')


    bow_rdd = sm.RDD.join(sm.target).map(lambda (key, (bow, label)): (label, bow)) \
            .sample(withReplacement=False, fraction=.5, seed=1)
    df = sqc.createDataFrame(bow_rdd, ['string_label', 'raw'])
    train_rdd, test_rdd = df.randomSplit([.8, .2], seed=1)
    results = []

    num_features = 5000
    min_doc_freq = 20
    layers = [[5000, 2056, 512, 128, 2], [5000, 1000, 128, 2], [5000, 100, 2], [5000, 5000, 2]]

    for l in layers:
        remover = StopWordsRemover(inputCol="raw", outputCol="words")
        hashingTF = HashingTF(inputCol=remover.getOutputCol(), outputCol="word_counts",
                              numFeatures=num_features)
        tfidf = IDF(inputCol=hashingTF.getOutputCol(),
                    outputCol="features", minDocFreq=min_doc_freq)
        indexer = StringIndexer(inputCol="string_label", outputCol="label")
Example #50
    with(SparkContext(appName='My Spark Application')) as sc:

        print 'It works!'

        sql_context = SQLContext(sc)

        # Allows it to work in parallel
        rdd = sc.parallelize([
            ('john', 1),
            ('tori', 2),
            ('alex', 3),
            ('julia', 4),
            ('chris', 5)
        ])

        # Combines keys together and add them up
        rdd = rdd.reduceByKey(lambda a, b: a + b)

        # Make table schema with types
        schema = StructType([
            StructField('name', StringType()),
            StructField('price', IntegerType())
        ])

        # Pass RDD with data and schema
        df = sql_context.createDataFrame(rdd, schema)

        # print rdd.take(5)

        df.show()
        df.printSchema()
Example #51
class Credit:
    def __init__(self):
        self.conf = (SparkConf()
                     .setAppName("CREDIT")
                     .set("spark.cores.max", "2")
                     .set('spark.executor.extraClassPath', '/usr/local/env/lib/mysql-connector-java-5.1.38-bin.jar'))
        self.sc = SparkContext(conf=self.conf)
        self.sqlctx = SQLContext(self.sc)
        self.mysql_helper = MySQLHelper('core', host='10.9.29.212')
        self.base = 'hdfs://master:9000/gmc/'

    def load_from_mysql(self, table, database='core'):
        url = "jdbc:mysql://10.9.29.212:3306/%s?user=root&characterEncoding=UTF-8" % database
        df = self.sqlctx.read.format("jdbc").options(url=url, dbtable=table, driver="com.mysql.jdbc.Driver").load()
        return df

    def sql_operate(self, sql, rdd, once_size=1000):
        temp = []
        for row in rdd.collect():
            # print(row)
            if len(temp) >= once_size:
                self.mysql_helper.executemany(sql, temp)
                temp.clear()
            temp.append(row)

        if len(temp) != 0:
            self.mysql_helper.executemany(sql, temp)
            temp.clear()

    def prepare_fpgrowth_data(self):
        tran_df = self.load_from_mysql('t_CMMS_CREDIT_TRAN').filter("BILL_AMTFLAG = '+'").select('ACCTNBR',
                                                                                                 'MER_CAT_CD') \
            .filter("MER_CAT_CD != 0").filter("MER_CAT_CD != 6013")

        result = tran_df.map(lambda x: (str(int(x['ACCTNBR'])), [str(int(x['MER_CAT_CD'])), ])).groupByKey()

        def m(x):
            k = x[0]
            l = list(x[1])

            v = set()
            for i in l:
                v.add(i[0])

            return set(v)

        result = result.map(m)
        for i in result.take(10):
            print(i)

        model = FPGrowth.train(result, minSupport=0.05, numPartitions=10)
        result = model.freqItemsets().collect()
        for r in result:
            print(r)

    def cycle_credit(self):
        '''
        Credit card clustering: data preprocessing.
        :return:
        '''

        print('--------------------------- Credit Card - Start --------------------------')
        # transaction records
        credit_tran_df = self.load_from_mysql('t_CMMS_CREDIT_TRAN').select('ACCTNBR', 'MONTH_NBR', 'BILL_AMT',
                                                                           'BILL_AMTFLAG').filter(
            "BILL_AMTFLAG ='-'").cache()

        # card account information
        credit_acct_df = self.load_from_mysql('ACCT_D').select('ACCTNBR', 'MONTH_NBR', 'STM_MINDUE')

        # repayment calculation
        return_amt = credit_tran_df.groupBy('ACCTNBR', 'MONTH_NBR').sum('BILL_AMT')
        return_amt = return_amt.select('ACCTNBR', 'MONTH_NBR', return_amt['sum(BILL_AMT)'].alias('RETURNED'))

        # drop rows with a zero minimum due, i.e. billing months with no spending
        join = credit_acct_df.join(return_amt, ['ACCTNBR', 'MONTH_NBR'], 'outer').filter('STM_MINDUE != 0')

        # clear the cache
        self.sqlctx.clearCache()

        def which_cycle_type(line):
            mindue = line['STM_MINDUE']
            returned = line['RETURNED']

            '''
            0:normal,all returned
            1:cycle credit
            2:overdue,don't return money
            '''
            if mindue is not None and returned is None:
                flag = 2
            elif returned >= mindue * 10:
                flag = 0
            elif returned > mindue and returned < mindue * 10:
                flag = 1
            else:
                flag = 9

            return Row(ACCTNBR=int(line['ACCTNBR']), MONTH_NBR=line['MONTH_NBR'], DUE_FLAG=flag,
                       STM_MINDUE=line['STM_MINDUE'])

        # the result is a PipelinedRDD
        join = join.map(which_cycle_type)

        # convert back to a DataFrame
        join = self.sqlctx.createDataFrame(join)
        '''
        +---------+----------+-------+
        | ACCTNBR | DUE_FLAG | count |
        +---------+----------+-------+
        |  608126 |        2 |     1 |
        |  608126 |        0 |     6 |
        |  608868 |        0 |     4 |
        '''
        # group by repayment type
        each_type = join.groupBy(['ACCTNBR', 'DUE_FLAG'])

        # count of each repayment type
        each_type_count = each_type.count()

        # sum of the minimum due for each repayment type
        each_type_mindue_sum = each_type.sum('STM_MINDUE')

        # total number of repayment records per account
        all_type_count = each_type_count.groupBy('ACCTNBR').sum('count')

        # join the three tables above
        rate = each_type_count.join(each_type_mindue_sum, ['ACCTNBR', 'DUE_FLAG'], 'outer').join(all_type_count,
                                                                                                 'ACCTNBR', 'outer')

        # print(rate.columns)
        # ['ACCTNBR', 'DUE_FLAG', 'count', 'sum(STM_MINDUE)', 'sum(count)']

        # keep only the revolving-credit rows
        # TODO: for now only revolving credit is used

        rate = rate.filter(rate['DUE_FLAG'] == 1)

        # compute the share of billing months that went into revolving credit
        rate = rate.select('ACCTNBR',
                           (rate['sum(STM_MINDUE)'] * 10).alias('CYCLE_AMT'),
                           rate['count'].alias('CYCLE_TIMES'),
                           (rate['count'] / rate['sum(count)']).alias('CYCLE_RATE'))

        # rate.show()
        # print(rate.count())


        def m(line):
            return line['CYCLE_TIMES'], line['CYCLE_AMT'], line['CYCLE_RATE'], line['ACCTNBR']

        sql = "update t_CMMS_TEMP_KMEANS_CREDIT set CYCLE_TIMES=%s,CYCLE_AMT=%s,CYCLE_RATE=%s where ACCTNBR=%s"

        df = rate.map(m)

        print('Updating the results in the database...')
        self.sql_operate(sql, df)

        # set accounts that never entered revolving credit to 0
        print('Setting accounts that never entered revolving credit to 0...')
        self.mysql_helper.execute(
            "update t_CMMS_TEMP_KMEANS_CREDIT set CYCLE_TIMES=0,CYCLE_AMT=0,CYCLE_RATE=0 where CYCLE_TIMES is null ")

    def losing_warn(self,year,month):



        # # compute the months
        # if season == 1:
        #     months_now = [1,2,3]
        #     months_before = [10,11,12]
        #     year_before = year - 1
        # else:
        #     months_now = [season * 3 - 2, season * 3 - 1, season * 3]
        #     months_before = [season * 3 - 5, season * 3 - 4, season * 3-3]
        #     year_before = year
        #
        #
        #
        # # extract the data for each quarter
        #
        # for m in months_now:

        # the most recent month
        month = '%02d' % month


        for i in range(2,29):
            day = '%02d' % i
            date = str(year) + month + day
            print(date)

            sql = "select MONTH_NBR from t_CMMS_CREDIT_TRAN where INP_DATE = %s limit 1"

            try:
                month_nbr = self.mysql_helper.fetchone(sql,(date,))
                if month_nbr is None:
                    continue
                else:
                    month_nbr = int(month_nbr[0])
                    break
            except Exception:
                continue

        if month_nbr is None:
            raise Exception("There is no data in database for month:%s" % month)
        else:
            print('the latest month_nbr is %s' % month_nbr)


        months_now = [month_nbr-2,month_nbr-1,month_nbr]
        months_before = [month_nbr-5,month_nbr-4,month_nbr-3]

        print('months_now',months_now)
        print('months_before',months_before)


        # transaction records
        credit_tran_df = self.load_from_mysql('t_CMMS_CREDIT_TRAN').select('ACCTNBR', 'INP_DATE', 'MONTH_NBR', 'BILL_AMT',
                                                                           'BILL_AMTFLAG').filter("BILL_AMTFLAG ='+'").cache()



        # filter the transactions belonging to the two quarters

        months_now_filter = None
        months_before_filter = None

        for i in months_now:
            f = credit_tran_df.filter(credit_tran_df['MONTH_NBR'] == i)
            if months_now_filter is None:
                months_now_filter = f
            else:
                months_now_filter = months_now_filter.unionAll(f)

        for i in months_before:
            f = credit_tran_df.filter(credit_tran_df['MONTH_NBR'] == i)
            if months_before_filter is None:
                months_before_filter = f
            else:
                months_before_filter = months_before_filter.unionAll(f)


        months_now_filter.groupBy('MONTH_NBR').count().show()
        months_before_filter.groupBy('MONTH_NBR').count().show()


        months_now_count = months_now_filter.groupBy('ACCTNBR').count()
        months_before_count = months_before_filter.groupBy('ACCTNBR').count()

        months_now_count.show()
        months_before_count.show()


        join = months_now_count.select('ACCTNBR',months_now_count['count'].alias('NOW_COUNT')).join(
            months_before_count.select('ACCTNBR',months_before_count['count'].alias('BEFORE_COUNT')),'ACCTNBR','outer'
        )

        join.show()

        def m(line):
            ncount = line['NOW_COUNT']
            bcount = line['BEFORE_COUNT']
            '''
            Growth-rate codes (as implemented below):
            9999  : no data in either quarter (should not occur after the outer join)
            -9999 : data only in the earlier quarter -> churned customer
            8888  : data only in the recent quarter  -> new customer
            other : growth of the recent quarter over the earlier one, (s2-s1)/s1*100
            '''
            if ncount is None:
                if bcount is None:  # no data in either quarter
                    increment = 9999
                else:# n none, b not none
                    increment = -9999
            else:
                if bcount is None:# n not none, b none
                    increment = 8888
                else:# n not none,b not none
                    increment = round((ncount-bcount)/bcount*100)

            '''
            Credit-card life-cycle stage (derived from the growth rate):
            above 100   fast growing
            50 to 100   growing
            -50 to 50   stable
            below -50   declining
            churned     no transactions in the recent quarter
            '''
            if increment > 100 and increment != -9999:
                life = 1  # fast growing
            elif increment <=100 and increment >50:
                life = 2  # growing
            elif increment <= 50 and increment > -50:
                life = 3  # stable
            elif increment <= -50 and increment != -9999:
                life = 4  # losing
            else:
                life = 9  # no more tran  completely lost

            return line['ACCTNBR'],month_nbr,increment,life




        sql = "replace into t_CMMS_ANALYSE_CREDIT(ACCTNBR, MONTH_NBR, INCREMENT,LIFE, UPDATE_TIME) values(%s,%s,%s,%s,now())"

        rdd = join.map(m)
        print(type(rdd))

        self.sql_operate(sql,rdd)
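A hypothetical driver for the class above; the year and month passed to losing_warn are placeholders:

if __name__ == '__main__':
    credit = Credit()
    credit.prepare_fpgrowth_data()
    credit.cycle_credit()
    credit.losing_warn(2016, 6)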
Example #52
    # uid,adid,guuid,createtime
    fields = [
        StructField('uid', StringType(), True),
        StructField('adid', StringType(), True),
        StructField('guuid', StringType(), True),
        StructField('guuidctime', LongType(), True),
        StructField('url', StringType(), True),

        StructField('referer', StringType(), True),
        StructField('hosid', StringType(), True),
        StructField('gwid', StringType(), True),
        StructField('ua', StringType(), True),
        StructField('ip', StringType(), True),

        StructField('createtime', LongType(), True),

    ]

    schema = StructType(fields)

    # [(),()] ['','']
    df_dest = sqlContext.createDataFrame(rdd, schema)
    df_dest.registerTempTable("back_portal_loginlog")

    #df_dest.rdd.foreach(my_print)
    # save
    df_dest.write.parquet(output)


    sc.stop()
Example #53
parser = argparse.ArgumentParser()
parser.add_argument('births')
parser.add_argument('deaths')
parser.add_argument('output')
args = parser.parse_args()

conf = SparkConf().setAppName("correlate")
sc = SparkContext(conf=conf)
sql = SQLContext(sc)

births_raw = sql.read.load(args.births).rdd
deaths_raw = sql.read.load(args.deaths).rdd

births = births_raw.map(to_joinable_on_id)
deaths = deaths_raw.map(to_joinable_on_id)

both = births.fullOuterJoin(deaths)
unjoined_births = both.filter(get_unjoined_births)
unjoined_deaths = both.filter(get_unjoined_deaths)
correctly_joined = both.filter(remove_unjoined_all).map(to_joined_format)

# do a join with jaro-winkler
jaro_input_births = unjoined_births.map(to_jaro_matching_input)
jaro_input_deaths = unjoined_deaths.map(to_jaro_matching_input)
jaro_input_all = jaro_input_births.cartesian(jaro_input_deaths)
jaro_joined = jaro_input_all.filter(jaro_match).map(cart_to_joined_format)

to_save = sql.createDataFrame(correctly_joined)
to_save.write.save(args.output + '/joined', format="parquet")

to_save = sql.createDataFrame(jaro_joined)
to_save.write.save(args.output + '/jaro_joined', format="parquet")
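The helpers used above (to_joinable_on_id, the filter predicates, jaro_match, and the formatting maps) live elsewhere in the project; a minimal sketch of the fuzzy-match predicate, assuming the jellyfish library and that each element of the cartesian product is an (id, name) pair:

import jellyfish  # assumed dependency (older versions expose jaro_winkler instead)

def jaro_match(pair, threshold=0.95):
    # pair is ((birth_id, birth_name), (death_id, death_name)) from the cartesian product above.
    (_, birth_name), (_, death_name) = pair
    return jellyfish.jaro_winkler_similarity(birth_name, death_name) >= threshold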
Example #54
if __name__ == "__main__":
    conf = SparkConf().setAppName("analysis_demo").setMaster("local[2]")
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)

    # register the user-defined function (UDF)
    sqlContext.registerFunction("analysis_email", analysis_email)

    file_path = os.path.abspath("../doc/analysis.txt")
    lines = sc.textFile(file_path)

    info = lines.map(lambda lines: lines.split("----")). \
        map(lambda info: Row(email=info[0], username=info[1], realname=info[2],
                             idcard=info[3], password=info[4], phone=info[5]))

    schemaInfo = sqlContext.createDataFrame(info)
    schemaInfo.registerTempTable("information")
    # cache the table
    #sqlContext.cacheTable("information")
    #sqlContext.uncacheTable("information")

    """
    :邮箱分析与统计
    """
    email_str = "SELECT analysis_email(email) AS email FROM information"
    emailSQL = sqlContext.sql(email_str)
    # total number of rows
    count = emailSQL.count()
    # counts grouped by provider
    emailCollect = emailSQL.groupBy("email").count().collect()
    # email analysis results
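analysis_email is registered as a UDF above but not shown in this snippet; one plausible implementation (an assumption, not the original) that maps an address to its mail provider:

def analysis_email(email):
    # Return the provider part after '@'; 'unknown' for malformed values.
    try:
        return email.split('@', 1)[1].lower()
    except (IndexError, AttributeError):
        return 'unknown'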
Example #55
def features_to_vec(length, entropy, alexa_grams, word_grams):
    high_entropy = 0.0
    high_length = 0.0
    if entropy > 3.5: high_entropy = 1.0
    if length > 30: high_length = 1.0
    return Vectors.dense(length, entropy, high_entropy, high_length, alexa_grams, word_grams)
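The entropy function used throughout this example is not defined in the snippet; a standard Shannon-entropy helper (an assumption about the missing definition):

import math
from collections import Counter

def entropy(s):
    # Shannon entropy of the character distribution of s, in bits.
    if not s:
        return 0.0
    n = float(len(s))
    return -sum((c / n) * math.log(c / n, 2) for c in Counter(s).values())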


#dga_domains = sc.textFile("/user/cloudera/dga.txt")
#dga_domains = dga_domains.map(lambda x: (x, "dga", float(len(x)), entropy(x)))
#dga_domains_df = sqlctx.createDataFrame(dga_domains, schema).dropna().distinct().cache()

words = sc.textFile("/user/cloudera/words.txt")
words = words.map(lambda x: (x, "dict", float(len(x)), entropy(x)))
words_df = sqlctx.createDataFrame(words, schema).dropna().distinct().cache()

dga_domains = sc.textFile("/user/cloudera/c_domains_*")
dga_domains = dga_domains.map(lambda x: (x, "dga", float(len(x)), entropy(x)))
dga_domains_df = sqlctx.createDataFrame(dga_domains, schema).dropna().distinct().cache()

alexa_domains = sqlctx.read.format('com.databricks.spark.csv').options(header='false', inferschema='true').load(
    'alexa_100k.csv')\
    .map(lambda x: (x[1], "legit", float(len(x[1])), entropy(x[1])))
alexa_domains_df = sqlctx.createDataFrame(alexa_domains, schema).dropna().distinct().cache()

alexa_domains_1M = sqlctx.read.format('com.databricks.spark.csv').options(header='false', inferschema='true').load(
    'alexa_1M.csv')\
    .map(lambda x: (x[1], "legit", float(len(x[1])), entropy(x[1])))
alexa_domains_1M = sqlctx.createDataFrame(alexa_domains_1M, schema).distinct().cache()
Example #56
mirror_dir = "data/mirror"
data_dir = "data/data-{0}".format(dataset_date)
out_dir = "data/bhl-{0}.parquet".format(dataset_date)

if os.path.isdir(out_dir):	
    print("Output dir {0} exists".format(out_dir))
    exit()


get_ocr_udf = sql.udf(get_ocr, types.StringType())
fn = os.path.join(data_dir, "item.txt")

# Optional limit for testing, add this to the chain as second step
# .sample(withReplacement=False, fraction=0.001) \
sqlContext.createDataFrame(t_gen(fn, type_data_item), schema_item()) \
    .withColumn("ocrtext", get_ocr_udf(sql.col("barcode"))) \
    .write.parquet(out_dir)


# Example run on Elk (16 thread single machine)
#real    84m21.818s
#user    198m57.612s
#sys     15m19.662s

# Example run on okapi (128 thread single machine)
#real    41m13.984s
#user    482m34.084s
#sys     278m12.404s
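get_ocr, t_gen, type_data_item and schema_item come from elsewhere in the project; a rough sketch of get_ocr, assuming the OCR text sits in the local mirror keyed by barcode (the path layout is a guess):

def get_ocr(barcode):
    # Hypothetical layout: data/mirror/<barcode>/<barcode>_djvu.txt
    path = os.path.join(mirror_dir, barcode, "{0}_djvu.txt".format(barcode))
    try:
        with open(path) as f:
            return f.read()
    except IOError:
        return None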

def oritentData(record):
  indexes = [1,2,7,11,5,8,20]
  return [ record[i].replace('"','') for i in indexes]

def filterData(record):
  flag = True
  if (int(record[-4])<1) or (record[-2] not in (['1','4'])) or (record[-1] != ''): flag = False
  return flag

if __name__ == '__main__':
  sc = SparkContext(appName = 'CF_prod_in_transaction')
  sqlContext = SQLContext(sc)
  in_file = sc.textFile(sys.argv[1])
  data = in_file.map(oritentData).filter(filterData).map(lambda x: [int(i) for i in x[:-3]])
  Record = Row('customer_id','product_id','invoice_id','units')
  data = data.map(lambda x: Record(*x))
  data = sqlContext.createDataFrame(data)
  sqlContext.registerDataFrameAsTable(data,'table1')
  df = sqlContext.sql('select customer_id, product_id, sum(units) as prod_in_transactions from table1 group by customer_id, product_id')
  df.map(lambda x: ','.join([str(r) for r in x])).saveAsTextFile(sys.argv[2])
  sc.stop()



# expected columns: data_path,header,train_sample,number,support,confidence,lift,k,testing,testing_split,seed,output_path
import csv

write = open('test.csv','w')
wrtr = csv.writer(write)

read = open('arqiva.csv')
for line in csv.reader(read): wrtr.writerow(line)
Example #58
def main(argv):
    # list of words to look for!
    GODWINS_WORDS = ['hitler', 'nazi']

    # setup inputs and outputs
    input_directory = argv[0]
    output_directory = argv[1]

    # spark specific setup
    conf = SparkConf().setAppName('godwin whaaa')
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)

    # read input
    text = sc.textFile(input_directory)
    text = text.repartition(200)

    # convert to magic json formatting
    loadedJson = text.map(lambda line: json.loads(line))

    # make the json skinnier by removing unwanted stuff
    fullRedditJson = loadedJson.map(lambda jObj: (jObj['body'], jObj['name'], jObj['parent_id'])).cache()

    # code from greg for regex to parse lines
    linere = re.compile(regex_from_words(GODWINS_WORDS))

    # keep only the comments whose body contains one of GODWINS_WORDS
    godwinJsonList = fullRedditJson.filter(lambda (body, name, parent_id): linere.match(body.lower()))
    
    # We don't need the comment body anymore...
    # We need to find the paths now...
    godwin_node_rdd = godwinJsonList.map(row_into_node).cache()
    full_node_rdd = fullRedditJson.map(row_into_node)

    # we also need a list of node names so we can later check if we already visited it.
    godwinNodes = godwin_node_rdd.map(lambda (name, parent_id): name)

    # Convert full data RDD into SQL Data Frame
    subredditSchema = StructType([
        StructField("name", StringType(), True),
        StructField("parent_id", StringType(), True)
    ])
    full_node_df = sqlContext.createDataFrame(full_node_rdd, subredditSchema)

    # Convert godwin rows RDD into SQL Data Frame
    godwinSchema = StructType([
        StructField("g_name", StringType(), True),
        StructField("g_parent_id", StringType(), True)
    ])
    godwin_node_df = sqlContext.createDataFrame(godwin_node_rdd, godwinSchema).cache()

    count_down = godwin_node_df.count()
    print 'There are', count_down, 'comments with a godwins word'
    depth = 0
    nodes_per_depth = {}
    visited_node_list_df = godwin_node_df.select(godwin_node_df.g_name)
    print 'visited_node_list_df'
    print str(visited_node_list_df.count())
    
    while count_down > 0 and depth < 100:

        depth += 1
        # Join find next layer of nodes
        joined_df = godwin_node_df.join(full_node_df,
                                        [godwin_node_df['g_parent_id'] == full_node_df['name']])
        
        # Drop the columns of the older node
        next_node_df = joined_df.select(
            joined_df['name'].alias('g_name'),
            joined_df['parent_id'].alias('g_parent_id')).cache()
        print 'next_node_df count: '+str(next_node_df.count())
        
        # Select only the ones that have NOT been visited
        # TODO: is there a better way?
        leftt = next_node_df.join(visited_node_list_df, next_node_df.g_name == visited_node_list_df.g_name, 'left')
        next_node_df = leftt.select(next_node_df.g_name, next_node_df.g_parent_id, visited_node_list_df.g_name.alias('dup'))
        next_node_df = next_node_df.fillna({'dup':'xxxxxx'})
        next_node_df = next_node_df.filter(next_node_df.dup == 'xxxxxx')
        next_node_df = next_node_df.drop(next_node_df.dup)


        # add the g_name to the list of visited nodes 
        # TODO: make more efficient!
        visited_df = next_node_df.select(next_node_df.g_name)
        visited_node_list_df = visited_node_list_df.unionAll(visited_df)
        visited_node_list_df = visited_node_list_df.dropDuplicates()
        
        count_up = next_node_df.count()
        n_nodes = count_down - count_up
        print 'number of godwin nodes of height', depth, '=', n_nodes
        nodes_per_depth[depth] = n_nodes
        count_down = count_up

        godwin_node_df = next_node_df

    avg = compute_average_godwin(nodes_per_depth)
    print 'The average distance to the godwin words is', avg

    fp = open(output_directory + 'average.txt', 'w')
    fp.write(str(avg) + '\n')
    fp.close()
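regex_from_words, row_into_node and compute_average_godwin are assumed by main() but not included in this snippet; minimal sketches consistent with how they are called (re is assumed to be imported at the top of the file):

def regex_from_words(words):
    # Build a pattern for re.match: the lowercased body contains at least one of the words.
    return r'.*\b(' + '|'.join(re.escape(w) for w in words) + r')\b.*'

def row_into_node(row):
    # Keep only the fields needed for the parent walk: (body, name, parent_id) -> (name, parent_id).
    body, name, parent_id = row
    return (name, parent_id)

def compute_average_godwin(nodes_per_depth):
    # Weighted average depth over a {depth: node_count} dict.
    total = sum(nodes_per_depth.values())
    return sum(d * n for d, n in nodes_per_depth.items()) / float(total) if total else 0.0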
# Get all the ratings rows of our user
dfUserRatings  = dfRates.filter(dfRates.userId == USER_ID).map(lambda r: r.accoId).collect()
print(dfUserRatings)

# Returns only the accommodations that have not been rated by our user
rddPotential  = dfAccos.rdd.filter(lambda x: x[0] not in dfUserRatings)
pairsPotential = rddPotential.map(lambda x: (USER_ID, x[0]))

#[START split_sets]
rddTraining, rddValidating, rddTesting = dfRates.rdd.randomSplit([6,2,2])
#[END split_sets]

#[START predict]
# Build our model with the best found values
# Rating, Rank, Iteration, Regulation
model = ALS.train(rddTraining, BEST_RANK, BEST_ITERATION, BEST_REGULATION)

# Calculate all predictions
predictions = model.predictAll(pairsPotential).map(lambda p: (str(p[0]), str(p[1]), float(p[2])))

# Take the top 5 ones
topPredictions = predictions.takeOrdered(5, key=lambda x: -x[2])
print(topPredictions)

schema = StructType([StructField("userId", StringType(), True), StructField("accoId", StringType(), True), StructField("prediction", FloatType(), True)])

#[START save_top]
dfToSave = sqlContext.createDataFrame(topPredictions, schema)
dfToSave.write.jdbc(url=jdbcUrl, table=TABLE_RECOMMENDATIONS, mode='overwrite')
#[END save_top]
class LogisticRegression:

    def __init__(self):
        # configuring spark
        self.spark_conf = SparkConf()
        self.sc = SparkContext(conf=self.spark_conf)
        self.sql_context = SQLContext(self.sc)

    def test_train(self, df, target, train_split, test_split, regularization=None, num_of_iterations=100):
        try:
            LOGGER.info("Generation logistic regression")

            spark_df = self.sql_context.createDataFrame(df)
            feature_columns = spark_df.columns
            feature_columns.remove(target)

            train, test = spark_df.randomSplit([train_split, test_split], seed=1000000)

            X_train = train.select(*feature_columns).map(lambda x: list(x))
            y_train = train.select(target).map(lambda x: x[0])

            zipped = y_train.zip(X_train)
            train_data = zipped.map(lambda x: LabeledPoint(x[0], x[1]))

            numOfClasses = len(df[target].unique())

            logistic_model = LogisticRegressionWithLBFGS.train(train_data,
                                                               numClasses=numOfClasses, regParam=0,
                                                               regType=regularization, intercept=True,
                                                               iterations=num_of_iterations, validateData=False)

            X_test = test.select(*feature_columns).map(lambda x: list(x))
            y_test = test.select(target).map(lambda x: x[0])

            prediction = X_test.map(lambda lp: (float(logistic_model.predict(lp))))
            prediction_and_label = prediction.zip(y_test)

            LOGGER.info(prediction_and_label.map(lambda labelAndPred: labelAndPred[0] == labelAndPred[1]).mean())
        except Exception as e:
            raise e

    def train(self, df, target, regularization=None, num_of_iterations=100):
        try:
            LOGGER.info("Generation logistic regression")

            spark_df = self.sql_context.createDataFrame(df)
            feature_columns = spark_df.columns
            feature_columns.remove(target)


            X_train = spark_df.select(*feature_columns).map(lambda x: list(x))
            y_train = spark_df.select(target).map(lambda x: x[0])

            zipped = y_train.zip(X_train)
            train_data = zipped.map(lambda x: LabeledPoint(x[0], x[1]))
            numOfClasses = len(df[target].unique())

            logistic_model = LogisticRegressionWithLBFGS.train(train_data,
                                                               numClasses=numOfClasses, regParam=0,
                                                               regType=regularization, intercept=True,
                                                               iterations=num_of_iterations, validateData=False)


            self.model = logistic_model

        except Exception as e:
            raise e


    def persist(self, location):
        try:
            LOGGER.info("Writing the model to location %s"%location)
            data = 'data'
            meta_data = 'metadata'

            data_location = os.path.join(location, data)
            if os.path.exists(data_location):
                LOGGER.info("Removing directory %s"%data_location)
                shutil.rmtree(data_location)

            metadata_location = os.path.join(location, meta_data)
            if os.path.exists(metadata_location):
                LOGGER.info("Removing directory %s" % metadata_location)
                shutil.rmtree(metadata_location)

            self.model.save(self.sc, location)
        except Exception as e:
            raise e


    def predict(self, df):
        try:
            LOGGER.info("Predicting using logistic regression")
            spark_df = self.sql_context.createDataFrame(df)
            feature_columns = spark_df.columns
            inp_data = spark_df.select(*feature_columns).map(lambda x: list(x))
            result = self.model.predict(inp_data).collect()
            LOGGER.info("Predicted output is %s"%str(result))
            return result

        except Exception as e:
            raise e

    def load(self, location):
        try:
            self.model = LogisticRegressionModel.load(self.sc, location)
        except Exception as e:
            raise e
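A hypothetical usage of the wrapper above with a small pandas frame; column names and the model path are placeholders:

import pandas as pd

if __name__ == '__main__':
    data = pd.DataFrame({'x1': [0.0, 1.0, 2.0, 3.0],
                         'x2': [1.0, 0.0, 1.0, 0.0],
                         'label': [0, 0, 1, 1]})

    clf = LogisticRegression()
    clf.train(data, target='label', num_of_iterations=50)
    clf.persist('/tmp/lr_model')
    print(clf.predict(data.drop('label', axis=1)))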