def extract_pattern(l):
    ptn1 = r".*?\d+0000000000000000(.*?)\s+(\d{4})\.\s+0\..*?"
    ptn2 = r".*?\s+0000000000000000(.*?)\s+(\d{4})\.\s+0\..*?"
    ptn3 = r".*?\d+0000000000000000(.*?)0\..*?"
    ptn4 = r".*?\s+0000000000000000(.*?)0\..*?"

    m1 = re.search(ptn1, l)
    m2 = re.search(ptn2, l)
    m3 = re.search(ptn3, l)
    m4 = re.search(ptn4, l)

    if m1 is not None:
        return Row(Merchant=m1.group(1),
                   Category=m1.group(2),
                   Label='merchant')
    elif m2 is not None:
        return Row(Merchant=m2.group(1),
                   Category=m2.group(2),
                   Label='merchant')
    elif m3 is not None:
        return Row(Merchant=m3.group(1), Category=None, Label='ept')
    elif m4 is not None:
        return Row(Merchant=m4.group(1), Category=None, Label='ept')
    else:
        return Row(Merchant=l, Category=None, Label='error')
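A minimal usage sketch, assuming the function is mapped over an RDD of raw statement lines (the input path is illustrative; the imports also cover what extract_pattern itself relies on):

import re

from pyspark.sql import Row, SparkSession

spark = SparkSession.builder.getOrCreate()
sc = spark.sparkContext

# Parse every raw line into a labelled Row, then keep only the clean merchant matches.
parsed = sc.textFile("statements.txt").map(extract_pattern)
merchants = parsed.filter(lambda r: r.Label == 'merchant')
spark.createDataFrame(merchants).show(truncate=False)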
Example #2
    def mr_api(trip_rdd):
        """Execute Q1 using the RDD API (average coordinates per hour)."""

        # Keep necessary data
        trip_rdd = trip_rdd.map(lambda row: (Row(
            Hour=row._c1.split()[1][:2], Latitude=row._c3, Longitude=row._c4)))

        # Remove dirty rows
        filtered_rdd = trip_rdd.filter(lambda row: row.Latitude != '0' and
                                       row.Longitude != '0')

        # Convert coords to Float
        convert_to_float = filtered_rdd.map(
            lambda row: Row(Hour=row.Hour,
                            Latitude=float(row.Latitude),
                            Longitude=float(row.Longitude)))

        # Transform to (Hour, (Lat, Lon, 1.0))
        keys = convert_to_float.map(
            lambda row: (row.Hour, (row.Latitude, row.Longitude, 1.0)))

        # ReduceBy Hour and divide by total appearances
        accumulated = keys.reduceByKey(lambda a, b:
                                       (a[0] + b[0], a[1] + b[1], a[2] + b[2]))
        groups = accumulated.map(lambda row: (row[0], (row[1][0] / row[1][2],
                                                       row[1][1] / row[1][2])))

        # Sort By Hour
        sorted_groups = groups.sortByKey()
        list_groups = sorted_groups.collect()
        for element in list_groups:
            print(element)
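For comparison, the same per-hour average can be expressed with the DataFrame API instead of raw RDD transformations. A sketch assuming the same `_c1`/`_c3`/`_c4` input layout; `trip_df` is a hypothetical DataFrame holding those columns:

from pyspark.sql import functions as F

q1_df = (trip_df
         .select(F.substring(F.split("_c1", " ")[1], 1, 2).alias("Hour"),
                 F.col("_c3").cast("float").alias("Latitude"),
                 F.col("_c4").cast("float").alias("Longitude"))
         .where((F.col("Latitude") != 0) & (F.col("Longitude") != 0))
         .groupBy("Hour")
         .agg(F.avg("Latitude").alias("Latitude"),
              F.avg("Longitude").alias("Longitude"))
         .orderBy("Hour"))
q1_df.show(24)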
Example #3
def test_merge_schemas_simple(spark_session: SparkSession):
    df1 = spark_session.createDataFrame([
        (0, "potato", "0.5", "100"),
        (1, "onion", "0.5", "150"),
    ], ["id", "product", "weight", "price"])
    df2 = spark_session.createDataFrame([
        (2, "CyberPunk2077", 1, "3000"),
        (3, "TENET", 1, "2000"),
    ], ["id", "product", "amount", "price"])
    merger = SchemaMerging()
    result = merger.union(df1, df2)
    """
    Note: the actual result differs only in column order:
    [Row(id=0, product='potato', price='100', amount=None, weight='0.5'),
    Row(id=1, product='onion', price='150', amount=None, weight='0.5'),
    Row(id=2, product='CyberPunk2077', price='3000', amount=1, weight=None),
    Row(id=3, product='TENET', price='2000', amount=1, weight=None)]
    """
    assert result.collect() == [
        Row(id=0, product='potato', weight='0.5', price='100', amount=None),
        Row(id=1, product='onion', weight='0.5', price='150', amount=None),
        Row(id=2, product='CyberPunk2077', weight=None, price='3000',
            amount=1),
        Row(id=3, product='TENET', weight=None, price='2000', amount=1)
    ]
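Given the note above about column order, a sketch of an order-insensitive check that compares rows as plain dictionaries (same data as the assertion above, only the comparison style changes):

expected = [
    {"id": 0, "product": "potato", "weight": "0.5", "price": "100", "amount": None},
    {"id": 1, "product": "onion", "weight": "0.5", "price": "150", "amount": None},
    {"id": 2, "product": "CyberPunk2077", "weight": None, "price": "3000", "amount": 1},
    {"id": 3, "product": "TENET", "weight": None, "price": "2000", "amount": 1},
]
actual = sorted((r.asDict() for r in result.collect()), key=lambda d: d["id"])
assert actual == expected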
Example #4
        def _predict_per_partition_inner(ratings):
            """
            Predict per partition if grouped option is disabled

            Returns
            -------
            DataFrame
            """
            prev_user = None
            curr_user = None
            for r in ratings:
                curr_user = r["user"]
                if prev_user == curr_user:
                    PR_u[r["item"]] = r["rating"]
                else:
                    if prev_user:
                        for el in sorted(_predict_per_user(PR_u), key=lambda x: -x[1])[:top_N_ratings]:
                            yield Row(user=prev_user, item=el[0], rating_pred=el[1])
                    PR_u = dict()
                    PR_u[r["item"]] = r["rating"]
                    prev_user = curr_user
            # Emit values of the last user in the partition
            if curr_user and curr_user == prev_user:
                for el in sorted(_predict_per_user(PR_u), key=lambda x: -x[1])[:top_N_ratings]:
                    yield Row(user=prev_user, item=el[0], rating_pred=el[1])
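A sketch of how such a partition-level generator is typically wired up. It assumes `ratings_df` is a hypothetical DataFrame of (user, item, rating) rows, repartitioned and sorted so each user's ratings are contiguous within a partition, and that `spark` is available:

sorted_ratings = ratings_df.repartition('user').sortWithinPartitions('user')
predictions = spark.createDataFrame(
    sorted_ratings.rdd.mapPartitions(_predict_per_partition_inner))
predictions.show()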
Example #5
def test_merge_schemas_diff_types(spark_session: SparkSession):
    df1 = spark_session.createDataFrame([
        (0, "potato", "0.5", 100),
        (1, "onion", "0.5", 150),
    ], ["id", "product", "weight", "price"])
    df2 = spark_session.createDataFrame([
        (2, "apple", "1", "300"),
        (3, "pineapple", "1", "200"),
    ], ["id", "product", "weight", "price"])
    merger = SchemaMerging()
    result = merger.union(df1, df2)
    assert result.collect() == [
        Row(id=0,
            product='potato',
            weight='0.5',
            price_bigint='100',
            price_string=None),
        Row(id=1,
            product='onion',
            weight='0.5',
            price_bigint='150',
            price_string=None),
        Row(id=2,
            product='apple',
            weight='1',
            price_bigint='300',
            price_string=None),
        Row(id=3,
            product='pineapple',
            weight='1',
            price_bigint='200',
            price_string=None)
    ]
Example #6
def test_note_parsing(wikis, spark_session):
    transformed_df = wikis.withColumn(
        "found_on", ImageRecommendation.found_on).select("found_on")
    expected_df = spark_session.createDataFrame([
        Row(found_on=["ruwiki", "itwiki", "enwiki"]),
        Row(found_on=[""]),
        Row(found_on=None),
    ])
    assert_shallow_equals(transformed_df, expected_df)
Example #7
def test_get_top_campaigns(pyspark, purchase_attribution):
    result = get_top_campaigns(pyspark, purchase_attribution).collect()

    correct = [
        Row(campaignId='cmp1', revenue=300.5),
        Row(campaignId='cmp2', revenue=125.2)
    ]

    assert correct == result
Example #8
def test_get_top_channels(pyspark, sessions):
    result = get_top_channels(pyspark, sessions).collect()

    correct = [
        Row(campaignId='cmp1', channelId='Google Ads', performance=2),
        Row(campaignId='cmp2', channelId='Yandex Ads', performance=2)
    ]

    assert correct == result
Example #9
def sample_dataframe(context: TinyQueryContext):
    return context.createDataFrame([
        Row(a=1, b="x"),
        Row(a=2, b="z"),
        Row(a=3, b="z"),
    ],
                                   StructType([
                                       StructField('a', LongType()),
                                       StructField('b', StringType()),
                                   ]))
Example #10
File: moder.py Project: y1026/ml
    def extract_functions_from_row(self, row: Row):
        uastbytes = row[EngineConstants.Columns.Uast]
        if not uastbytes:
            return
        uast = self.parse_uast(uastbytes[0])
        template = row.asDict()
        for func, name in self.extract_functions_from_uast(uast):
            data = template.copy()
            data[EngineConstants.Columns.Uast] = [bytearray(self.serialize_uast(func))]
            data[EngineConstants.Columns.BlobId] += "_%s:%d" % (name, func.start_position.line)
            yield Row(**data)
Example #11
    def test_rf_rescale_per_tile(self):
        x1 = Tile(np.random.randint(-20, 42, (10, 10)), CellType.int8())
        x2 = Tile(np.random.randint(20, 242, (10, 10)), CellType.int8())
        df = self.spark.createDataFrame([Row(x=x1), Row(x=x2)])
        result = df.select(rf_rescale('x').alias('x_prime')) \
            .agg(rf_agg_stats('x_prime').alias('stat')) \
            .select('stat.min', 'stat.max') \
            .first()

        self.assertEqual(result[0], 0.0)
        self.assertEqual(result[1], 1.0)
Example #12
def test_unpivot_data_no_dynamic_cols(spark_session: SparkSession):
    df = spark_session.createDataFrame([(1, 'Ivan'), (2, 'Maria')],
                                       ['id', 'name'])

    unpivot_transform = Unpivot(['id', 'name'])

    actual_df = unpivot_transform.unpivot(df)
    assert actual_df.columns == ['id', 'name']
    assert actual_df.collect() == [
        Row(id=1, name='Ivan'),
        Row(id=2, name='Maria')
    ]
Example #13
def test_get_latest_transaction_date_returns_most_recent_date(spark):
    spark.createDataFrame([
        Row(date_of_purchase=datetime(2018, 12, 1, 4, 15, 0)),
        Row(date_of_purchase=datetime(2019, 3, 1, 14, 10, 0)),
        Row(date_of_purchase=datetime(2019, 2, 1, 14, 9, 59)),
        Row(date_of_purchase=datetime(2019, 1, 2, 19, 14, 20))
    ]).createOrReplaceTempView("raw_transactions")

    expected = datetime(2019, 3, 1, 14, 10, 0)
    actual = get_latest_transaction_date(spark)

    assert actual == expected
Example #14
    def execute(spark_sql_shuffle_partitions=200):
        sc = SparkContext(conf=SparkConf()
                          .setAppName("sample")
                          .setMaster("local")
                          .set("spark.sql.shuffle.partitions", spark_sql_shuffle_partitions))
        spark = SparkSession(sc)
        # toDF() is only available once a SparkSession has been created
        employee = sc.parallelize([Row(name="Bob"), Row(name="Alice")]).toDF()
        department = sc.parallelize([Row(name="Bob", department="Accounts", age="30"),
                                     Row(name="Alice", department="Sales", age="20")]).toDF()
        start = timer()

        employee.repartition(5).where("name = 'Bob'").explain()

        employee.join(department, "name").show()
        print("spark_sql_shuffle_partitions = {}, time = {}".format(
            spark_sql_shuffle_partitions, timer() - start))
        sc.stop()
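A sketch of how the experiment might be driven to compare shuffle-partition settings (the values below are illustrative):

for partitions in (1, 8, 200):
    execute(partitions)

Each call builds and stops its own local SparkContext, so the printed join times can be compared across settings.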
Example #15
File: basic.py Project: absognety/ml
    def deserialize_uast(self, row: Row):
        if not row[EngineConstants.Columns.Uast]:
            return
        row_dict = row.asDict()
        row_dict[EngineConstants.Columns.Uast] = []
        for i, uast in enumerate(row[EngineConstants.Columns.Uast]):
            try:
                row_dict[EngineConstants.Columns.Uast].append(self.parse_uast(uast))
            except:  # nopep8
                self._log.error("\nBabelfish Error: Failed to parse uast for document %s for uast "
                                "#%s" % (row[Uast2BagFeatures.Columns.document], i))
        yield Row(**row_dict)
Example #16
File: test_tfidf.py Project: y1026/ml
    def test_call(self):
        baseline = {
            Row(d=dict(i)["d"], t=dict(i)["t"],
                v=log_tf_log_idf(dict(i)["v"], int(dict(i)["t"]), self.docs))
            for i in tfidf_data.term_freq_result
        }

        result = self.tfidf(
            self.session.sparkContext
                .parallelize(tfidf_data.term_freq_result)
                .map(lambda x: Row(**dict(x)))).collect()
        self.assertEqual(set(result), baseline)
Example #17
    def adjustRow(row: Row) -> Row:
        rowDict = row.asDict()
        parentRow = rowDict.pop(parentColName)
        if parentRow is None:
            return row

        parentDict = parentRow.asDict()
        newChildValue = mapping(parentDict[childColName])
        if not addAsNewChild:
            parentDict.pop(childColName)
        parentDict[newChildColName] = newChildValue
        rowDict[parentColName] = Row(**parentDict)
        return Row(**rowDict)
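A sketch of the typical call site, from within the enclosing scope that supplies `parentColName`, `childColName`, `newChildColName`, `mapping`, and `addAsNewChild` (`df` and `spark` are hypothetical here):

    adjusted_df = spark.createDataFrame(df.rdd.map(adjustRow))

Because a new child field may be added, supplying an explicit schema to createDataFrame can be safer than relying on inference when some parent values are None.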
Example #18
    def test_unpersist_rdd(self):
        # Arrange
        rdd = self.context.spark.sparkContext.parallelize([Row(column1='aa')])
        second_rdd = self.context.spark.sparkContext.parallelize(
            [Row(column1='bb')])
        cardo_dataframe = CardoDataFrame(rdd, '')
        cardo_dataframe.persist()
        cardo_dataframe.rdd = second_rdd

        # Act
        cardo_dataframe.unpersist()

        # Assert
        self.assertFalse(rdd.is_cached)
Example #19
def main():
    business_rdd = sc.textFile('dataset/yelp_academic_dataset_business.json')\
        .map(json.loads)\
        .map(lambda x: (x['business_id'],x['name'], x['city'], int(x['stars']), int(x['review_count']),
                        int(x['is_open']), x['categories']))\
        .filter(lambda x: x[4] > 0)\
        .filter(lambda x: x[5] == 1)\
        .filter(lambda x: x[6] and len(x[6])>0 and 'Restaurants' in x[6]).cache()
    business_dict = business_rdd.map(lambda x: x[0]).distinct().zipWithIndex().map(lambda x: (x[0], x[1]+1)).collectAsMap()
    with open('business_dict', 'w') as f:
        f.write(json.dumps(business_dict))

    business_rdd = business_rdd\
        .map(lambda x: (business_dict[x[0]], x[1], x[2], x[3], x[4], x[5], x[6]))\
        .map(lambda x: Row(business_id=x[0], name=x[1], city=x[2], stars=x[3], review_count=x[4], categories=x[6]))

    business_df = spark.createDataFrame(business_rdd)
    business_df.write.parquet('output/business.parquet')

    review_rdd = sc.textFile('dataset/yelp_academic_dataset_review.json')\
        .map(json.loads)\
        .map(lambda x: (x['review_id'],x['user_id'], x['business_id'], int(x['stars'])))\
        .filter(lambda x: x[2] in business_dict).cache()
    user_list_in_reviews = review_rdd.map(lambda x: x[1]).distinct().collect()
    user_rdd = sc.textFile('dataset/yelp_academic_dataset_user.json') \
        .map(json.loads) \
        .map(lambda x: (x['user_id'], x['name'], int(x['review_count']))) \
        .filter(lambda x: x[2] > 0)\
        .filter(lambda x: x[0] in user_list_in_reviews).cache()
    user_dict = user_rdd.map(lambda x: x[0]).distinct().zipWithIndex().map(lambda x: (x[0], x[1] + 1)).collectAsMap()
    with open('user_dict', 'w') as f:
        f.write(json.dumps(user_dict))
    user_rdd = user_rdd \
        .map(lambda x: (user_dict[x[0]], x[1], x[2])) \
        .map(lambda x: Row(user_id=x[0], name=x[1], review_count=x[2]))
    user_df = spark.createDataFrame(user_rdd)
    user_df.write.parquet('output/user.parquet')
    review_dict = review_rdd.map(lambda x: x[0]).distinct().zipWithIndex().map(lambda x: (x[0], x[1]+1)).collectAsMap()
    with open('review_dict', 'w') as f:
        f.write(json.dumps(review_dict))
    review_rdd = review_rdd\
        .map(lambda x: (review_dict[x[0]] if x[0] in review_dict else 0,
                        user_dict[x[1]] if x[1] in user_dict else 0,
                        business_dict[x[2]] if x[2] in business_dict else 0,
                        x[3]))\
        .filter(lambda x: x[0] != 0 and x[1] != 0 and x[2] != 0)\
        .map(lambda x: Row(review_id=x[0], user_id=x[1], business_id=x[2], stars=x[3]))
    review_df = spark.createDataFrame(review_rdd)

    review_df.write.parquet('output/review.parquet')
Example #20
def join_table(context):
    df = context.createDataFrame([
        Row(a=1, c=7, d="a"),
        Row(a=2, c=9, d="b"),
        Row(a=2, c=11, d="c"),
    ],
                                 StructType([
                                     StructField('a', LongType()),
                                     StructField('c', LongType()),
                                     StructField('d', StringType()),
                                 ]))
    table_name = 'join_table'
    df.write.saveAsTable(table_name)
    yield context.read.table(table_name)
    context.deleteTable(table_name)
Example #21
def test_merge_schemas_no_common(spark_session: SparkSession):
    df1 = spark_session.createDataFrame([
        ('uuid1', "honda", "50000"),
        ('uuid2', "toyota", "60000"),
    ], ["uuid", "car", "mileage"])
    df2 = spark_session.createDataFrame([
        (2, "apple", "1", "300"),
        (3, "pineapple", "1", "200"),
    ], ["id", "product", "weight", "price"])
    merger = SchemaMerging()
    result = merger.union(df1, df2)
    """
    Note: the actual result differs only in column order:
    [Row(id=None, weight=None, product=None, price=None, uuid='uuid1', car='honda', mileage='50000'),
    Row(id=None, weight=None, product=None, price=None, uuid='uuid2', car='toyota', mileage='60000'),
    Row(id=2, weight='1', product='apple', price='300', uuid=None, car=None, mileage=None),
    Row(id=3, weight='1', product='pineapple', price='200', uuid=None, car=None, mileage=None)]
    """
    assert result.collect() == [
        Row(uuid='uuid1',
            car='honda',
            mileage='50000',
            id=None,
            product=None,
            weight=None,
            price=None),
        Row(uuid='uuid2',
            car='toyota',
            mileage='60000',
            id=None,
            product=None,
            weight=None,
            price=None),
        Row(uuid=None,
            car=None,
            mileage=None,
            id=2,
            product='apple',
            weight='1',
            price='300'),
        Row(uuid=None,
            car=None,
            mileage=None,
            id=3,
            product='pineapple',
            weight='1',
            price='200')
    ]
Example #22
    def process(time, rdd):
        print("========= %s =========" % str(time))

        if rdd.isEmpty():
            return

        try:
            # Get the singleton instance of SparkSession
            spark = getSparkSessionInstance(rdd.context.getConf())

            # Convert RDD[String] to RDD[Row] to DataFrame
            rowRdd = rdd.map(lambda w: Row(word=w))
            wordsDataFrame = spark.createDataFrame(rowRdd)

            # Creates a temporary view using the DataFrame.
            wordsDataFrame.createOrReplaceTempView("words")

            # Do word count on table using SQL and print it
            wordCountsDataFrame = \
                spark.sql("select word, count(*) as word_count from words group by word")
            wordCountsDataFrame.show()

            wordCountsDataFrame.write \
                .format("jdbc") \
                .option("url", url) \
                .option("driver", "org.mariadb.jdbc.Driver") \
                .option("dbtable", table_name) \
                .option("user", username) \
                .option("password", pasword) \
                .save(mode="append")

        except Exception as e:
            print("Something went wrong!")
            print(e)
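For context, this callback follows the pattern of Spark Streaming's SQL word-count examples; a sketch of how it is typically attached, assuming `ssc` is the StreamingContext and `words` is a DStream of words built elsewhere in the same scope:

    words.foreachRDD(process)
    ssc.start()
    ssc.awaitTermination()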
Example #23
def dict_to_spark_row(unischema, row_dict):
    """Converts a single row into a spark Row object.

    Verifies that the data conforms to the unischema definition types and encodes the data using the codec specified
    by the unischema.

    The parameters are keywords to allow use of functools.partial.

    :param unischema: an instance of Unischema object
    :param row_dict: a dictionary where the keys match name of fields in the unischema.
    :return: a single pyspark.Row object
    """
    assert isinstance(unischema, Unischema)
    # Add null fields. Be careful not to mutate the input dictionary - that would be an unexpected side effect
    copy_row_dict = copy.copy(row_dict)
    insert_explicit_nulls(unischema, copy_row_dict)

    if set(copy_row_dict.keys()) != set(unischema.fields.keys()):
        raise ValueError(
            'Dictionary fields \n{}\n do not match schema fields \n{}'.format(
                '\n'.join(sorted(copy_row_dict.keys())),
                '\n'.join(unischema.fields.keys())))

    encoded_dict = {}
    for field_name, value in copy_row_dict.items():
        schema_field = unischema.fields[field_name]
        if value is None:
            if not schema_field.nullable:
                raise ValueError(
                    'Field {} is not "nullable", but was passed a None value'.format(field_name))
        encoded_dict[field_name] = schema_field.codec.encode(
            schema_field, value) if value is not None else None

    return Row(**encoded_dict)
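A minimal sketch of how dict_to_spark_row is typically paired with a Unischema, mirroring Petastorm's hello-world example; `sc` and `spark` are assumed to exist, and the schema and values here are illustrative:

import numpy as np
from pyspark.sql.types import IntegerType

from petastorm.codecs import NdarrayCodec, ScalarCodec
from petastorm.unischema import Unischema, UnischemaField

HelloSchema = Unischema('HelloSchema', [
    UnischemaField('id', np.int32, (), ScalarCodec(IntegerType()), False),
    UnischemaField('image', np.uint8, (32, 32), NdarrayCodec(), False),
])

# Build plain dicts, encode them with the schema's codecs, and load into a DataFrame.
rows_rdd = sc.parallelize(range(10)) \
    .map(lambda i: {'id': i, 'image': np.zeros((32, 32), dtype=np.uint8)}) \
    .map(lambda d: dict_to_spark_row(HelloSchema, d))
df = spark.createDataFrame(rows_rdd, HelloSchema.as_spark_schema())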
Example #24
        def fn(rows):
            import math
            import tensorflow as tf
            import tensorflow.keras.backend as K

            if GPU_INFERENCE_ENABLED:
                from pyspark import TaskContext
                config = tf.ConfigProto()
                config.gpu_options.allow_growth = True
                config.gpu_options.visible_device_list = TaskContext.get().resources()['gpu'].addresses[0]
                K.set_session(tf.Session(config=config))
            else:
                # Do not use GPUs for prediction, use single CPU core per task.
                config = tf.ConfigProto(device_count={'GPU': 0})
                config.inter_op_parallelism_threads = 1
                config.intra_op_parallelism_threads = 1
                K.set_session(tf.Session(config=config))

            # Restore from checkpoint.
            model = deserialize_model(model_bytes, tf.keras.models.load_model)

            # Perform predictions.
            for row in rows:
                fields = row.asDict().copy()
                # Convert from log domain to real Sales numbers.
                log_sales = model.predict_on_batch([[row[col]] for col in all_cols])[0]
                # Add 'Sales' column with prediction results.
                fields['Sales'] = math.exp(log_sales)
                yield Row(**fields)
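A sketch of how this partition function is typically applied to score a test set (assuming `test_df`, `all_cols`, `model_bytes`, and `spark` are defined in the enclosing scope, as in Horovod's Keras examples):

        pred_df = spark.createDataFrame(test_df.rdd.mapPartitions(fn))
        pred_df.select('Sales').show(5)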
Example #25
    def test_rf_local_is_in(self):
        from pyspark.sql.functions import lit, array, col
        from pyspark.sql import Row

        nd = 5
        t = Tile(np.array([[1, 3, 4], [nd, 0, 3]]),
                 CellType.uint8().with_no_data_value(nd))
        # note the convert is due to issue #188
        df = self.spark.createDataFrame([Row(t=t)]) \
            .withColumn('a', array(lit(3), lit(4))) \
            .withColumn('in2', rf_convert_cell_type(
                rf_local_is_in(col('t'), array(lit(0), lit(4))),
                'uint8')) \
            .withColumn('in3', rf_convert_cell_type(rf_local_is_in('t', 'a'), 'uint8')) \
            .withColumn('in4', rf_convert_cell_type(
                rf_local_is_in('t', array(lit(0), lit(4), lit(3))),
                'uint8')) \
            .withColumn('in_list', rf_convert_cell_type(rf_local_is_in(col('t'), [4, 1]), 'uint8'))

        result = df.first()
        self.assertEqual(result['in2'].cells.sum(), 2)
        assert_equal(result['in2'].cells, np.isin(t.cells, np.array([0, 4])))
        self.assertEqual(result['in3'].cells.sum(), 3)
        self.assertEqual(result['in4'].cells.sum(), 4)
        self.assertEqual(
            result['in_list'].cells.sum(), 2,
            "Tile value {} should contain two 1s as: [[1, 0, 1],[0, 0, 0]]".
            format(result['in_list'].cells))
Example #26
    def test_mask(self):
        from pyspark.sql import Row
        from pyrasterframes.rf_types import Tile, CellType

        np.random.seed(999)
        # importantly exclude 0 from the range because that's the nodata value for the `data_tile`'s cell type
        ma = np.ma.array(np.random.randint(1, 10, (5, 5), dtype='int8'),
                         mask=np.random.rand(5, 5) > 0.7)
        expected_data_values = ma.compressed().size
        expected_no_data_values = ma.size - expected_data_values
        self.assertTrue(expected_data_values > 0,
                        "Make sure random seed is cooperative ")
        self.assertTrue(expected_no_data_values > 0,
                        "Make sure random seed is cooperative ")

        data_tile = Tile(np.ones(ma.shape, ma.dtype), CellType.uint8())

        df = self.spark.createDataFrame([Row(t=data_tile, m=Tile(ma))]) \
            .withColumn('masked_t', rf_mask('t', 'm'))

        result = df.select(rf_data_cells('masked_t')).first()[0]
        self.assertEqual(
            result, expected_data_values,
            f"Masked tile should have {expected_data_values} data values but found: {df.select('masked_t').first()[0].cells}."
            f"Original data: {data_tile.cells}"
            f"Masked by {ma}")

        nd_result = df.select(rf_no_data_cells('masked_t')).first()[0]
        self.assertEqual(nd_result, expected_no_data_values)

        # deser of tile is correct
        self.assertEqual(
            df.select('masked_t').first()[0].cells.compressed().size,
            expected_data_values)
Example #27
    def test_mask_bits(self):
        t = Tile(42 * np.ones((4, 4), 'uint16'), CellType.uint16())
        # with a variety of known values
        mask = Tile(
            np.array([[1, 1, 2720, 2720], [1, 6816, 6816, 2756],
                      [2720, 2720, 6900, 2720], [2720, 6900, 6816, 1]]),
            CellType('uint16raw'))

        df = self.spark.createDataFrame([Row(t=t, mask=mask)])

        # removes fill value 1
        mask_fill_df = df.select(
            rf_mask_by_bit('t', 'mask', 0, True).alias('mbb'))
        mask_fill_tile = mask_fill_df.first()['mbb']

        self.assertTrue(mask_fill_tile.cell_type.has_no_data())

        self.assertEqual(
            mask_fill_df.select(rf_data_cells('mbb')).first()[0], 16 - 4)

        # mask out 6816, 6900
        mask_med_hi_cir = df.withColumn('mask_cir_mh',
                                        rf_mask_by_bits('t', 'mask', 11, 2, [2, 3])) \
            .first()['mask_cir_mh'].cells

        self.assertEqual(mask_med_hi_cir.mask.sum(), 5)
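The masked values above can be checked by hand: rf_mask_by_bits('t', 'mask', 11, 2, [2, 3]) appears to mask cells whose 2-bit field starting at bit 11 is 2 or 3, which is easy to verify for the mask values used:

for v in (1, 2720, 2756, 6816, 6900):
    print(v, (v >> 11) & 0b11)  # -> 0, 1, 1, 3, 3; only 6816 and 6900 fall in [2, 3]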
Example #28
    def test_agg_local_mean(self):
        from pyspark.sql import Row
        from pyrasterframes.rf_types import Tile

        # this is really testing the nodata propagation in the agg local summation
        ct = CellType.int8().with_no_data_value(4)
        df = self.spark.createDataFrame([
            Row(tile=Tile(np.array([[1, 2, 3, 4, 5, 6]]), ct)),
            Row(tile=Tile(np.array([[1, 2, 4, 3, 5, 6]]), ct)),
        ])

        result = df.agg(rf_agg_local_mean('tile').alias('mean')).first().mean

        expected = Tile(np.array([[1.0, 2.0, 3.0, 3.0, 5.0, 6.0]]),
                        CellType.float64())
        self.assertEqual(result, expected)
Example #29
class HourFeaturesBuilder(object):
    def __init__(self):
        self._ensure_structure()

    def get_features(self, station, timestamp, window_size=24):
        hour = timestamp.hour

        # e.g. for hour=3 generates indices 2, 1, 0, 23, 22, ...
        indices = head(
            chain(
                xrange(hour-1, -1, -1),
                xrange(24-1, hour-1, -1)
            ),
            window_size
        )

        res = {}
        for i, hour in enumerate(indices):
            hour_data = redis_client.hgetall(self._get_station_hour_key(hour, station))
            try:
                res['n_rents_{}_hb'.format(i)] = int(hour_data['n_rents'])
                res['n_returns_{}_hb'.format(i)] = int(hour_data['n_returns'])
            except Exception as e:
                logger.warn('station {} on hour {} has missing or malformed data: {}'.format(station, hour, e))
                return

        # XXX should it return the dict or the row?
        return Row(**res)
Example #30
    def to_petastorm(row):
        import numpy as np
        from pyspark.sql import Row

        converted = {}
        for col in schema_cols:
            col_data = row[col]
            if isinstance(col_data, Vector):
                intermediate_format = metadata[col][
                    'intermediate_format'] if metadata else ARRAY
                if intermediate_format == ARRAY:
                    converted[col] = col_data.toArray().tolist()
                elif intermediate_format == CUSTOM_SPARSE:
                    # Currently petastorm does not support reading pyspark sparse vector. We put
                    # the indices and values into one array. when consuming the data, we re-create
                    # the vector from this format.
                    size = len(col_data.indices)
                    padding_zeros = 2 * (metadata[col]['max_size'] -
                                         len(col_data.indices))

                    converted[col] = np.concatenate(
                        (np.array([size]), col_data.indices, col_data.values,
                         np.zeros(padding_zeros))).tolist()

        if converted:
            row = row.asDict().copy()
            row.update(converted)
        return Row(**row)
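Based on the packing described in the comment above ([size, indices..., values..., zero padding...]), the consuming side would rebuild the sparse vector roughly like this. A sketch only: `dim` (the original vector length) is assumed to come from the column metadata, since it is not stored in the packed array.

import numpy as np
from pyspark.ml.linalg import SparseVector

def from_custom_sparse(packed, dim):
    # packed = [size, indices..., values..., zero padding...]
    arr = np.asarray(packed)
    size = int(arr[0])
    indices = arr[1:1 + size].astype(int)
    values = arr[1 + size:1 + 2 * size]
    return SparseVector(dim, indices, values)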