def setUp(self):
    spark = SparkSession(sparkContext=self.sc)
    # 100 rows of 10 random integers: an 'id' column plus nine feature columns.
    data = np.random.randint(0, 9, [100, 10])
    row = pyspark.Row('id', *['feature_' + str(i) for i in range(9)])
    self.x_rdd = self.sc.parallelize(
        list(map(lambda x: row(*x), data.tolist())))
    self.x_df = spark.createDataFrame(self.x_rdd)
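This setUp assumes the test class already exposes a SparkContext as self.sc and that numpy, pyspark and SparkSession are imported at module level. A minimal sketch of that scaffolding (the class name and fixture choices are assumptions, not part of the original snippet) could look like:

import unittest

import numpy as np
import pyspark
from pyspark import SparkContext
from pyspark.sql import SparkSession

class RowDataFrameTest(unittest.TestCase):  # hypothetical test-class name
    @classmethod
    def setUpClass(cls):
        # One local SparkContext shared by every test in the class; setUp reuses it as self.sc.
        cls.sc = SparkContext(master='local[2]', appName='row-dataframe-test')

    @classmethod
    def tearDownClass(cls):
        cls.sc.stop()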
Example #2
def convert_rdd_raw(row):
    # row[0] : full path to file
    classname = get_classname_from_filename(row[0])
    # row[1] : features as string value
    values = row[1].strip('[]').split(',')
    values = [float(x) for x in values]
    return pyspark.Row(label=classname, features=values)
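A hedged usage sketch for convert_rdd_raw (the paths, feature strings and the get_classname_from_filename helper are assumptions based on the comments above; sc and spark stand for an existing SparkContext and SparkSession):

# Assumed input shape: (full path to file, feature vector rendered as a bracketed string).
raw_rdd = sc.parallelize([
    ('/data/cats/img_001.txt', '[0.1,0.2,0.3]'),
    ('/data/dogs/img_002.txt', '[0.4,0.5,0.6]'),
])
labelled_rdd = raw_rdd.map(convert_rdd_raw)   # -> pyspark.Row(label=..., features=[...])
labelled_df = spark.createDataFrame(labelled_rdd)
labelled_df.show()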
Example #3

def test_do_cartesian(self):
    spark_session = sql.SparkSession(self.sc)
    string_rdd = self.sc.parallelize(self.test_data).map(
        lambda x: pyspark.Row(id=x[0], label=x[1], vector=ml_linalg.DenseVector(x[2])))
    string_df = string_rdd.toDF()
    test_demon = do_cartesian(sc=self.sc, df=string_df, id_col='id', feature_col='vector')
    # Diagonal entries (i == j) compare each vector with itself, so they must equal 1.0.
    check_diagonal = test_demon.filter(lambda x: x.i == x.j).map(lambda x: x.value).collect()
    for diag in check_diagonal:
        self.assertEqual(1.0, diag)
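The self.test_data fixture is not shown; judging from how the mapping lambda unpacks each element, it is presumably a list of (id, label, feature-list) tuples. An illustrative shape (values are made up):

# Illustrative fixture inferred from the mapping lambda above, e.g. assigned in setUp:
self.test_data = [
    (0, 'a', [1.0, 0.0, 0.0]),
    (1, 'b', [0.0, 1.0, 0.0]),
    (2, 'c', [0.0, 0.0, 1.0]),
]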
Example #4
def dict_to_spark_row(unischema, row_dict):
    """Converts a single row into a spark Row object.

    Verifies that the data conforms to the unischema definition types and encodes the data using the codec specified
    by the unischema.

    The parameters are keywords to allow use of functools.partial.

    :param unischema: an instance of a Unischema object
    :param row_dict: a dictionary whose keys match the names of the fields in the unischema.
    :return: a single pyspark.Row object
    """

    # Lazy loading pyspark to avoid creating pyspark dependency on data reading code path
    # (currently works only with make_batch_reader)
    import pyspark

    assert isinstance(unischema, Unischema)
    # Add null fields. Be careful not to mutate the input dictionary - that would be an unexpected side effect
    copy_row_dict = copy.copy(row_dict)
    insert_explicit_nulls(unischema, copy_row_dict)

    if set(copy_row_dict.keys()) != set(unischema.fields.keys()):
        raise ValueError(
            'Dictionary fields \n{}\n do not match schema fields \n{}'.format(
                '\n'.join(sorted(copy_row_dict.keys())),
                '\n'.join(unischema.fields.keys())))

    encoded_dict = {}
    for field_name, value in copy_row_dict.items():
        schema_field = unischema.fields[field_name]
        if value is None:
            if not schema_field.nullable:
                raise ValueError(
                    'Field {} is not "nullable", but got passed a None value'.format(field_name))
        if schema_field.codec:
            encoded_dict[field_name] = schema_field.codec.encode(
                schema_field, value) if value is not None else None
        else:
            if isinstance(value, (np.generic, )):
                encoded_dict[field_name] = value.tolist()
            else:
                encoded_dict[field_name] = value

    field_list = list(unischema.fields.keys())
    # Generate a value list that matches the schema column order.
    value_list = [encoded_dict[name] for name in field_list]
    # Create a Row from the positional values, then attach the field names so the
    # result behaves like a named pyspark.Row.
    row = pyspark.Row(*value_list)
    row.__fields__ = field_list
    return row
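A hedged usage sketch of the functools.partial pattern the docstring alludes to (MySchema and the dictionary RDD are illustrative assumptions; sc and spark stand for an existing SparkContext and SparkSession):

import functools

# Illustrative: MySchema is assumed to be a Unischema instance defined elsewhere, and
# dict_rdd an RDD of plain dictionaries keyed by the schema's field names.
dict_rdd = sc.parallelize([{'id': 1, 'value': 0.5}, {'id': 2, 'value': 1.5}])
# Bind the schema as the first (unischema) argument; each dictionary then arrives as row_dict.
row_rdd = dict_rdd.map(functools.partial(dict_to_spark_row, MySchema))
df = spark.createDataFrame(row_rdd)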
Example #5
def dict_to_row(schema, row_dict):
    import pyspark
    err_msg = 'Dictionary fields \n{}\n do not match schema fields \n{}'\
        .format('\n'.join(sorted(row_dict.keys())), '\n'.join(schema.keys()))
    assert set(row_dict.keys()) == set(schema.keys()), err_msg

    row = {}
    for k, v in row_dict.items():
        schema_field = schema[k]
        if schema_field.feature_type == FeatureType.IMAGE:
            # Image fields hold a file path; read the raw bytes so Spark stores them as binary.
            image_path = v
            with open(image_path, "rb") as f:
                img_bytes = f.read()
            row[k] = bytearray(img_bytes)
        elif schema_field.feature_type == FeatureType.NDARRAY:
            # Serialize numpy arrays into a compressed in-memory .npz blob.
            memfile = BytesIO()
            np.savez_compressed(memfile, arr=v)
            row[k] = bytearray(memfile.getvalue())
        else:
            row[k] = v
    return pyspark.Row(**row)
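A hedged usage sketch for dict_to_row; the schema objects, field names and values here are illustrative stand-ins for whatever the surrounding module defines (the function also relies on io.BytesIO, numpy and FeatureType being available there):

import numpy as np

# Illustrative: `schema` maps field names to schema-field objects exposing a
# feature_type attribute (FeatureType comes from the same module as dict_to_row).
record = {'image': '/path/to/img.jpg', 'embedding': np.arange(4, dtype=np.float32)}
row = dict_to_row(schema, record)   # -> pyspark.Row with bytearray-encoded image and ndarray fields
row_rdd = sc.parallelize([record]).map(lambda d: dict_to_row(schema, d))
df = spark.createDataFrame(row_rdd)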
Example #6
def _load_match(provider: str, match_id: str) -> List[pyspark.Row]:
    # Load one match's event data from the given provider via the datasets helper.
    dataset = datasets.load(provider, match_id=match_id)
    pdf = kh.fix_kloppy_dataframe(dataset.to_pandas(all_passes=True))
    # Emit one pyspark.Row per event, tagged with the originating match id.
    return [pyspark.Row(match=match_id, **row) for row in pdf.to_dict(orient="records")]
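A hedged sketch of driving this helper from Spark (the provider name and match ids are illustrative; sc and spark stand for an existing SparkContext and SparkSession): distribute the match ids, load each match on the executors, and flatten the per-match row lists into one DataFrame.

# Illustrative ids; in practice these come from the provider's catalogue.
match_ids = ['12345', '67890']
rows_rdd = sc.parallelize(match_ids).flatMap(lambda mid: _load_match('statsbomb', mid))
matches_df = spark.createDataFrame(rows_rdd)
matches_df.groupBy('match').count().show()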