Example #1
def assert_results(result: DataFrame) -> None:
    """
    Shared asserts for the different formats of CSV file, all of which contain the same data.
    """
    # Assert
    assert result.count() == 3

    rows = result.collect()
    assert rows[1][0] == "2"
    assert rows[1][1] == "bar"
    assert rows[1][2] == "bar2"
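A minimal usage sketch for the helper above; the SparkSession setup and the CSV path are assumptions for illustration, not part of the original test:

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
# Any of the CSV variants will do; the path is illustrative.
result = spark.read.csv("data/example.csv", header=True)
assert_results(result)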
Example #2
def insert_df_xy(df: DataFrame,
                 name: str,
                 ws: str = "memory",
                 spatial_reference: int = 3857) -> None:
    """Create ephemeral point feature class from given dataframe.

    Note - It is assumed that the first two data fields are the point x/y values.

    :param df: A dataframe.
    :param name: The name of the feature class.
    :param ws: The feature class workspace.
    :param spatial_reference: The feature class spatial reference.
    """
    fields = _df_to_fields(df, 2)
    rows = df.collect()
    insert_rows_xy(rows, name, fields, ws, spatial_reference)
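A hedged usage sketch: it assumes the surrounding module (which defines `_df_to_fields` and `insert_rows_xy`) plus an ArcGIS/arcpy environment where the `memory` workspace is valid; the point data and output name are made up:

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
# The first two data fields must be the point x/y values (here in EPSG:3857 meters).
points = spark.createDataFrame(
    [(-13161875.0, 4035576.0, "a"), (-13160000.0, 4036000.0, "b")],
    ["x", "y", "label"])
insert_df_xy(points, "sample_points")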
Example #3
def naaccr_read_fwf(flat_file: DataFrame,
                    record_layout: DataFrame,
                    value_col: str = 'value',
                    exclude_pfx: str = 'reserved') -> DataFrame:
    """
    @param flat_file: as from spark.read.text()
                      typically with .value
    @param record_layout: as from http://datadictionary.naaccr.org/?c=7
                          with .start, .length, .xmlId
    """
    fields = [
        func.substring(flat_file[value_col], item.start,
                       item.length).alias(item.xmlId)
        for item in record_layout.collect()
        if not item.xmlId.startswith(exclude_pfx)
    ]  # type: List[Union[Column, str]]
    return flat_file.select(fields)
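A sketch of one way to call it, assuming the flat file and a CSV export of the record layout are on hand; both paths are illustrative:

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
flat_file = spark.read.text("naaccr_records.txt")      # one fixed-width record per line, in .value
record_layout = spark.read.csv("record_layout.csv",    # needs .start, .length, .xmlId columns
                               header=True, inferSchema=True)
wide = naaccr_read_fwf(flat_file, record_layout)
wide.show(5)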
Example #4
def insert_df_hex(df: DataFrame,
                  name: str,
                  size: float,
                  ws: str = "memory") -> None:
    """Create ephemeral polygon feature class from given dataframe.

    Note - It is assumed that the first field is the hex nume value.

    :param df: A dataframe.
    :param name: The name of the feature class.
    :param size: The hex size in meters.
    :param ws: The feature class workspace.
    """
    layout = Layout(size)
    fields = _df_to_fields(df, 1)
    rows = df.collect()
    with insert_cursor(name, fields, ws=ws, shape_format="") as cursor:
        for nume, *tail in rows:
            coords = Hex.from_nume(nume).to_coords(layout)
            cursor.insertRow((coords, *tail))
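An illustrative call, assuming the same module context (for `Layout`, `Hex`, and `insert_cursor`) and a DataFrame whose first field is the hex nume value; the nume/count data and the 500-meter size are made up:

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
# First field is the hex nume value, per the docstring; counts are illustrative.
hexes = spark.createDataFrame([(1234, 10), (1235, 42)], ["nume", "count"])
insert_df_hex(hexes, "sample_hexes", size=500.0)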
Example #5
def insert_df(df: DataFrame,
              name: str,
              ws: str = "memory",
              spatial_reference: int = 3857,
              shape_type: str = "POLYGON",
              shape_format: str = "WKB") -> None:
    """Create an ephemeral feature class given a dataframe.

    Note - it is assumed that the first data field is the shape field.

    :param df: A dataframe.
    :param name: The name of the feature class.
    :param ws: The output workspace. Default="memory".
    :param spatial_reference: The spatial reference id. Default=3857.
    :param shape_type: The feature class shape type (POINT,POLYGON,POLYLINE,MULTIPOINT). Default="POLYGON".
    :param shape_format: The shape format (WKB, WKT, ''). Default="WKB".
    """
    fields = _df_to_fields(df, 1)
    rows = df.collect()
    insert_rows(rows, name, fields, ws, spatial_reference, shape_type,
                shape_format)
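A hedged example call, assuming the first column carries the geometry; here WKT is used, so `shape_format` is overridden, and the polygon and names are illustrative:

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
# The first data field is the shape; coordinates are EPSG:3857 meters.
polys = spark.createDataFrame(
    [("POLYGON ((0 0, 0 1000, 1000 1000, 1000 0, 0 0))", "a")],
    ["shape", "label"])
insert_df(polys, "sample_polygons", shape_format="WKT")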
Example #6
    def synthesize_data(self,
                        stats_nom: DataFrame,
                        record_layout: DataFrame,
                        qty: int = 100) -> pd.DataFrame:
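        """Synthesize `qty` simulated records from nominal statistics.

        Builds the supporting views, simulates `qty` cases, and returns a
        pandas DataFrame with one row per case and one categorical column
        per xmlId, ordered by the record-layout start position.
        """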
        spark = self.__spark

        create_object(self.t_item_view, self.concepts_script, spark)
        create_object(self.txform_view, self.txform_script, spark)

        stats_nom.createOrReplaceTempView(self.agg_view)

        entity = spark.createDataFrame([(ix, ) for ix in range(0, qty)],
                                       ['case_index'])
        entity.createOrReplaceTempView(self.entity_view)
        # simulated_entity.limit(5).toPandas()

        for view in self.views:
            create_object(view, self.script, spark)
        spark.catalog.cacheTable(self.views[-1])

        # ISSUE: SQL goes in .sql files
        sim_records_nom = spark.sql('''
        select data.case_index, data.xmlId, data.value
        from simulated_naaccr_nom data
        join record_layout rl on rl.xmlId = data.xmlId
        join section on rl.section = section.section
        order by case_index, rl.start
        ''').toPandas()
        sim_records_nom = sim_records_nom.pivot(index='case_index',
                                                columns='xmlId',
                                                values='value')
        for col in sim_records_nom.columns:
            sim_records_nom[col] = sim_records_nom[col].astype('category')

        col_start = {row.xmlId: row.start for row in record_layout.collect()}
        sim_records_nom = sim_records_nom[sorted(
            sim_records_nom.columns, key=lambda xid: col_start[xid])]
        return sim_records_nom
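The enclosing class is not shown above, so any caller is speculative; a sketch, with `Synthesizer` as a placeholder name for whatever object owns this method and `stats_nom` / `record_layout` assumed to be prepared elsewhere:

# Placeholder names throughout; only the method call itself comes from the source.
synth = Synthesizer(spark)
sim = synth.synthesize_data(stats_nom, record_layout, qty=50)
sim.head()   # pandas DataFrame: one row per simulated case, columns in layout order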