def setup(self, column, filter_size, array_size):
    if column == "null":
        raise NotImplementedError()
    # Shuffle the column so the filter values are spread across the array.
    ser = (
        get_dataframe_not_nested(array_size)
        .sample(frac=1.0)
        .reset_index(drop=True)[column]
    )
    self.arr = ser.values
    # Draw `filter_size` rows and deduplicate to get the filter values.
    self.value = ser.sample(n=filter_size).unique()
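# A hedged sketch of the timing method this setup presumably pairs with (the
# benchmark class itself is not shown here). filter_array_like is assumed to
# come from kartothek.serialization._generic; the method name is hypothetical.
def time_filter_array_like(self, column, filter_size, array_size):
    from kartothek.serialization._generic import filter_array_like

    # Apply an "in" filter of the sampled unique values against the array.
    filter_array_like(self.arr, "in", self.value)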
def dataset_to_copy(store):
    df = get_dataframe_not_nested(10)
    store_dataframes_as_dataset(
        dfs=[df],
        dataset_uuid=SRC_DS_UUID,
        store=store,
        partition_on=[df.columns[0]],
        secondary_indices=[df.columns[1]],
    )
def test_store_input_types(store_input_types, bound_store_dataframes):
    from kartothek.serialization.testing import get_dataframe_not_nested

    dataset_uuid = "dataset_uuid"
    df = get_dataframe_not_nested(10)

    assert bound_store_dataframes(
        [df],
        dataset_uuid=dataset_uuid,
        store=store_input_types,
        partition_on=[df.columns[0]],
        secondary_indices=[df.columns[1]],
    )
def setup(self, predicate):
    if predicate == "conjunctions":
        self.predicate = [
            [
                ("int16", ">", 123),
                ("int32", "<", 321),
                ("bool", "==", True),
                ("bool", "==", True),
            ]
        ]
    elif predicate == "disjunctions":
        self.predicate = [
            [("int16", ">", 123)],
            [("int32", "<", 321)],
            [("int32", "<", 321)],
            [("int32", "<", 321)],
        ]
    self.df = get_dataframe_not_nested(10**5)
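# A hedged sketch of the corresponding timing method (not shown above),
# assuming filter_df_from_predicates from kartothek.serialization._generic;
# the method name is hypothetical.
def time_filter_df_from_predicates(self, predicate):
    from kartothek.serialization._generic import filter_df_from_predicates

    # Evaluate the conjunction/disjunction built in setup() against the frame.
    filter_df_from_predicates(self.df, self.predicate)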
def test_store_input_types(store_input_types, bound_load_dataframes):
    from kartothek.io.eager import store_dataframes_as_dataset
    from kartothek.serialization.testing import get_dataframe_not_nested

    dataset_uuid = "dataset_uuid"
    df = get_dataframe_not_nested(10)

    store_dataframes_as_dataset(
        dfs=[df],
        dataset_uuid=dataset_uuid,
        store=store_input_types,
        partition_on=[df.columns[0]],
        secondary_indices=[df.columns[1]],
    )

    # Use predicates to trigger partition pruning with indices
    predicates = [
        [
            (df.columns[0], "==", df.loc[0, df.columns[0]]),
            (df.columns[1], "==", df.loc[0, df.columns[1]]),
        ]
    ]

    result = bound_load_dataframes(
        dataset_uuid=dataset_uuid,
        store=store_input_types,
        predicates=predicates,
        dates_as_object=True,
    )

    # Normalize the backend-specific return type to a single DataFrame.
    if isinstance(result, list):
        result = result[0]
    if isinstance(result, MetaPartition):
        result = result.data
    if isinstance(result, dict):
        result = result[SINGLE_TABLE]

    pdt.assert_frame_equal(result, df.head(1), check_dtype=False)
def setup(self, num_rows, chunk_size):
    self.df = get_dataframe_not_nested(num_rows)
    self.serialiser = ParquetSerializer(chunk_size=chunk_size)
    self.store = get_store_from_url("memory://")
    # Persist the frame once so the benchmarks can read it back.
    self.key = self.serialiser.store(self.store, "key_prefix", self.df)
    self.predicates = [[("int16", "==", 123)]]
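# A hedged sketch of a read benchmark this setup could feed, exercising
# predicate pushdown through ParquetSerializer.restore_dataframe. The method
# name is hypothetical; the restore_dataframe call is kartothek's public API.
def time_restore_with_predicates(self, num_rows, chunk_size):
    self.serialiser.restore_dataframe(
        self.store, self.key, predicates=self.predicates
    )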
def setup(self, column):
    if column == "null":
        raise NotImplementedError()
    # Shuffle the column, then pick a single scalar to filter against.
    self.arr = (
        get_dataframe_not_nested(10**5)
        .sample(frac=1.0)
        .reset_index(drop=True)[column]
        .values
    )
    self.value = self.arr[12345]
def dataframe_not_nested():
    return get_dataframe_not_nested(10)
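# Assuming the helper above is registered as a pytest fixture in its conftest
# (an assumption; the decorator is not visible in this excerpt), a test
# consumes it by parameter name. The test below is a hypothetical illustration.
def test_dataframe_not_nested_shape(dataframe_not_nested):
    # get_dataframe_not_nested(10) builds a 10-row sample frame.
    assert len(dataframe_not_nested) == 10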
# (tail of assert_hive_compat; hive_query, cursor, conn, and
#  selected_columns_and_dtypes are defined earlier in the function)
    print(f"Hive query: {hive_query}")
    cursor.execute(hive_query)

    # Get the column names from the SELECT-clause substring.
    selected_columns = [
        l.strip().split(" ")[0] for l in selected_columns_and_dtypes.splitlines()
    ]

    # Read the Hive table back into pandas.
    hive_df = pd.read_sql(f"SELECT * FROM {TABLE_NAME}", conn)
    hive_df.columns = selected_columns

    # Pyarrow stores timestamps as microseconds since epoch; scale to
    # nanoseconds so pandas parses them back into datetimes.
    hive_df["datetime64"] = pd.to_datetime(
        hive_df.loc[:, "datetime64"] * 1000, unit="ns"
    )
    # Hive returns dates as strings; parse them back into date objects.
    hive_df["date_"] = pd.to_datetime(
        hive_df.loc[:, "date_"], format="%Y-%m-%d"
    ).apply(lambda x: x.date())

    # Ignore dtypes for numeric comparisons (e.g. int32 vs. int64).
    pdt.assert_frame_equal(df[selected_columns], hive_df, check_dtype=False)
    print(f"Test completed for the following data types: {selected_columns}")


# Create dataset on local filesystem
store_factory = partial(storefact.get_store_from_url, f"hfs://{VOLUME_LOCATION}")

df = get_dataframe_not_nested(100)
# Rename because `date` and `null` are reserved words in Hive QL.
df = df.rename(columns={"date": "date_", "null": "null_"})

assert_hive_compat(df, store_factory, uuid="test")
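# A hedged sketch of what the omitted head of assert_hive_compat presumably
# does: store the frame as a kartothek dataset, then point an external Hive
# table at the resulting Parquet files. Everything below (the helper name,
# the connection handling, the CREATE TABLE outline) is an assumption for
# illustration, not the script's actual code.
def assert_hive_compat_head_sketch(df, store_factory, uuid):
    from kartothek.io.eager import store_dataframes_as_dataset

    # Write the dataset so its Parquet files land under VOLUME_LOCATION.
    dataset = store_dataframes_as_dataset(
        dfs=[df], dataset_uuid=uuid, store=store_factory
    )
    # A Hive connection would then be opened (e.g. via pyhive) and a query
    # along these lines issued before the comparison shown above:
    #   CREATE EXTERNAL TABLE <TABLE_NAME> (<columns and dtypes>)
    #   STORED AS PARQUET LOCATION '<VOLUME_LOCATION>/<uuid>/...'
    return dataset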