def test_sample(self):
    """Two samples drawn with the same ``random_state`` must be identical."""
    flights = self.ed_flights_small()

    # Draw both samples first, then materialise them in the same order.
    samples = [flights.sample(n=10, random_state=self.SEED) for _ in range(2)]
    first_pd, second_pd = (eland_to_pandas(sample) for sample in samples)

    assert_frame_equal(first_pd, second_pd)
def test_sample_head(self):
    """``head(5)`` of an eland sample matches ``head(5)`` of its pandas copy."""
    ed_sample = self.ed_flights_small().sample(n=10, random_state=self.SEED)
    pd_sample = self.build_from_index(eland_to_pandas(ed_sample))

    expected_head = pd_sample.head(5)
    actual_head = eland_to_pandas(ed_sample.head(5))

    assert_frame_equal(expected_head, actual_head)
def test_sample_shape(self):
    """The eland sample reports the same ``shape`` as its pandas copy."""
    ed_sample = self.ed_flights_small().sample(n=10, random_state=self.SEED)
    pd_sample = self.build_from_index(eland_to_pandas(ed_sample))

    expected_shape, actual_shape = pd_sample.shape, ed_sample.shape
    assert expected_shape == actual_shape
def test_notna(self):
    """Filtering on ``notna()`` gives the same rows in eland and pandas.

    The check is repeated for every column name in ``self.columns``.
    """
    ed_frame = self.ed_ecommerce()
    pd_frame = eland_to_pandas(ed_frame)

    for name in self.columns:
        ed_mask = ed_frame[name].notna()
        ed_kept = ed_frame[ed_mask]

        pd_mask = pd_frame[name].notna()
        pd_kept = pd_frame[pd_mask]

        assert_pandas_eland_frame_equal(pd_kept, ed_kept)
def test_isna(self):
    """Filtering on ``isna()`` gives the same rows in eland and pandas."""
    column = "geoip.region_name"

    ed_frame = self.ed_ecommerce()
    pd_frame = eland_to_pandas(ed_frame)

    ed_missing = ed_frame[ed_frame[column].isna()]
    pd_missing = pd_frame[pd_frame[column].isna()]

    assert_pandas_eland_frame_equal(pd_missing, ed_missing)
def test_sample_basic(self):
    """A sampled eland frame equals the frame rebuilt by ``build_from_index``."""
    ed_sample = self.ed_flights_small().sample(n=10, random_state=self.SEED)
    actual = eland_to_pandas(ed_sample)

    # Rebuild the expected frame from the sampled frame (via its index).
    expected = self.build_from_index(actual)

    assert_frame_equal(expected, actual)
def load_data_ids(self, path, dataset_name=''):
    """
    Load a dataset and return the values of its ``dt_col`` column.

    The frame and the column name both come from ``self.load_data``; only
    that single column is converted to pandas.

    :param path: String path to index in es (forwarded to ``self.load_data``)
    :param dataset_name: String name of the dataset for logging
    :return: The ``.values`` of the selected column after conversion to pandas
    """
    frame, column = self.load_data(path, dataset_name)
    selected = ed.eland_to_pandas(frame[column])
    return selected.values
def test_sample_on_boolean_filter(self):
    """Sampling a column subset equals the frame rebuilt by ``build_from_index``.

    NOTE(review): despite the name, this test selects a list of columns
    before sampling; no boolean filter is applied here.
    """
    selected = ["timestamp", "OriginAirportID", "DestAirportID", "FlightDelayMin"]

    ed_subset = self.ed_flights_small()[selected]
    ed_sample = ed_subset.sample(n=5, random_state=self.SEED)

    actual = eland_to_pandas(ed_sample)
    expected = self.build_from_index(actual)

    assert_frame_equal(expected, actual)
def batch_to_memory(self, batch_start_dt, batch_end_dt, dataset_name, path):
    """
    Load one minibatch through ``self.load_data`` and convert it to pandas.

    :param batch_start_dt: start of the minibatch date
    :param batch_end_dt: end of the minibatch date
    :param dataset_name: dataset name for logging
    :param path: ES index name
    :return: Result of ``ed.eland_to_pandas`` applied to the loaded frame
    """
    # load_data also returns the date column name, which is not needed here.
    frame, _dt_col = self.load_data(
        path, dataset_name, batch_start_dt, batch_end_dt
    )
    return ed.eland_to_pandas(frame)
def test_sample_frac_01(self):
    """Sampling by ``frac`` returns a consistent frame of the expected size."""
    frac = 0.15

    ed_sample = self.ed_flights_small().sample(frac=frac, random_state=self.SEED)
    actual = eland_to_pandas(ed_sample)
    expected = self.build_from_index(actual)

    assert_frame_equal(expected, actual)

    # The sample must hold frac * (size of the small flights frame) rows.
    total_rows = len(self.pd_flights_small())
    expected_rows = int(round(frac * total_rows))
    assert len(expected) == expected_rows
def test_to_csv_full(self):
    """Round-trip the full flights dataset through CSV and back into ES.

    Steps:
      1. Write the eland flights frame to a CSV file.
      2. Read the CSV back with pandas and compare it to ``pd_flights``.
      3. Load the CSV into a uniquely named Elasticsearch index with
         ``ed.csv_to_eland`` and convert it back to pandas.

    The temporary index is always removed, even when a step fails.
    """
    results_file = ROOT_DIR + "/dataframe/results/test_to_csv_full.csv"

    # Test is slow as it's for the full dataset, but it is useful as it goes
    # over 10000 docs
    ed_flights = self.ed_flights()
    pd_flights = self.pd_flights()

    ed_flights.to_csv(results_file)

    # The location columns are written as Python literals; parse them back.
    location_converters = {
        "DestLocation": ast.literal_eval,
        "OriginLocation": ast.literal_eval,
    }

    # Converting back from csv is messy as pd_flights is created from a json
    # file
    pd_from_csv = pd.read_csv(
        results_file,
        index_col=0,
        converters=dict(location_converters),
    )
    pd_from_csv.index = pd_from_csv.index.map(str)
    pd_from_csv.timestamp = pd.to_datetime(pd_from_csv.timestamp)

    assert_frame_equal(pd_flights, pd_from_csv)

    # Now read the csv to an index. The millisecond suffix keeps the index
    # name unique between runs.
    now_millis = int(round(time.time() * 1000))
    test_index = FLIGHTS_INDEX_NAME + "." + str(now_millis)

    try:
        ed_flights_from_csv = ed.csv_to_eland(
            results_file,
            ES_TEST_CLIENT,
            test_index,
            index_col=0,
            es_refresh=True,
            es_type_overrides={
                "OriginLocation": "geo_point",
                "DestLocation": "geo_point",
            },
            converters=dict(location_converters),
        )
        pd_flights_from_csv = ed.eland_to_pandas(ed_flights_from_csv)

        # TODO - there is a 'bug' where the Elasticsearch index returns data
        # in a different order to the CSV
        print(ed_flights_from_csv.head())
        print(pd_flights_from_csv.head())
    finally:
        # Clean up the index. Keyword arguments are required by newer
        # Elasticsearch clients; ignore_unavailable avoids masking an earlier
        # failure if the index was never created.
        ES_TEST_CLIENT.indices.delete(index=test_index, ignore_unavailable=True)
def test_pandas_to_eland_ignore_index(self):
    """``pandas_to_eland`` with ``use_pandas_index_for_es_ids=False``.

    Verifies that:
      * the created index has the expected field mapping (including the
        ``text`` and ``geo_point`` overrides),
      * the data read back from Elasticsearch has the same values as the
        source frame, and
      * the document ids were generated by Elasticsearch rather than copied
        from the pandas index.

    The test index is always removed, even when an assertion fails.
    """
    df = pd.DataFrame(
        data={
            "A": np.random.rand(3),
            "B": 1,
            "C": "foo",
            "D": pd.Timestamp("20190102"),
            "E": [1.0, 2.0, 3.0],
            "F": False,
            "G": [1, 2, 3],
            "H": "Long text",  # text
            "I": "52.36,4.83",  # geo point
        },
        index=["0", "1", "2"],
    )

    # Now create index
    index_name = "test_pandas_to_eland_ignore_index"

    try:
        ed_df = ed.pandas_to_eland(
            df,
            ES_TEST_CLIENT,
            index_name,
            es_if_exists="replace",
            es_refresh=True,
            use_pandas_index_for_es_ids=False,
            es_type_overrides={"H": "text", "I": "geo_point"},
        )

        # Check types
        expected_mapping = {
            "test_pandas_to_eland_ignore_index": {
                "mappings": {
                    "properties": {
                        "A": {"type": "double"},
                        "B": {"type": "long"},
                        "C": {"type": "keyword"},
                        "D": {"type": "date"},
                        "E": {"type": "double"},
                        "F": {"type": "boolean"},
                        "G": {"type": "long"},
                        "H": {"type": "text"},
                        "I": {"type": "geo_point"},
                    }
                }
            }
        }

        mapping = ES_TEST_CLIENT.indices.get_mapping(index=index_name)

        assert expected_mapping == mapping

        # Convert back to pandas and compare with original
        pd_df = ed.eland_to_pandas(ed_df)

        # Compare values excluding index. `a.values.all() == b.values.all()`
        # only compares two booleans, so compare the frames themselves.
        # Rows are aligned on the unique "G" column because the ids (and
        # therefore the order) are assigned by Elasticsearch.
        expected = df.sort_values("G").reset_index(drop=True)
        actual = pd_df.sort_values("G").reset_index(drop=True)
        pd.testing.assert_frame_equal(expected, actual, check_dtype=False)

        # Ensure that index is populated by ES.
        assert not (df.index == pd_df.index).any()
    finally:
        # ignore_unavailable avoids masking an earlier failure if the index
        # was never created.
        ES_TEST_CLIENT.indices.delete(index=index_name, ignore_unavailable=True)
def test_eland_to_pandas_performance(self):
    """Convert the full flights frame to pandas with progress reporting on."""
    # TODO quantify this
    flights = self.ed_flights()
    ed.eland_to_pandas(flights, show_progress=True)