示例#1
0
    def test_sample(self):
        """Check that two samples drawn with the same seed convert to equal frames."""
        flights = self.ed_flights_small()

        # Draw both samples with identical arguments before converting either.
        draws = [
            flights.sample(n=10, random_state=self.SEED) for _ in range(2)
        ]
        left, right = (eland_to_pandas(draw) for draw in draws)

        assert_frame_equal(left, right)
示例#2
0
    def test_sample_head(self):
        """Check that head(5) of a sample matches head(5) of its pandas rebuild."""
        sampled = self.ed_flights_small().sample(n=10, random_state=self.SEED)

        # Pandas-side counterpart, produced by the build_from_index helper
        # from the converted sample.
        rebuilt = self.build_from_index(eland_to_pandas(sampled))

        expected_head = rebuilt.head(5)
        actual_head = eland_to_pandas(sampled.head(5))
        assert_frame_equal(expected_head, actual_head)
示例#3
0
    def test_sample_shape(self):
        """Check that a sample and its pandas rebuild report the same shape."""
        sampled = self.ed_flights_small().sample(n=10, random_state=self.SEED)
        rebuilt = self.build_from_index(eland_to_pandas(sampled))

        expected_shape = rebuilt.shape
        assert expected_shape == sampled.shape
示例#4
0
    def test_notna(self):
        """Filter by notna() on each column in self.columns and compare frames."""
        ed_frame = self.ed_ecommerce()
        pd_frame = eland_to_pandas(ed_frame)

        for name in self.columns:
            # Apply the same not-null mask on the eland and the pandas side.
            ed_kept = ed_frame[ed_frame[name].notna()]
            pd_kept = pd_frame[pd_frame[name].notna()]
            assert_pandas_eland_frame_equal(pd_kept, ed_kept)
示例#5
0
    def test_isna(self):
        """Filter by isna() on "geoip.region_name" and compare both frames."""
        column = "geoip.region_name"
        ed_frame = self.ed_ecommerce()
        pd_frame = eland_to_pandas(ed_frame)

        # Apply the same null mask on the eland and the pandas side.
        ed_missing = ed_frame[ed_frame[column].isna()]
        pd_missing = pd_frame[pd_frame[column].isna()]

        assert_pandas_eland_frame_equal(pd_missing, ed_missing)
示例#6
0
    def test_sample_basic(self):
        """Check that a converted sample equals its build_from_index rebuild."""
        sampled = self.ed_flights_small().sample(n=10, random_state=self.SEED)
        converted = eland_to_pandas(sampled)

        # Rebuild the pandas frame from the converted sample via the helper.
        rebuilt = self.build_from_index(converted)

        assert_frame_equal(rebuilt, converted)
示例#7
0
    def load_data_ids(self, path, dataset_name=''):
        """
        Load a dataset via ``self.load_data`` and return one column's values.
        :param path: String path to index in es
        :param dataset_name: String name of the dataset for logging
        :return: ``.values`` of the ``dt_col`` column after conversion to pandas
        """
        loaded = self.load_data(path, dataset_name)
        frame, column = loaded

        # Only the single column named by load_data is converted to pandas.
        selected = frame[column]
        return ed.eland_to_pandas(selected).values
示例#8
0
    def test_sample_on_boolean_filter(self):
        """Sample a four-column subset and compare it with its pandas rebuild.

        NOTE(review): despite the name, the body applies a column selection,
        not a boolean filter -- confirm which was intended.
        """
        selected = [
            "timestamp", "OriginAirportID", "DestAirportID", "FlightDelayMin"
        ]
        subset = self.ed_flights_small()[selected]
        sampled = subset.sample(n=5, random_state=self.SEED)

        converted = eland_to_pandas(sampled)
        rebuilt = self.build_from_index(converted)

        assert_frame_equal(rebuilt, converted)
示例#9
0
 def batch_to_memory(self, batch_start_dt, batch_end_dt, dataset_name, path):
     """
     Load one batch through ``self.load_data`` and convert it to pandas.
     :param batch_start_dt: start of the minibatch date
     :param batch_end_dt: end of the minibatch date
     :param dataset_name: dataset name for logging
     :param path: ES index name
     :return: result of ``ed.eland_to_pandas`` on the loaded frame
     """
     # The second element returned by load_data is not needed here.
     frame, _ = self.load_data(
         path, dataset_name, batch_start_dt, batch_end_dt)
     return ed.eland_to_pandas(frame)
示例#10
0
    def test_sample_frac_01(self):
        """Sample a fraction of the small flights frame and check its content and size.

        NOTE(review): the name suggests 0.1 but the fraction used is 0.15.
        """
        frac = 0.15
        sampled = self.ed_flights_small().sample(frac=frac,
                                                 random_state=self.SEED)
        converted = eland_to_pandas(sampled)
        rebuilt = self.build_from_index(converted)

        assert_frame_equal(rebuilt, converted)

        # The sample length must equal the rounded fraction of the full length.
        total_rows = len(self.pd_flights_small())
        expected_rows = int(round(frac * total_rows))
        assert len(rebuilt) == expected_rows
示例#11
0
    def test_to_csv_full(self):
        """Round-trip the flights frame through CSV and back into Elasticsearch.

        First writes ``self.ed_flights()`` to a CSV file, reads that file back
        with pandas and asserts it equals ``self.pd_flights()``. Then loads the
        same CSV into a uniquely named temporary index via ``ed.csv_to_eland``
        and converts it back to pandas. The temporary index is always deleted,
        even if a later step raises.
        """
        results_file = ROOT_DIR + "/dataframe/results/test_to_csv_full.csv"

        # Test is slow as it's for the full dataset, but it is useful as it goes over 10000 docs
        ed_flights = self.ed_flights()
        pd_flights = self.pd_flights()

        ed_flights.to_csv(results_file)
        # Converting back from csv is messy as pd_flights is created from a json file
        pd_from_csv = pd.read_csv(
            results_file,
            index_col=0,
            converters={
                "DestLocation": ast.literal_eval,
                "OriginLocation": ast.literal_eval,
            },
        )
        pd_from_csv.index = pd_from_csv.index.map(str)
        pd_from_csv.timestamp = pd.to_datetime(pd_from_csv.timestamp)

        assert_frame_equal(pd_flights, pd_from_csv)

        # Now read the csv to an index; a millisecond timestamp suffix keeps
        # the index name unique per run.
        now_millis = int(round(time.time() * 1000))
        test_index = FLIGHTS_INDEX_NAME + "." + str(now_millis)

        ed_flights_from_csv = ed.csv_to_eland(
            results_file,
            ES_TEST_CLIENT,
            test_index,
            index_col=0,
            es_refresh=True,
            es_type_overrides={
                "OriginLocation": "geo_point",
                "DestLocation": "geo_point",
            },
            converters={
                "DestLocation": ast.literal_eval,
                "OriginLocation": ast.literal_eval,
            },
        )
        try:
            pd_flights_from_csv = ed.eland_to_pandas(ed_flights_from_csv)

            # TODO - there is a 'bug' where the Elasticsearch index returns data in a different order to the CSV
            print(ed_flights_from_csv.head())
            print(pd_flights_from_csv.head())
        finally:
            # clean up index (keyword form works on both 7.x and 8.x clients)
            ES_TEST_CLIENT.indices.delete(index=test_index)
示例#12
0
    def test_pandas_to_eland_ignore_index(self):
        """Index a DataFrame without using its pandas index as the ES ids.

        Verifies three things after ``ed.pandas_to_eland`` with
        ``use_pandas_index_for_es_ids=False``:

        * the created index has the expected field mapping,
        * the data read back equals the original (ignoring index and row order),
        * none of the returned index labels match the original pandas index.

        The test index is deleted afterwards even if an assertion fails.
        """
        df = pd.DataFrame(
            data={
                "A": np.random.rand(3),
                "B": 1,
                "C": "foo",
                "D": pd.Timestamp("20190102"),
                "E": [1.0, 2.0, 3.0],
                "F": False,
                "G": [1, 2, 3],
                "H": "Long text",  # text
                "I": "52.36,4.83",  # geo point
            },
            index=["0", "1", "2"],
        )

        # Now create index
        index_name = "test_pandas_to_eland_ignore_index"

        ed_df = ed.pandas_to_eland(
            df,
            ES_TEST_CLIENT,
            index_name,
            es_if_exists="replace",
            es_refresh=True,
            use_pandas_index_for_es_ids=False,
            es_type_overrides={
                "H": "text",
                "I": "geo_point"
            },
        )

        try:
            # Check types
            expected_types = {
                "A": "double",
                "B": "long",
                "C": "keyword",
                "D": "date",
                "E": "double",
                "F": "boolean",
                "G": "long",
                "H": "text",
                "I": "geo_point",
            }
            expected_mapping = {
                "test_pandas_to_eland_ignore_index": {
                    "mappings": {
                        "properties": {
                            field: {"type": es_type}
                            for field, es_type in expected_types.items()
                        }
                    }
                }
            }

            mapping = ES_TEST_CLIENT.indices.get_mapping(index=index_name)

            assert expected_mapping == mapping

            # Convert back to pandas and compare with original
            pd_df = ed.eland_to_pandas(ed_df)

            # Compare values excluding index. The previous check compared
            # `.values.all()` of both frames, i.e. two scalar reductions, and
            # never looked at the data. Rows are sorted on the unique column
            # "G" because the index labels differ and row order is not assumed.
            expected = df.sort_values("G").reset_index(drop=True)
            actual = (pd_df[list(df.columns)]
                      .sort_values("G")
                      .reset_index(drop=True))
            pd.testing.assert_frame_equal(expected, actual, check_dtype=False)

            # Ensure that index is populated by ES.
            assert not (df.index == pd_df.index).any()
        finally:
            ES_TEST_CLIENT.indices.delete(index=index_name)
示例#13
0
 def test_eland_to_pandas_performance(self):
     """Convert the frame from ``self.ed_flights()`` to pandas with progress shown."""
     # TODO quantify this
     flights = self.ed_flights()
     ed.eland_to_pandas(flights, show_progress=True)