예제 #1
0
    def test_es_if_exists_append_mapping_mismatch(self):
        """Appending a frame that disagrees with the index mapping must fail.

        The index is first seeded from ``pd_df``; appending ``pd_df2`` is then
        expected to raise ``ValueError`` listing every column/type mismatch,
        and the seeded index content must remain equal to ``pd_df``.
        """
        target = {"es_client": ES_TEST_CLIENT, "es_dest_index": "test-index"}

        # Seed the destination index.
        seeded = pandas_to_eland(
            pd_df, es_if_exists="append", es_refresh=True, **target
        )

        # The second upload uses a frame that does not fit the mapping.
        with pytest.raises(ValueError) as exc_info:
            pandas_to_eland(pd_df2, es_if_exists="append", **target)

        expected_message = (
            "DataFrame dtypes and Elasticsearch index mapping aren't compatible:\n"
            "- 'b' is missing from DataFrame columns\n"
            "- 'c' is missing from DataFrame columns\n"
            "- 'd' is missing from DataFrame columns\n"
            "- 'Z' is missing from ES index mapping\n"
            "- 'a' column type ('keyword') not compatible with ES index mapping type ('long')"
        )
        assert str(exc_info.value) == expected_message

        # The failed append must not have altered what is stored in the index.
        assert_pandas_eland_frame_equal(pd_df, seeded)
예제 #2
0
    def test_es_if_exists_replace(self):
        """Exercise ``es_if_exists="replace"`` for creation and for overwriting.

        Uploads ``pd_df2``, then ``pd_df``, then ``pd_df2`` again into the same
        index and checks after each step that the index reflects only the
        most recently uploaded frame.
        """

        def upload_replacing(frame):
            # Every upload in this test targets the same index in replace mode.
            return pandas_to_eland(
                frame,
                es_client=ES_TEST_CLIENT,
                es_dest_index="test-index",
                es_if_exists="replace",
                es_refresh=True,
            )

        # 'replace' must also work when used to upload the first frame.
        first_roundtrip = upload_replacing(pd_df2).to_pandas()
        assert_frame_equal(pd_df2, first_roundtrip)

        # 'replace' swaps out both the mapping and the stored documents.
        replaced = upload_replacing(pd_df)
        assert_pandas_eland_frame_equal(pd_df, replaced)

        # Replacing back yields the same data as the very first upload.
        second_roundtrip = upload_replacing(pd_df2).to_pandas()
        assert_frame_equal(first_roundtrip, second_roundtrip)
예제 #3
0
    def test_es_if_exists_append_es_type_coerce_error(self):
        """Appending a value that does not fit the mapped ES type must fail.

        Column ``a`` is mapped as ``byte`` on the first upload; a later append
        containing 128 is expected to raise ``BulkIndexError`` mentioning the
        out-of-range value.
        """
        append_to_index = dict(
            es_client=ES_TEST_CLIENT,
            es_dest_index="test-index",
            es_if_exists="append",
        )

        seeded = pandas_to_eland(
            pd_df,
            es_refresh=True,
            es_type_overrides={"a": "byte"},
            **append_to_index,
        )
        assert_pandas_eland_frame_equal(pd_df, seeded)

        # Single row whose 'a' value (128) is too large for 'byte'.
        overflowing_row = pd.DataFrame(
            {"a": [128], "b": [-1.0], "c": ["A"], "d": [dt]},
            index=["3"],
        )

        with pytest.raises(BulkIndexError) as exc_info:
            pandas_to_eland(overflowing_row, **append_to_index)

        # The failure must be attributed to the value 128.
        assert "Value [128] is out of range for a byte" in str(exc_info.value)
예제 #4
0
    def test_es_type_override_error(self):
        """Type overrides naming columns absent from the frame raise ``KeyError``.

        Only five flight columns are kept, while ``es_type_overrides`` also
        names ``DistanceKilometers`` and ``DistanceMiles``; the upload must
        fail with a message listing exactly those two columns.
        """
        df = self.pd_flights().filter([
            "AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp",
            "DestCountry"
        ])

        index_name = "test_es_type_override"

        match = "'DistanceKilometers', 'DistanceMiles' column(s) not in given dataframe"
        with pytest.raises(KeyError) as e:
            ed.pandas_to_eland(
                df,
                ES_TEST_CLIENT,
                index_name,
                es_if_exists="replace",
                es_refresh=True,
                use_pandas_index_for_es_ids=False,
                es_type_overrides={
                    "AvgTicketPrice": "long",
                    "DistanceKilometers": "text",
                    "DistanceMiles": "text",
                },
            )

        # These statements must live outside the ``raises`` block: anything
        # placed after the raising call inside it is never executed.
        # ``str(KeyError(msg))`` is the repr of ``msg`` (extra quotes), so the
        # raw message is compared via ``args[0]`` instead.
        assert e.value.args[0] == match

        # The index may never have been created if validation failed first,
        # so a missing index must not turn cleanup into an error.
        ES_TEST_CLIENT.indices.delete(index=index_name, ignore_unavailable=True)
예제 #5
0
    def test_es_if_exists_fail(self):
        """A second upload to the same index without ``es_if_exists`` must fail.

        Both uploads leave ``es_if_exists`` unspecified; the second one is
        expected to raise ``ValueError`` explaining that the index exists.
        """
        upload_kwargs = {"es_client": ES_TEST_CLIENT, "es_dest_index": "test-index"}

        # First upload goes through.
        pandas_to_eland(pd_df, **upload_kwargs)

        # Repeating the identical upload must be rejected.
        with pytest.raises(ValueError) as exc_info:
            pandas_to_eland(pd_df, **upload_kwargs)

        expected_message = (
            "Could not create the index [test-index] because it "
            "already exists. Change the 'es_if_exists' parameter "
            "to 'append' or 'replace' data."
        )
        assert str(exc_info.value) == expected_message
예제 #6
0
 def save_output(self, df):
     """
     Append a dataframe to the configured Elasticsearch output index.

     The frame is re-indexed by its ``id`` column before upload, and the
     index is refreshed after the write.

     :param df: Dataframe to be saved in ES; must expose an ``id`` column.
     :return: None
     """
     loader_cfg = self.config.dataloader
     self.log.info('Saving Output: to {}'.format(loader_cfg.output_index))

     # Use the values of the 'id' column as the dataframe index.
     indexed = df.set_index(df.id)

     ed.pandas_to_eland(
         indexed,
         loader_cfg.elasticsearch_host,
         loader_cfg.output_index,
         es_if_exists="append",
         es_refresh=True,
     )
예제 #7
0
    def test_returns_eland_dataframe(self):
        """``pandas_to_eland`` returns a ``DataFrame`` bound to the target index."""
        result = pandas_to_eland(
            pd_df,
            es_client=ES_TEST_CLIENT,
            es_dest_index="test-index",
        )

        assert isinstance(result, DataFrame)

        # The reported info text must name the destination index pattern.
        info_text = result.es_info()
        assert "es_index_pattern: test-index" in info_text
예제 #8
0
    def test_generate_es_mappings(self):
        """Check the ES mapping generated for a frame of mixed column dtypes.

        Verifies the mapping built by ``FieldMappings._generate_es_mappings``
        and then round-trips the frame through a real index, comparing the
        head of the resulting eland frame with the pandas original.
        """
        df = pd.DataFrame(
            data={
                "A": np.random.rand(3),
                "B": 1,
                "C": "foo",
                "D": pd.Timestamp("20190102"),
                "E": [1.0, 2.0, 3.0],
                "F": False,
                "G": [1, 2, 3],
            },
            index=["0", "1", "2"],
        )

        # Expected ES type for each column of the frame above.
        expected_types = {
            "A": "double",
            "B": "long",
            "C": "keyword",
            "D": "date",
            "E": "double",
            "F": "boolean",
            "G": "long",
        }
        expected_mappings = {
            "mappings": {
                "properties": {
                    column: {"type": es_type}
                    for column, es_type in expected_types.items()
                }
            }
        }

        generated = FieldMappings._generate_es_mappings(df)

        assert expected_mappings == generated

        # Upload the frame, replacing any index left over from earlier runs.
        index_name = "eland_test_generate_es_mappings"
        uploaded = ed.pandas_to_eland(
            df,
            ES_TEST_CLIENT,
            index_name,
            es_if_exists="replace",
            es_refresh=True,
        )

        assert_pandas_eland_frame_equal(df, uploaded.head())

        ES_TEST_CLIENT.indices.delete(index=index_name)
예제 #9
0
    def test_multiitem_query(self):
        """Compare boolean-mask filtering, ``query`` and ``drop`` between pandas and eland.

        A small three-column frame is uploaded to Elasticsearch, and each
        filtering step is applied to both the pandas and the eland frame; the
        results must match after every step.
        """
        # Examples from:
        # https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.query.html
        pd_df = pd.DataFrame(
            {
                "A": range(1, 6),
                "B": range(10, 0, -2),
                "C": range(10, 5, -1)
            },
            index=["0", "1", "2", "3", "4"],
        )
        # >>> pd_df
        #    A   B   C
        # 0  1  10  10
        # 1  2   8   9
        # 2  3   6   8
        # 3  4   4   7
        # 4  5   2   6

        # Now create index
        index_name = "eland_test_query"

        ed_df = ed.pandas_to_eland(pd_df,
                                   ES_TEST_CLIENT,
                                   index_name,
                                   es_if_exists="replace",
                                   es_refresh=True)

        assert_pandas_eland_frame_equal(pd_df, ed_df)

        pd_df.info()
        ed_df.info()

        # Boolean-mask filters: scalar comparison and column-vs-column.
        pd_q1 = pd_df[pd_df.A > 2]
        pd_q2 = pd_df[pd_df.A > pd_df.B]
        pd_q3 = pd_df[pd_df.B == pd_df.C]

        ed_q1 = ed_df[ed_df.A > 2]
        ed_q2 = ed_df[ed_df.A > ed_df.B]
        ed_q3 = ed_df[ed_df.B == ed_df.C]

        assert_pandas_eland_frame_equal(pd_q1, ed_q1)
        assert_pandas_eland_frame_equal(pd_q2, ed_q2)
        assert_pandas_eland_frame_equal(pd_q3, ed_q3)

        # Chain a string query on top of an already filtered frame.
        ed_q4 = ed_q1.query("B > 2")
        pd_q4 = pd_q1.query("B > 2")

        assert_pandas_eland_frame_equal(pd_q4, ed_q4)

        # Drop rows by index
        ed_q4 = ed_q4.drop(["2"])
        pd_q4 = pd_q4.drop(["2"])

        assert_pandas_eland_frame_equal(pd_q4, ed_q4)

        # Pass the index by keyword: matches the other tests and is required
        # by Elasticsearch Python clients that reject positional arguments.
        ES_TEST_CLIENT.indices.delete(index=index_name)
def build_index(filepath):
    """Load a CSV file, attach a location to each addressed row, and index it.

    The CSV is read into a pandas dataframe (missing values become empty
    strings) and passed through ``build_datetimes``. Rows whose
    ``address_number_primary`` is non-zero get a ``location`` column computed
    per row by ``get_closest_lat_long`` (per the original note, a median
    lat/long looked up in the sd_geo index -- the helper is defined elsewhere,
    so confirm there). The result replaces the Elasticsearch index named
    after the file's stem.

    :param filepath: Path of the CSV file to load; its stem is used as the
        destination index name.
    """
    df = pd.read_csv(filepath).fillna("")
    df = build_datetimes(df)

    # Keep only points with a valid address. The explicit copy makes the
    # column assignment below operate on an independent frame instead of a
    # slice of ``df`` (avoids pandas' SettingWithCopyWarning).
    address_points = df.loc[df.address_number_primary != 0].copy()
    address_points["location"] = address_points.apply(get_closest_lat_long, axis=1)
    eland.pandas_to_eland(
        pd_df=address_points,
        es_client=es,
        es_dropna=True,
        es_dest_index=Path(filepath).stem,
        es_if_exists="replace",  # overwrite existing records so an existing index can be updated
        es_type_overrides={"date_time": "date"},
    )
예제 #11
0
    def test_es_if_exists_append(self):
        """Appending a second frame extends the existing index.

        ``pd_df`` (3 rows, 4 columns) is uploaded first with column ``a``
        mapped as ``short``; a second frame with three more rows is then
        appended and the resulting eland frame must contain all six rows.
        """
        df1 = pandas_to_eland(
            pd_df,
            es_client=ES_TEST_CLIENT,
            es_dest_index="test-index",
            es_if_exists="append",
            es_refresh=True,
            # We use 'short' here specifically so that the
            # assumed type of 'long' is coerced into a 'short'
            # by append mode.
            es_type_overrides={"a": "short"},
        )
        assert_pandas_eland_frame_equal(pd_df, df1)
        assert df1.shape == (3, 4)

        pd_df2 = pd.DataFrame(
            {
                "a": [4, 5, 6],
                "b": [-1.0, -2.0, -3.0],
                "c": ["A", "B", "C"],
                "d": [dt, dt - timedelta(1), dt - timedelta(2)],
            },
            index=["3", "4", "5"],
        )
        df2 = pandas_to_eland(
            pd_df2,
            es_client=ES_TEST_CLIENT,
            es_dest_index="test-index",
            es_if_exists="append",
            es_refresh=True,
        )

        # Assert that the second pandas dataframe is actually appended
        assert df2.shape == (6, 4)
        # ``DataFrame.append`` was removed in pandas 2.0; ``pd.concat`` stacks
        # the two frames row-wise with the same result.
        pd_df3 = pd.concat([pd_df, pd_df2])
        assert_pandas_eland_frame_equal(pd_df3, df2)
예제 #12
0
    def test_pandas_to_eland_text_inserts_keyword(self):
        """Check how ``"text"`` overrides are mapped and how that affects groupby.

        Column ``c`` is overridden with the plain string ``"text"`` and is
        expected to gain a ``keyword`` sub-field; column ``d`` is overridden
        with an explicit ``{"type": "text"}`` mapping and is expected to stay
        without one, which makes it unusable with ``groupby``.
        """
        client = ES_TEST_CLIENT
        overrides = {
            "c": "text",
            "b": {"type": "float"},
            "d": {"type": "text"},
        }
        uploaded = pandas_to_eland(
            pd_df,
            es_client=client,
            es_dest_index="test-index",
            es_if_exists="append",
            es_refresh=True,
            es_type_overrides=overrides,
        )

        expected_properties = {
            "a": {"type": "long"},
            "b": {"type": "float"},
            "c": {
                "fields": {"keyword": {"type": "keyword"}},
                "type": "text",
            },
            "d": {"type": "text"},
        }
        actual_mapping = client.indices.get_mapping(index="test-index")
        assert actual_mapping == {
            "test-index": {"mappings": {"properties": expected_properties}}
        }

        # 'c' can be grouped on thanks to its 'keyword' sub-field.
        means_by_c = uploaded.groupby("c").mean().to_dict()
        assert means_by_c == {
            "a": {"A": 1.0, "B": 2.0, "C": 3.0},
            "b": {"A": 1.0, "B": 2.0, "C": 3.0},
        }

        # 'd' has no 'keyword' sub-field, so grouping on it must be rejected.
        with pytest.raises(ValueError) as exc_info:
            uploaded.groupby("d").mean()
        expected_message = (
            "Cannot use 'd' with groupby() because it has "
            "no aggregatable fields in Elasticsearch"
        )
        assert str(exc_info.value) == expected_message
예제 #13
0
    def test_pandas_to_eland_ignore_index(self):
        """Upload with ``use_pandas_index_for_es_ids=False`` and verify the result.

        Checks three things: the index mapping (including the ``text`` and
        ``geo_point`` overrides), that the stored values match the original
        frame, and that none of the document ids equal the pandas index
        labels.
        """
        df = pd.DataFrame(
            data={
                "A": np.random.rand(3),
                "B": 1,
                "C": "foo",
                "D": pd.Timestamp("20190102"),
                "E": [1.0, 2.0, 3.0],
                "F": False,
                "G": [1, 2, 3],
                "H": "Long text",  # text
                "I": "52.36,4.83",  # geo point
            },
            index=["0", "1", "2"],
        )

        # Now create index
        index_name = "test_pandas_to_eland_ignore_index"

        ed_df = ed.pandas_to_eland(
            df,
            ES_TEST_CLIENT,
            index_name,
            es_if_exists="replace",
            es_refresh=True,
            use_pandas_index_for_es_ids=False,
            es_type_overrides={
                "H": "text",
                "I": "geo_point"
            },
        )

        # Check types
        expected_types = {
            "A": "double",
            "B": "long",
            "C": "keyword",
            "D": "date",
            "E": "double",
            "F": "boolean",
            "G": "long",
            "H": "text",
            "I": "geo_point",
        }
        expected_mapping = {
            "test_pandas_to_eland_ignore_index": {
                "mappings": {
                    "properties": {
                        column: {"type": es_type}
                        for column, es_type in expected_types.items()
                    }
                }
            }
        }

        # Keyword form, consistent with the other mapping lookups and required
        # by Elasticsearch Python clients that reject positional arguments.
        mapping = ES_TEST_CLIENT.indices.get_mapping(index=index_name)

        assert expected_mapping == mapping

        # Convert back to pandas and compare with original
        pd_df = ed.eland_to_pandas(ed_df)

        # Compare values excluding index. ``a.values.all() == b.values.all()``
        # would only compare two reduced scalars, so the data is compared
        # element-wise instead. Both frames are ordered by the unique column
        # "G" first, so the comparison does not depend on the order in which
        # rows come back.
        expected_rows = df.sort_values("G").reset_index(drop=True)
        actual_rows = pd_df.sort_values("G").reset_index(drop=True)[list(df.columns)]
        assert expected_rows.shape == actual_rows.shape
        assert (expected_rows.values == actual_rows.values).all()

        # Ensure that index is populated by ES.
        assert not (df.index == pd_df.index).any()

        ES_TEST_CLIENT.indices.delete(index=index_name)