def test_get_name_from_dataframes(self):
    """Names from several frames come back cleaned and tagged with their file.

    Two frames (one per file name) are passed to
    ``DataFramePreprocessor.get_company_names_from_dataframes`` with "A" as
    the key column. The expected result is one ``CompanyNameWithFileName``
    per row of column "A", first frame first, with the stray whitespace and
    upper-casing of the raw values removed.
    """
    file_names = ["example1.xlsx", "example2.xlsx"]
    first_file, second_file = file_names

    first_frame = pd.DataFrame(
        data={
            "A": ["  kek, \n LLC\n\t\r  ", "lol singapore, llc"],
            "\n\t\r\nB\n": ["b", "b"],
        })
    second_frame = pd.DataFrame(
        data={
            "A": ["  lel, \n LLC\n\t\r  ", "kok london, llc"],
            "\n\t gasf\n": ["b", "b"],
        })

    preprocessor = DataFramePreprocessor("A")
    extracted = preprocessor.get_company_names_from_dataframes(
        [first_frame, second_frame], file_names)

    expected = [
        CompanyNameWithFileName(first_file, "kek, llc"),
        CompanyNameWithFileName(first_file, "lol singapore, llc"),
        CompanyNameWithFileName(second_file, "lel, llc"),
        CompanyNameWithFileName(second_file, "kok london, llc"),
    ]
    assert expected == extracted
def test_mapper_to_df():
    """A mapper is exported as a DataFrame with one row per company name.

    The expected rows are not in the dict's insertion order; they are listed
    ordered by file name and then company name, so the export presumably
    sorts its rows (confirm against ``create_dataframe_from_mapper``).
    """
    mapper = CompanyMapper({
        CompanyNameWithFileName("a", "b"): 1,
        CompanyNameWithFileName("c", "z"): 0,
        CompanyNameWithFileName("c", "b"): 1,
        CompanyNameWithFileName("c", "ze"): 2,
    })
    result_frame = CompanyMapper.create_dataframe_from_mapper(mapper)

    expected_frame = DataFrame(
        data=[["a", "b", 1], ["c", "b", 1], ["c", "z", 0], ["c", "ze", 2]],
        columns=CompanyMapper.COLUMN_NAMES,
    )
    assert result_frame.equals(expected_frame)
def test_df_to_mapper():
    """A DataFrame with untidy headers and names is parsed into a mapper.

    The input carries an unrelated extra column, column labels with mixed
    case and trailing whitespace, and company names padded with whitespace;
    the resulting ``name_to_group`` must match the cleaned-up mapping below.
    """
    header = ["dasd", "file_name", "Company_name\t", "group_Id"]
    body = [
        ["dasd", "a", "B  ", 1],
        ["dasd", "c", "B \n", 1],
        ["dasd", "c", "z  ", 0],
        ["dasd", "c", "  ze", 2],
    ]
    source_frame = DataFrame(data=body, columns=header)

    mapper = CompanyMapper.create_mapper_from_dataframe(source_frame)

    expected_mapping = {
        CompanyNameWithFileName("a", "b"): 1,
        CompanyNameWithFileName("c", "b"): 1,
        CompanyNameWithFileName("c", "z"): 0,
        CompanyNameWithFileName("c", "ze"): 2,
    }
    assert mapper.name_to_group == expected_mapping
def test_get_dict_to_others():
    """Check the per-file "other company names" mapping built from names_1.

    In this fixture file "a" is expected to map to the names of files "b"
    and "c", file "b" to the names of file "c", and file "c" to nothing.
    """
    expected = {
        "a": {
            CompanyNameWithFileName("b", "comp_b_0, llc"),
            CompanyNameWithFileName("b", "comp_b_1, llc"),
            CompanyNameWithFileName("c", "comp_c_0, llc"),
        },
        "b": {CompanyNameWithFileName("c", "comp_c_0, llc")},
        "c": set(),
    }
    clusterizer = JacardDistanceClusterization(names_1)
    others_by_file = clusterizer.get_dict_file_name_to_other_company_names()

    # NOTE(review): only keys present in the actual result are checked, so a
    # key missing from the result would go unnoticed.
    for file_name, other_names in others_by_file.items():
        assert set(other_names) == expected[file_name]
 def test_get_name_from_dataframe(self):
     """Names from a single frame come back cleaned and tagged with the file.

     Column "A" is the key column; the raw values carry extra whitespace and
     upper-case letters that must be absent from the extracted names.
     """
     file_name = "example.xlsx"
     source_frame = pd.DataFrame(
         data={
             "A": ["  kek, \n LLC\n\t\r  ", "lol singapore, llc"],
             "\n\t\r\nB\n": ["b", "b"],
         })

     preprocessor = DataFramePreprocessor("A")
     extracted = preprocessor.get_company_names_from_dataframe(
         source_frame, file_name)

     expected = [
         CompanyNameWithFileName(file_name, cleaned)
         for cleaned in ("kek, llc", "lol singapore, llc")
     ]
     assert expected == extracted
 def get_company_names_from_dataframe(self, dataframe: pd.DataFrame,
                                      file_name: str):
     """Extract the key column of ``dataframe`` as company names of a file.

     All work happens on a deep copy, so the caller's frame is not modified
     by the preparation helpers. Those helpers are defined elsewhere in the
     class; judging by their names they normalize the column labels and
     validate that the key column exists and has no empty names (confirm
     against their definitions).

     Args:
         dataframe: Source table containing the key column.
         file_name: File name attached to every returned company name.

     Returns:
         A list with one ``CompanyNameWithFileName`` per value of the key
         column, each value passed through ``Utils.normalize_string``.
     """
     working_frame = dataframe.copy(deep=True)
     self._standartize_columns_names(working_frame)
     self._check_key_existence(working_frame)
     self._check_names_nonempty(working_frame)

     raw_names = working_frame[self.__key_name]
     return [
         CompanyNameWithFileName(file_name, Utils.normalize_string(raw_name))
         for raw_name in raw_names
     ]
Пример #7
0
 def create_mapper_from_dataframe(dataframe: DataFrame):
     """Build a ``CompanyMapper`` from a table of file/company/group rows.

     Column labels are passed through ``Utils.normalize_string`` before the
     required columns are looked up, and any additional columns are ignored.
     The caller's ``dataframe`` is left unchanged: the normalized labels are
     applied to a new frame rather than assigned onto the argument.

     Args:
         dataframe: Table whose normalized column labels include every entry
             of ``CompanyMapper.COLUMN_NAMES``.

     Returns:
         A ``CompanyMapper`` whose mapping has one entry per row: the key is
         a ``CompanyNameWithFileName`` built from the first two required
         columns (the company name is normalized), the value is the third.

     Raises:
         AssertionError: If a required column is missing after the labels
             have been normalized.
     """
     # rename() returns a new frame, so the labels of the caller's frame are
     # not rewritten as a side effect of this call.
     normalized = dataframe.rename(columns=Utils.normalize_string)
     if not set(CompanyMapper.COLUMN_NAMES).issubset(set(
             normalized.columns)):
         raise AssertionError("necessary columns do not exist")

     file_column, name_column, group_column = CompanyMapper.COLUMN_NAMES[:3]
     relevant = normalized[CompanyMapper.COLUMN_NAMES]
     name_to_group: Dict[CompanyNameWithFileName, int] = {}
     for _, row in relevant.iterrows():
         name = CompanyNameWithFileName(
             row[file_column],
             Utils.normalize_string(row[name_column]))
         name_to_group[name] = row[group_column]
     return CompanyMapper(name_to_group)
def test_get_group_to_names():
    """Inverting name -> group yields group -> names.

    The order of names inside a group is not part of the contract checked
    here, so each group's names are compared as a set.
    """
    name_to_group = {
        CompanyNameWithFileName("a", "b"): 1,
        CompanyNameWithFileName("c", "b"): 1,
        CompanyNameWithFileName("c", "z"): 0,
        CompanyNameWithFileName("c", "ze"): 2,
    }
    grouped = CompanyMapper.get_group_to_names(name_to_group)
    actual = {group: set(names) for group, names in grouped.items()}

    expected = {
        0: {CompanyNameWithFileName("c", "z")},
        1: {
            CompanyNameWithFileName("a", "b"),
            CompanyNameWithFileName("c", "b"),
        },
        2: {CompanyNameWithFileName("c", "ze")},
    }
    assert actual == expected
def test_get_indexes_of_common_companies():
    """Check the index table returned for companies shared between files.

    Three files ("a", "c", "d") are supplied as Series of company names,
    together with a mapper that assigns each (file, name) pair to a group.
    The expected value is the nested list below; its exact row and column
    semantics are defined by ``get_indexes_of_common_companies``.
    """
    mapper = CompanyMapper({
        CompanyNameWithFileName("a", "b"): 1,
        CompanyNameWithFileName("c", "b"): 1,
        CompanyNameWithFileName("c", "z"): 0,
        CompanyNameWithFileName("c", "ze"): 2,
        CompanyNameWithFileName("d", "ze zE"): 3,
        CompanyNameWithFileName("d", "b"): 1,
        CompanyNameWithFileName("d", "z"): 0,
    })
    series_by_file = {
        "d": Series(("ze ze", "z", "b")),
        "a": Series(("b", )),
        "c": Series(("ze", "b", "z")),
    }

    common_indexes = mapper.get_indexes_of_common_companies(series_by_file)

    assert common_indexes == [[None, 2, 1], [0, 1, 2]]
from companies_union.clusterization.jacard_distance_clusterization import JacardDistanceClusterization
from companies_union.company_name import CompanyNameWithFileName

import pytest

# Fixture 1: five pairwise different company names spread over the files
# "a", "b" and "c".
names_1 = [
    CompanyNameWithFileName("a", "comp_a_0, llc"),
    CompanyNameWithFileName("b", "comp_b_0, llc"),
    CompanyNameWithFileName("a", "comp_a_1, llc"),
    CompanyNameWithFileName("b", "comp_b_1, llc"),
    CompanyNameWithFileName("c", "comp_c_0, llc"),
]

# Presumably the expected group id of each entry of names_1, by position
# (the consumer is not visible in this part of the file - confirm).
expected_groups_1 = [0, 2, 1, 3, 4]

# Fixture 2: names from files "a", "b" and "c" that share many words.
names_2 = [
    CompanyNameWithFileName("a", "a b c d"),
    CompanyNameWithFileName("b", "a"),
    CompanyNameWithFileName("a", "a b c e f, llc"),
    CompanyNameWithFileName("b", "a b c e, llc"),
    CompanyNameWithFileName("c", "a b c d, llc"),
]

# Presumably the expected group id of each entry of names_2, by position;
# equal ids would then mean the names belong to one group (confirm).
expected_groups_2 = [0, 2, 1, 1, 0]

names_3 = [
    CompanyNameWithFileName("a", "a b c d"),
    CompanyNameWithFileName("b", "a b d"),
    CompanyNameWithFileName("b", "a b c"),
    CompanyNameWithFileName("a", "a b c e f, llc"),
    CompanyNameWithFileName("b", "a b c e, llc"),
 def test_equals(self):
     """Names differing only in letter case and whitespace compare equal."""
     tidy = CompanyNameWithFileName("das", "a b, llc")
     untidy = CompanyNameWithFileName("das", "a B, \n lLc \n")
     assert tidy == untidy
 def test_distances(self):
     """Two different names of the same file are at distance >= 1."""
     longer_name = CompanyNameWithFileName("a", "a b c e f, llc")
     shorter_name = CompanyNameWithFileName("a", "a b c d, llc")
     measured = longer_name.distance(shorter_name)
     assert measured >= 1
 def test_jacard_distances(self):
     """The Jaccard distance of "a e" and "a b c d e" is 0.6.

     The two names share 2 of their 5 distinct words, so the standard
     Jaccard distance is 1 - 2/5 = 0.6.
     """
     import math  # local import keeps this test self-contained

     first = ["das", "a e"]
     second = ["das", "a b c d e"]
     measured = CompanyNameWithFileName(*first).jacard_distance(
         CompanyNameWithFileName(*second))
     # Compare with a tolerance: the result is a float, and exact equality
     # would depend on how the implementation happens to round.
     assert math.isclose(measured, 0.6)
 def test_tokens(self):
     """The tokens of "a b, llc" are exactly the words "a" and "b"."""
     company = CompanyNameWithFileName("das", "a b, llc")
     assert company.tokens == {"a", "b"}
    }
    expected = {
        0: {CompanyNameWithFileName("c", "z")},
        1:
        {CompanyNameWithFileName("a", "b"),
         CompanyNameWithFileName("c", "b")},
        2: {CompanyNameWithFileName("c", "ze")}
    }
    actual = CompanyMapper.get_group_to_names(name_to_group)
    for group in actual.keys():
        actual[group] = set(actual[group])
    assert actual == expected


# Shared fixture: group id assigned to each (file name, company name) pair.
# Passed to CompanyMapper by test_get_indexes_of_unique_companies.
name_to_group = {
    CompanyNameWithFileName("a", "b"): 1,
    CompanyNameWithFileName("c", "b"): 1,
    CompanyNameWithFileName("c", "z"): 0,
    CompanyNameWithFileName("c", "ze"): 2,
    CompanyNameWithFileName("d", "ze zE"): 3,
    CompanyNameWithFileName("d", "b"): 1
}


@pytest.mark.parametrize("file_name,series,expected",
                         [("a", Series(("b", )), []),
                          ("c", Series(("ze", "b", "z")), [0, 2]),
                          ("d", Series(("b", "ze ze")), [1])])
def test_get_indexes_of_unique_companies(file_name, series, expected):
    mapper = CompanyMapper(name_to_group)
    actual = mapper.get_indexes_of_unique_companies(series, file_name)