Example #1
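A basic round trip of a netflow dataset through PanDatFactory: the test checks good_pan_dat_object, copy_pan_dat and _same_data, including the messages reported when a table attribute or a column is missing. Note that the trailing DataFrame.append calls rely on the legacy pandas API that was removed in pandas 2.0.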
    def testSimple(self):
        if not self.canRun:
            return
        pdf = PanDatFactory(**netflowSchema())
        _dat = netflowPandasData()
        dat = pdf.PanDat(**{t:getattr(_dat, t) for t in pdf.all_tables})
        self.assertTrue(pdf.good_pan_dat_object(dat))

        dat2 = pdf.copy_pan_dat(dat)
        self.assertTrue(pdf._same_data(dat, dat2))
        self.assertTrue(pdf.good_pan_dat_object(dat2))
        delattr(dat2, "nodes")
        msg = []
        self.assertFalse(pdf.good_pan_dat_object(dat2, msg.append))
        self.assertTrue(msg[-1] == "nodes not an attribute.")

        dat3 = pdf.copy_pan_dat(dat)
        dat3.cost.drop("commodity", axis=1, inplace=True)
        self.assertFalse(pdf.good_pan_dat_object(dat3, msg.append))
        self.assertTrue("The following are (table, field) pairs missing from the data" in msg[-1])

        dat4 = pdf.copy_pan_dat(dat)
        dat4.cost["cost"] += 1
        self.assertFalse(pdf._same_data(dat, dat4))

        pdf2 = PanDatFactory(**{t:'*' for t in pdf.all_tables})
        dat5 = pdf2.copy_pan_dat(dat)
        self.assertTrue(pdf._same_data(dat, dat5))
        self.assertTrue(pdf2._same_data(dat, dat5))
        dat.commodities = dat.commodities.append(dat.commodities[dat.commodities["name"] == "Pencils"])
        dat.arcs = dat.arcs.append(dat.arcs[dat.arcs["destination"] == "Boston"])
        self.assertFalse(pdf2._same_data(dat, dat5))
        self.assertFalse(pdf._same_data(dat, dat5))
Example #2
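Builds a diet dataset containing None keys, then verifies how find_data_type_failures reports the offending rows, how replace_data_type_failures behaves with and without custom default values, and how the repaired data shows up in find_duplicates.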
    def testDataTypes_two(self):
        tdf = TicDatFactory(**dietSchema())
        pdf = PanDatFactory(**tdf.schema())

        def makeIt():
            rtn = tdf.TicDat()
            rtn.foods["a"] = 12
            rtn.foods["b"] = None
            rtn.foods[None] = 101
            rtn.categories["1"] = {"maxNutrition": 100, "minNutrition": 40}
            rtn.categories["2"] = [10, 20]
            for f, p in itertools.product(rtn.foods, rtn.categories):
                rtn.nutritionQuantities[f, p] = 5
            rtn.nutritionQuantities['a', 2] = 12
            return tdf.copy_to_pandas(rtn, drop_pk_columns=False)

        dat = makeIt()
        errs = pdf.find_data_type_failures(dat)
        self.assertTrue(len(errs) == 2 and not pdf.find_duplicates(dat))
        dat_copied = pdf.copy_pan_dat(dat)
        pdf.replace_data_type_failures(dat)
        self.assertTrue(pdf._same_data(dat, dat_copied, epsilon=0.00001))
        pdf2 = pdf.clone()
        pdf2.set_default_value("foods", "name", "a")
        pdf2.set_default_value("nutritionQuantities", "food", "a")
        pdf2.replace_data_type_failures(dat_copied)
        self.assertFalse(pdf._same_data(dat, dat_copied, epsilon=0.00001))
        self.assertFalse(pdf.find_data_type_failures(dat_copied))
        dups = pdf.find_duplicates(dat_copied)
        self.assertTrue(
            len(dups) == 2 and len(dups["foods"]) == 1
            and len(dups["nutritionQuantities"]) == 2)

        from pandas import isnull

        def noneify(iter_of_tuples):
            return {
                tuple(None if isnull(_) else _ for _ in tuple_)
                for tuple_ in iter_of_tuples
            }

        self.assertTrue(
            noneify(errs['nutritionQuantities', 'food'].itertuples(
                index=False)) == {(None, "1", 5), (None, "2", 5)})
        self.assertTrue(
            noneify(errs['foods',
                         'name'].itertuples(index=False)) == {(None, 101)})
        pdf = PanDatFactory(**tdf.schema())
        pdf.set_data_type("foods", "name", nullable=True, strings_allowed='*')
        pdf.set_data_type("nutritionQuantities",
                          "food",
                          nullable=True,
                          strings_allowed='*')
        self.assertFalse(pdf.find_data_type_failures(dat))
        pdf.set_data_type("foods", "cost", nullable=False)
        errs = pdf.find_data_type_failures(dat)
        self.assertTrue(len(errs) == 1)
        self.assertTrue(
            noneify(errs['foods',
                         'cost'].itertuples(index=False)) == {('b', None)})
Example #3
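Writes spaces-schema and netflow data to .xlsx with case_space_sheet_names=True and confirms both files read back unchanged.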
    def testXlsSpacey(self):
        if not self.can_run:
            return

        tdf = TicDatFactory(**spacesSchema())
        pdf = PanDatFactory(**spacesSchema())
        ticDat = tdf.TicDat(**spacesData())
        panDat = pan_dat_maker(spacesSchema(), ticDat)
        ext = ".xlsx"
        filePath = os.path.join(_scratchDir, "spaces_2%s" % ext)
        pdf.xls.write_file(panDat, filePath, case_space_sheet_names=True)
        panDat2 = pdf.xls.create_pan_dat(filePath)
        self.assertTrue(pdf._same_data(panDat, panDat2))

        tdf = TicDatFactory(**netflowSchema())
        pdf = PanDatFactory(**netflowSchema())
        ticDat = tdf.freeze_me(
            tdf.TicDat(
                **
                {t: getattr(netflowData(), t)
                 for t in tdf.primary_key_fields}))
        panDat = pan_dat_maker(netflowSchema(), ticDat)
        filePath = os.path.join(_scratchDir, "spaces_2_2%s" % ext)
        pdf.xls.write_file(panDat, filePath, case_space_sheet_names=True)
        panDat2 = pdf.xls.create_pan_dat(filePath)
        self.assertTrue(pdf._same_data(panDat, panDat2))
Example #4
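Reads diet data through the opalytics connector for every combination of the hack, raw_data and activeEnabled flags, and checks the exception text produced when expected (table, field) pairs are missing from the input set.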
    def testDietOpalytics(self):
        if not self.can_run:
            return
        for hack, raw_data, activeEnabled in list(
                itertools.product(*(([True, False], ) * 3))):
            tdf = TicDatFactory(**dietSchema())
            ticDat = tdf.freeze_me(tdf.copy_tic_dat(dietData()))
            inputset = create_inputset_mock(tdf, ticDat, hack, activeEnabled)

            pdf = PanDatFactory(**dietSchema())
            panDat = pdf.opalytics.create_pan_dat(inputset)
            self.assertFalse(pdf.find_duplicates(panDat))
            ticDat2 = pdf.copy_to_tic_dat(panDat)
            self.assertTrue(tdf._same_data(ticDat, ticDat2))

            tdf2 = TicDatFactory(
                **{
                    k: [pks, list(dfs) + ["dmy"]]
                    for k, (pks, dfs) in tdf.schema().items()
                })
            _dat = tdf2.copy_tic_dat(ticDat)
            panDat = pdf.opalytics.create_pan_dat(
                create_inputset_mock(tdf2, _dat, hack))

            self.assertTrue(tdf._same_data(ticDat,
                                           pdf.copy_to_tic_dat(panDat)))

            pdf2 = PanDatFactory(**tdf2.schema())
            ex = self.firesException(lambda: pdf2.opalytics.create_pan_dat(
                inputset, raw_data=raw_data))
            self.assertTrue(
                all(_ in ex for _ in ["(table, field) pairs missing"] +
                    ["'%s', 'dmy'" % _ for _ in pdf2.all_tables]))
Example #5
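Exercises several smaller APIs in one place: set_default_values, clear_foreign_keys, data row predicates surviving clone(), and adding a cycle of foreign keys across three tables.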
    def testVariousCoverages(self):
        pdf = PanDatFactory(**dietSchema())
        _d = dict(categories={
            "minNutrition": 0,
            "maxNutrition": float("inf")
        },
                  foods={"cost": 0},
                  nutritionQuantities={"qty": 0})
        pdf.set_default_values(**_d)
        self.assertTrue(pdf._default_values == _d)
        pdf = PanDatFactory(**netflowSchema())
        addNetflowForeignKeys(pdf)
        pdf.clear_foreign_keys("arcs")
        self.assertTrue({_[0]
                         for _ in pdf._foreign_keys} == {"cost", "inflow"})

        pdf.add_data_row_predicate("arcs", lambda row: True)
        pdf.add_data_row_predicate("arcs", lambda row: True, "dummy")
        pdf.add_data_row_predicate("arcs", None, 0)
        pdf = pdf.clone()
        self.assertTrue(set(pdf._data_row_predicates["arcs"]) == {"dummy"})
        pdf = PanDatFactory(pdf_table_one=[["A Field"], []],
                            pdf_table_two=[["B Field"], []],
                            pdf_table_three=[["C Field"], []])
        pdf.add_foreign_key("pdf_table_one", "pdf_table_two",
                            ["A Field", "B Field"])
        pdf.add_foreign_key("pdf_table_two", "pdf_table_three",
                            ["B Field", "C Field"])
        pdf.add_foreign_key("pdf_table_three", "pdf_table_one",
                            ["C Field", "A Field"])
Example #6
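SQLite round trips for diet and netflow data via sql.write_file and sql.create_pan_dat, including a schema-free factory built with '*' table definitions.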
    def testSqlSimple(self):
        if not self.can_run:
            return
        tdf = TicDatFactory(**dietSchema())
        pdf = PanDatFactory(**dietSchema())
        ticDat = tdf.freeze_me(
            tdf.TicDat(
                **{t: getattr(dietData(), t)
                   for t in tdf.primary_key_fields}))
        panDat = pan_dat_maker(dietSchema(), ticDat)
        filePath = os.path.join(_scratchDir, "diet.db")
        pdf.sql.write_file(panDat, filePath)
        sqlPanDat = pdf.sql.create_pan_dat(filePath)
        self.assertTrue(pdf._same_data(panDat, sqlPanDat))
        pdf2 = PanDatFactory(**{t: '*' for t in pdf.all_tables})
        pdf2.sql.write_file(panDat, filePath)
        sqlPanDat = pdf2.sql.create_pan_dat(filePath)
        self.assertTrue(pdf._same_data(panDat, sqlPanDat))

        tdf = TicDatFactory(**netflowSchema())
        pdf = PanDatFactory(**netflowSchema())
        ticDat = tdf.freeze_me(
            tdf.TicDat(
                **
                {t: getattr(netflowData(), t)
                 for t in tdf.primary_key_fields}))
        panDat = pan_dat_maker(netflowSchema(), ticDat)
        filePath = os.path.join(_scratchDir, "netflow.db")
        pdf.sql.write_file(panDat, filePath)
        panDat2 = pdf.sql.create_pan_dat(filePath)
        self.assertTrue(pdf._same_data(panDat, panDat2))
        pdf2 = PanDatFactory(**{t: '*' for t in pdf.all_tables})
        sqlPanDat = pdf2.sql.create_pan_dat(filePath)
        self.assertTrue(pdf._same_data(panDat, sqlPanDat))
Example #7
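Sets up one-to-one and many-to-one foreign keys, then confirms that find_foreign_key_failures spots orphan child rows and that remove_foreign_key_failures restores the original data.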
    def testXToManyTwo(self):
        input_schema = PanDatFactory(parent=[["F1", "F2"], ["F3"]],
                                     child_one=[["F1", "F2", "F3"], []],
                                     child_two=[["F1", "F2"], ["F3"]],
                                     child_three=[[], ["F1", "F2", "F3"]])
        for t in ["child_one", "child_two", "child_three"]:
            input_schema.add_foreign_key(t, "parent",
                                         [["F1"] * 2, ["F2"] * 2, ["F3"] * 2])
        self.assertTrue({fk.cardinality
                         for fk in input_schema.foreign_keys} ==
                        {"one-to-one", "many-to-one"})

        rows = [[1, 2, 3], [1, 2.1, 3], [4, 5, 6], [4, 5.1, 6], [7, 8, 9]]
        tdf = TicDatFactory(**input_schema.schema())
        dat = tdf.TicDat(parent=rows,
                         child_one=rows,
                         child_two=rows,
                         child_three=rows)
        self.assertTrue(
            all(len(getattr(dat, t)) == 5 for t in input_schema.all_tables))
        orig_pan_dat = input_schema.copy_pan_dat(
            copy_to_pandas_with_reset(tdf, dat))
        self.assertFalse(input_schema.find_foreign_key_failures(orig_pan_dat))
        dat.child_one[1, 2, 4] = {}
        dat.child_two[1, 2.2] = 3
        dat.child_three.append([1, 2, 4])
        new_pan_dat = input_schema.copy_pan_dat(
            copy_to_pandas_with_reset(tdf, dat))
        fk_fails = input_schema.find_foreign_key_failures(new_pan_dat)
        self.assertTrue(len(fk_fails) == 3)
        input_schema.remove_foreign_key_failures(new_pan_dat)
        self.assertFalse(input_schema.find_foreign_key_failures(new_pan_dat))
        self.assertTrue(input_schema._same_data(orig_pan_dat, new_pan_dat))

        input_schema = PanDatFactory(parent=[["F1", "F2"], ["F3"]],
                                     child_one=[["F1", "F2", "F3"], []],
                                     child_two=[["F1", "F2"], ["F3"]],
                                     child_three=[[], ["F1", "F2", "F3"]])
        for t in ["child_one", "child_two", "child_three"]:
            input_schema.add_foreign_key(t, "parent", [["F1"] * 2, ["F3"] * 2])
        tdf = TicDatFactory(**input_schema.schema())
        dat = tdf.TicDat(parent=rows,
                         child_one=rows,
                         child_two=rows,
                         child_three=rows)
        self.assertTrue(
            all(len(getattr(dat, t)) == 5 for t in input_schema.all_tables))
        orig_pan_dat = input_schema.copy_pan_dat(
            copy_to_pandas_with_reset(tdf, dat))
        self.assertFalse(input_schema.find_foreign_key_failures(orig_pan_dat))
        dat.child_one[1, 2, 4] = {}
        dat.child_two[1, 2.2] = 4
        dat.child_three.append([1, 2, 4])
        new_pan_dat = input_schema.copy_pan_dat(
            copy_to_pandas_with_reset(tdf, dat))
        self.assertTrue(
            len(input_schema.find_foreign_key_failures(new_pan_dat)) == 3)
        input_schema.remove_foreign_key_failures(new_pan_dat)
        self.assertFalse(input_schema.find_foreign_key_failures(new_pan_dat))
        self.assertTrue(input_schema._same_data(orig_pan_dat, new_pan_dat))
Example #8
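With set_infinity_io_flag(999999999), infinite values survive SQL, CSV, JSON and xlsx round trips for the flagged factory; an unflagged factory reads back the literal 999999999 instead.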
 def testDietWithInfFlagging(self):
     diet_pdf = PanDatFactory(**dietSchema())
     addDietDataTypes(diet_pdf)
     tdf = TicDatFactory(**dietSchema())
     dat = tdf.copy_to_pandas(tdf.copy_tic_dat(dietData()),
                              drop_pk_columns=False)
     diet_pdf.set_infinity_io_flag(999999999)
     core_path = os.path.join(_scratchDir, "diet_with_inf_flagging")
     diet_pdf.sql.write_file(dat, core_path + ".db")
     diet_pdf.csv.write_directory(dat, core_path + "_csv")
     diet_pdf.json.write_file(dat, core_path + ".json")
     diet_pdf.xls.write_file(dat, core_path + ".xlsx")
     for attr, f in [["sql", core_path + ".db"],
                     ["csv", core_path + "_csv"],
                     ["json", core_path + ".json"],
                     ["xls", core_path + ".xlsx"]]:
         dat_1 = getattr(diet_pdf, attr).create_pan_dat(f)
         self.assertTrue(diet_pdf._same_data(dat, dat_1, epsilon=1e-5))
         pdf = diet_pdf.clone()
         dat_1 = getattr(pdf, attr).create_pan_dat(f)
         self.assertTrue(pdf._same_data(dat, dat_1, epsilon=1e-5))
         pdf = PanDatFactory(**diet_pdf.schema())
         dat_1 = getattr(pdf, attr).create_pan_dat(f)
         self.assertFalse(pdf._same_data(dat, dat_1, epsilon=1e-5))
         protein = dat_1.categories["name"] == "protein"
         self.assertTrue(
             list(dat_1.categories[protein]["maxNutrition"])[0] ==
             999999999)
         dat_1.categories.loc[protein, "maxNutrition"] = float("inf")
         self.assertTrue(pdf._same_data(dat, dat_1, epsilon=1e-5))
Example #9
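Constructs PanDat objects from to_dict(), to_dict(orient="list"/"records") and plain lists of rows, and checks that extra columns added to a table are carried through.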
    def testDictConstructions(self):
        tdf = TicDatFactory(**dietSchema())
        pdf = PanDatFactory(**dietSchema())
        ticDat = tdf.freeze_me(tdf.TicDat(**{t:getattr(dietData(),t) for t in tdf.primary_key_fields}))
        panDat = pan_dat_maker(dietSchema(), ticDat)
        panDat2 = pdf.PanDat(**{t:getattr(panDat, t).to_dict() for t in pdf.all_tables})
        panDat3 = pdf.PanDat(**{t:getattr(panDat, t).to_dict(orient="list") for t in pdf.all_tables})
        panDat3_1 = pdf.PanDat(**{t:list(map(list, getattr(panDat, t).itertuples(index=False)))
                                  for t in pdf.all_tables})

        self.assertTrue(all(pdf._same_data(panDat, _) for _ in [panDat2, panDat3, panDat3_1]))
        panDat.foods["extra"] = 12
        panDat4 = pdf.PanDat(**{t:getattr(panDat, t).to_dict(orient="list") for t in pdf.all_tables})
        self.assertTrue(pdf._same_data(panDat, panDat4))
        self.assertTrue(set(panDat4.foods["extra"]) == {12})

        tdf = TicDatFactory(**netflowSchema())
        pdf = PanDatFactory(**netflowSchema())
        ticDat = tdf.freeze_me(tdf.TicDat(**{t:getattr(netflowData(),t) for t in tdf.primary_key_fields}))
        panDat = pan_dat_maker(netflowSchema(), ticDat)
        panDat2 = pdf.PanDat(**{t:getattr(panDat, t).to_dict() for t in pdf.all_tables})
        panDat3 = pdf.PanDat(**{t:getattr(panDat, t).to_dict(orient="records") for t in pdf.all_tables})
        self.assertTrue(all(pdf._same_data(panDat, _) for _ in [panDat2, panDat3]))
        panDat.cost["extra"] = "boger"
        panDat4 = pdf.PanDat(**{t:getattr(panDat, t).to_dict(orient="list") for t in pdf.all_tables})
        self.assertTrue(pdf._same_data(panDat, panDat4))
        self.assertTrue(set(panDat4.cost["extra"]) == {"boger"})
Example #10
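Covers set_data_type together with find_data_type_failures (both as tables and as boolean Series) and replace_data_type_failures, on diet data and on netflow data with string-valued capacities.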
    def testDataTypes(self):
        if not self.canRun:
            return
        tdf = TicDatFactory(**dietSchema())
        pdf = PanDatFactory(**dietSchema())

        ticdat = tdf.TicDat()
        ticdat.foods["a"] = 12
        ticdat.foods["b"] = None
        ticdat.categories["1"] = {"maxNutrition":100, "minNutrition":40}
        ticdat.categories["2"] = [10,20]
        for f, p in itertools.product(ticdat.foods, ticdat.categories):
            ticdat.nutritionQuantities[f,p] = 5
        ticdat.nutritionQuantities['a', 2] = 12

        pandat = pdf.copy_pan_dat(copy_to_pandas_with_reset(tdf, ticdat))

        self.assertFalse(pdf.find_data_type_failures(pandat))
        pandat_copy = pdf.replace_data_type_failures(pdf.copy_pan_dat(pandat))
        self.assertTrue(pdf._same_data(pandat, pandat_copy, epsilon=0.00001))

        pdf = PanDatFactory(**dietSchema())
        pdf.set_data_type("foods", "cost", nullable=False)
        pdf.set_data_type("nutritionQuantities", "qty", min=5, inclusive_min=False, max=12, inclusive_max=True)
        failed = pdf.find_data_type_failures(pandat)
        self.assertTrue(set(failed) == {('foods', 'cost'), ('nutritionQuantities', 'qty')})
        self.assertTrue(set(failed['foods', 'cost']["name"]) == {'b'})
        self.assertTrue(set({(v["food"], v["category"])
                             for v in failed['nutritionQuantities', 'qty'].T.to_dict().values()}) ==
                            {('b', '1'), ('a', '2'), ('a', '1'), ('b', '2')})

        failed = pdf.find_data_type_failures(pandat, as_table=False)
        self.assertTrue(4 == failed['nutritionQuantities', 'qty'].value_counts()[True])
        fixed = pdf.replace_data_type_failures(pdf.copy_pan_dat(pandat), {("nutritionQuantities", "qty"): 5.15})
        self.assertTrue(set(fixed.foods["cost"]) == {0.0, 12.0})
        self.assertTrue(set(fixed.nutritionQuantities["qty"]) == {5.15, 12.0})

        tdf = TicDatFactory(**netflowSchema())
        tdf.enable_foreign_key_links()
        addNetflowForeignKeys(tdf)
        pdf = PanDatFactory(**netflowSchema())
        ticdat = tdf.copy_tic_dat(netflowData())
        for n in ticdat.nodes["Detroit"].arcs_source:
            ticdat.arcs["Detroit", n] = n
        pandat = pdf.copy_pan_dat(copy_to_pandas_with_reset(tdf, ticdat))
        self.assertFalse(pdf.find_data_type_failures(pandat))

        pdf = PanDatFactory(**netflowSchema())
        pdf.set_data_type("arcs", "capacity", strings_allowed="*")
        self.assertFalse(pdf.find_data_type_failures(pandat))

        pdf = PanDatFactory(**netflowSchema())
        pdf.set_data_type("arcs", "capacity", strings_allowed=["Boston", "Seattle", "lumberjack"])
        failed = pdf.find_data_type_failures(pandat)
        self.assertTrue(set(failed) == {('arcs', 'capacity')})
        self.assertTrue(set({(v["source"], v["destination"])
                             for v in failed['arcs', 'capacity'].T.to_dict().values()}) == {("Detroit", "New York")})
        pdf.replace_data_type_failures(pandat)
        self.assertTrue(set(pandat.arcs["capacity"]) == {120, 'Boston', 0, 'Seattle'})
Example #11
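Compares numeric and string primary keys written to xlsx and CSV; strings such as "022" are coerced to numbers on the way back in unless the field is declared with number_allowed=False, strings_allowed='*'.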
    def testIssue45(self):
        pdf = PanDatFactory(data=[["a"], ["b"]])
        tdf = TicDatFactory(**pdf.schema())
        dat_nums = tdf.copy_to_pandas(
            tdf.TicDat(data=[[1, 2], [3, 4], [22, 44]]), drop_pk_columns=False)
        dat_strs = tdf.copy_to_pandas(
            tdf.TicDat(data=[["1", "2"], ["3", "4"], ["022", "0044"]]),
            drop_pk_columns=False)
        files = [
            os.path.join(_scratchDir, _)
            for _ in ["dat_nums.xlsx", "dat_strs.xlsx"]
        ]
        pdf.xls.write_file(dat_nums, files[0])
        pdf.xls.write_file(dat_strs, files[1])
        dat_nums_2, dat_strs_2 = [pdf.xls.create_pan_dat(_) for _ in files]
        self.assertTrue(pdf._same_data(dat_nums, dat_nums_2))
        # this is pandas pushing things to be numeric
        self.assertFalse(pdf._same_data(dat_strs, dat_strs_2))
        self.assertTrue(pdf._same_data(dat_nums, dat_strs_2))

        pdf = PanDatFactory(data=[["a"], ["b"]])
        pdf.set_data_type("data",
                          "a",
                          number_allowed=False,
                          strings_allowed='*')
        dat_mixed = tdf.copy_to_pandas(
            tdf.TicDat(data=[["1", 2], ["3", 4], ["022", 44]]),
            drop_pk_columns=False)
        dat_nums_2, dat_strs_2 = [pdf.xls.create_pan_dat(_) for _ in files]
        self.assertFalse(pdf._same_data(dat_nums, dat_nums_2))
        self.assertFalse(pdf._same_data(dat_strs, dat_strs_2))
        self.assertFalse(pdf._same_data(dat_nums_2, dat_mixed))
        self.assertTrue(pdf._same_data(dat_strs_2, dat_mixed))

        pdf = PanDatFactory(data=[["a"], ["b"]])
        csv_dirs = [
            os.path.join(_scratchDir, _)
            for _ in ["dat_nums_csv", "dat_strs_csv"]
        ]
        pdf.csv.write_directory(dat_nums, csv_dirs[0])
        pdf.csv.write_directory(dat_strs, csv_dirs[1])
        dat_nums_2, dat_strs_2 = [pdf.csv.create_pan_dat(_) for _ in csv_dirs]
        self.assertTrue(pdf._same_data(dat_nums, dat_nums_2))
        # this is pandas pushing things to be numeric
        self.assertFalse(pdf._same_data(dat_strs, dat_strs_2))
        self.assertTrue(pdf._same_data(dat_nums, dat_strs_2))
        pdf = PanDatFactory(data=[["a"], ["b"]])
        pdf.set_data_type("data",
                          "a",
                          number_allowed=False,
                          strings_allowed='*')
        dat_nums_2, dat_strs_2 = [pdf.csv.create_pan_dat(_) for _ in csv_dirs]
        self.assertFalse(pdf._same_data(dat_nums, dat_nums_2))
        self.assertFalse(pdf._same_data(dat_strs, dat_strs_2))
        self.assertFalse(pdf._same_data(dat_nums_2, dat_strs_2))
        self.assertTrue(pdf._same_data(dat_strs_2, dat_mixed))
Example #12
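Writes and reads pan dats through an already open SQLite connection (the con argument) with case-spaced table names, for both the spaces schema and netflow data.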
    def testSqlSpaceyTwo(self):
        if not self.can_run:
            return
        self.assertTrue(pandatio.sql,
                        "this unit test requires SQLite installed")

        tdf = TicDatFactory(**spacesSchema())
        pdf = PanDatFactory(**spacesSchema())
        ticDat = tdf.TicDat(
            **{
                "a_table": {
                    1: [1, 2, "3"],
                    22.2: (12, 0.12, "something"),
                    0.23: (11, 12, "thirt")
                },
                "b_table": {
                    (1, 2, "foo"): 1,
                    (1012.22, 4, "0012"): 12
                },
                "c_table": (("this", 2, 3, 4), ("that", 102.212, 3, 5.5),
                            ("another", 5, 12.5, 24))
            })
        panDat = pan_dat_maker(spacesSchema(), ticDat)
        ext = ".db"
        filePath = os.path.join(_scratchDir, "spaces_2%s" % ext)
        with pandatio.sql.connect(filePath) as con:
            pdf.sql.write_file(panDat,
                               db_file_path=None,
                               con=con,
                               case_space_table_names=True)
        with pandatio.sql.connect(filePath) as con:
            panDat2 = pdf.sql.create_pan_dat(db_file_path=None, con=con)
        self.assertTrue(pdf._same_data(panDat, panDat2))

        tdf = TicDatFactory(**netflowSchema())
        pdf = PanDatFactory(**netflowSchema())
        ticDat = tdf.freeze_me(
            tdf.TicDat(
                **
                {t: getattr(netflowData(), t)
                 for t in tdf.primary_key_fields}))
        panDat = pan_dat_maker(netflowSchema(), ticDat)
        filePath = os.path.join(_scratchDir, "spaces_2_2%s" % ext)
        with pandatio.sql.connect(filePath) as con:
            pdf.sql.write_file(panDat,
                               db_file_path="",
                               con=con,
                               case_space_table_names=True)
        with pandatio.sql.connect(filePath) as con:
            panDat2 = pdf.sql.create_pan_dat(None, con)
        self.assertTrue(pdf._same_data(panDat, panDat2))
Example #13
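With set_infinity_io_flag(None), null cells are read back as positive or negative infinity depending on whether the field's data type allows an inclusive infinite max or min.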
    def testInfFlagging(self):
        pdf = PanDatFactory(table=[["field one"], ["field two"]])
        for f in ["field one", "field two"]:
            pdf.set_data_type("table", f, nullable=True)

        def make_dat(l):
            tdf = TicDatFactory(**pdf.schema())
            return tdf.copy_to_pandas(tdf.TicDat(table=l),
                                      drop_pk_columns=False)

        dat = make_dat([[None, 100], [200, 109], [0, 300], [300, None],
                        [400, 0]])
        core_path = os.path.join(_scratchDir, "non_inf_flagging")
        for attr, path in [["sql", core_path + ".db"],
                           ["csv", core_path + "_csv"],
                           ["json", core_path + ".json"],
                           ["xls", core_path + ".xlsx"]]:
            func = "write_directory" if attr == "csv" else "write_file"
            getattr(getattr(pdf, attr), func)(dat, path)
            dat_1 = getattr(pdf, attr).create_pan_dat(path)
            _ = PanDatFactory(table=[[], ["field one", "field two"]])
            self.assertTrue(
                _._same_data(dat, dat_1, nans_are_same_for_data_rows=True))

            pdf_ = PanDatFactory(table=[["field one"], ["field two"]])
            for f in ["field one", "field two"]:
                pdf_.set_data_type("table",
                                   f,
                                   max=float("inf"),
                                   inclusive_max=True)
            pdf_.set_infinity_io_flag(None)
            dat_inf = make_dat([[float("inf"), 100], [200, 109], [0, 300],
                                [300, float("inf")], [400, 0]])
            dat_1 = getattr(pdf_, attr).create_pan_dat(path)
            self.assertTrue(pdf._same_data(dat_inf, dat_1))
            getattr(getattr(pdf_, attr), func)(dat, path)
            dat_1 = getattr(pdf_, attr).create_pan_dat(path)  #
            self.assertTrue(pdf._same_data(dat_inf, dat_1))
            pdf_ = PanDatFactory(table=[["field one"], ["field two"]])
            for f in ["field one", "field two"]:
                pdf_.set_data_type("table",
                                   f,
                                   min=-float("inf"),
                                   inclusive_min=True)
            pdf_.set_infinity_io_flag(None)
            dat_1 = getattr(pdf_, attr).create_pan_dat(path)  #
            self.assertFalse(pdf._same_data(dat_inf, dat_1))
            dat_inf = make_dat([[float("-inf"), 100], [200, 109], [0, 300],
                                [300, -float("inf")], [400, 0]])
            self.assertTrue(pdf._same_data(dat_inf, dat_1))
Example #14
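xlsx round trips for diet and netflow data, including a factory restricted to a subset of the tables and a schema-free '*' factory.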
    def testXlsSimple(self):
        if not self.can_run:
            return
        tdf = TicDatFactory(**dietSchema())
        pdf = PanDatFactory(**dietSchema())
        ticDat = tdf.freeze_me(
            tdf.TicDat(
                **{t: getattr(dietData(), t)
                   for t in tdf.primary_key_fields}))
        panDat = pan_dat_maker(dietSchema(), ticDat)
        filePath = os.path.join(_scratchDir, "diet.xlsx")
        pdf.xls.write_file(panDat, filePath)
        xlsPanDat = pdf.xls.create_pan_dat(filePath)
        self.assertTrue(pdf._same_data(panDat, xlsPanDat))

        pdf_shrunk = PanDatFactory(**{
            k: v
            for k, v in dietSchema().items() if k != "nutritionQuantities"
        })
        self.assertTrue(len(pdf_shrunk.all_tables) == len(pdf.all_tables) - 1)
        xlsPanDatShrunk = pdf_shrunk.xls.create_pan_dat(filePath)
        self.assertTrue(pdf_shrunk._same_data(panDat, xlsPanDatShrunk))
        filePathShrunk = os.path.join(_scratchDir, "diet_shrunk.xlsx")
        self.assertTrue(
            self.firesException(
                lambda: pdf.xls.create_pan_dat(filePathShrunk)))
        pdf_shrunk.xls.write_file(panDat, filePathShrunk)
        xlsPanDatShrunk = pdf.xls.create_pan_dat(filePathShrunk)
        self.assertTrue(pdf_shrunk._same_data(panDat, xlsPanDatShrunk))

        pdf2 = PanDatFactory(**{t: '*' for t in pdf.all_tables})
        pdf2.xls.write_file(panDat, filePath)
        xlsPanDat = pdf2.xls.create_pan_dat(filePath)
        self.assertTrue(pdf._same_data(panDat, xlsPanDat))

        tdf = TicDatFactory(**netflowSchema())
        pdf = PanDatFactory(**netflowSchema())
        ticDat = tdf.freeze_me(
            tdf.TicDat(
                **
                {t: getattr(netflowData(), t)
                 for t in tdf.primary_key_fields}))
        panDat = pan_dat_maker(netflowSchema(), ticDat)
        filePath = os.path.join(_scratchDir, "netflow.xlsx")
        pdf.xls.write_file(panDat, filePath)
        panDat2 = pdf.xls.create_pan_dat(filePath)
        self.assertTrue(pdf._same_data(panDat, panDat2))
        pdf2 = PanDatFactory(**{t: '*' for t in pdf.all_tables})
        xlsPanDat = pdf2.xls.create_pan_dat(filePath)
        self.assertTrue(pdf._same_data(panDat, xlsPanDat))
Example #15
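CSV directory round trips for diet and netflow data, including writing and reading with decimal=",".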
    def testCsvSimple(self):
        if not self.can_run:
            return
        tdf = TicDatFactory(**dietSchema())
        pdf = PanDatFactory(**dietSchema())
        ticDat = tdf.freeze_me(
            tdf.TicDat(
                **{t: getattr(dietData(), t)
                   for t in tdf.primary_key_fields}))
        panDat = pan_dat_maker(dietSchema(), ticDat)
        dirPath = os.path.join(_scratchDir, "diet_csv")
        pdf.csv.write_directory(panDat, dirPath)
        panDat2 = pdf.csv.create_pan_dat(dirPath)
        self.assertTrue(pdf._same_data(panDat, panDat2))
        pdf2 = PanDatFactory(**{t: '*' for t in pdf.all_tables})
        panDat2 = pdf2.csv.create_pan_dat(dirPath)
        self.assertTrue(pdf._same_data(panDat, panDat2))

        tdf = TicDatFactory(**netflowSchema())
        pdf = PanDatFactory(**netflowSchema())
        ticDat = tdf.freeze_me(
            tdf.TicDat(
                **
                {t: getattr(netflowData(), t)
                 for t in tdf.primary_key_fields}))
        panDat = pan_dat_maker(netflowSchema(), ticDat)
        dirPath = os.path.join(_scratchDir, "netflow_csv")
        pdf.csv.write_directory(panDat, dirPath)
        panDat2 = pdf.csv.create_pan_dat(dirPath)
        self.assertTrue(pdf._same_data(panDat, panDat2))
        pdf2 = PanDatFactory(**{t: '*' for t in pdf.all_tables})
        pdf2.csv.write_directory(panDat, dirPath)
        panDat2 = pdf2.csv.create_pan_dat(dirPath)
        self.assertTrue(pdf._same_data(panDat, panDat2))

        tdf = TicDatFactory(**dietSchema())
        pdf = PanDatFactory(**dietSchema())
        ticDat = tdf.freeze_me(
            tdf.TicDat(
                **{t: getattr(dietData(), t)
                   for t in tdf.primary_key_fields}))
        panDat = pan_dat_maker(dietSchema(), ticDat)
        dirPath = os.path.join(_scratchDir, "diet_csv")
        pdf.csv.write_directory(panDat, dirPath, decimal=",")
        panDat2 = pdf.csv.create_pan_dat(dirPath)
        self.assertFalse(pdf._same_data(panDat, panDat2))
        panDat2 = pdf.csv.create_pan_dat(dirPath, decimal=",")
        self.assertTrue(pdf._same_data(panDat, panDat2))
Example #16
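Checks find_duplicates with the as_table and keep options on a schema whose rows deliberately repeat primary key values.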
 def testFindDups(self):
     pdf = PanDatFactory(**sillyMeSchema())
     tdf = TicDatFactory(
         **{
             k: [[], list(pkfs) + list(dfs)]
             for k, (pkfs, dfs) in sillyMeSchema().items()
         })
     rows = [(1, 2, 3, 4), (1, 20, 30, 40), (10, 20, 30, 40)]
     ticDat = tdf.TicDat(**{t: rows for t in tdf.all_tables})
     panDat = pdf.copy_pan_dat(copy_to_pandas_with_reset(tdf, ticDat))
     dups = pdf.find_duplicates(panDat)
     self.assertTrue(set(dups) == {'a'} and set(dups['a']['aField']) == {1})
     dups = pdf.find_duplicates(panDat, as_table=False, keep=False)
     self.assertTrue(
         set(dups) == {'a'} and dups['a'].value_counts()[True] == 2)
     dups = pdf.find_duplicates(panDat, as_table=False)
     self.assertTrue(
         set(dups) == {'a'} and dups['a'].value_counts()[True] == 1)
     rows = [(1, 2, 3, 4), (1, 20, 30, 40), (10, 20, 30, 40), (1, 2, 3, 40)]
     ticDat = tdf.TicDat(**{t: rows for t in tdf.all_tables})
     panDat = pdf.copy_pan_dat(copy_to_pandas_with_reset(tdf, ticDat))
     dups = pdf.find_duplicates(panDat, keep=False)
     self.assertTrue(
         set(dups) == {'a', 'b'} and set(dups['a']['aField']) == {1})
     dups = pdf.find_duplicates(panDat, as_table=False, keep=False)
     self.assertTrue({k: v.value_counts()[True]
                      for k, v in dups.items()} == {
                          'a': 3,
                          'b': 2
                      })
Example #17
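Declares parameters with add_parameter and confirms that parameter rows, including an infinite value and the string "05701", survive SQL, CSV, JSON and xlsx round trips.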
 def test_parameters(self):
     core_path = os.path.join(_scratchDir, "parameters")
     pdf = PanDatFactory(parameters=[["Key"], ["Value"]])
     pdf.add_parameter("Something", 100)
     pdf.add_parameter("Different",
                       'boo',
                       strings_allowed='*',
                       number_allowed=False)
     dat = TicDatFactory(**pdf.schema()).TicDat(
         parameters=[["Something", float("inf")], ["Different", "inf"]])
     dat = TicDatFactory(**pdf.schema()).copy_to_pandas(
         dat, drop_pk_columns=False)
     for attr, path in [["sql", core_path + ".db"],
                        ["csv", core_path + "_csv"],
                        ["json", core_path + ".json"],
                        ["xls", core_path + ".xlsx"]]:
         func = "write_directory" if attr == "csv" else "write_file"
         getattr(getattr(pdf, attr), func)(dat, path)
         dat_1 = getattr(pdf, attr).create_pan_dat(path)
         self.assertTrue(pdf._same_data(dat, dat_1))
     core_path = os.path.join(_scratchDir, "parameters_two")
     dat = TicDatFactory(**pdf.schema()).TicDat(
         parameters=[["Something", float("inf")], ["Different", "05701"]])
     dat = TicDatFactory(**pdf.schema()).copy_to_pandas(
         dat, drop_pk_columns=False)
     for attr, path in [["sql", core_path + ".db"],
                        ["csv", core_path + "_csv"],
                        ["xls", core_path + ".xlsx"],
                        ["json", core_path + ".json"]]:
         func = "write_directory" if attr == "csv" else "write_file"
         getattr(getattr(pdf, attr), func)(dat, path)
         dat_1 = getattr(pdf, attr).create_pan_dat(path)
         self.assertTrue(pdf._same_data(dat, dat_1))
Example #18
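Uses an "_active" column hack in the mocked input set and checks that the inactive category is purged unless raw_data=True is passed to the opalytics reader.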
    def testDietCleaningOpalytics(self):
        sch = dietSchema()
        sch["categories"][-1].append("_active")
        tdf1 = TicDatFactory(**dietSchema())
        tdf2 = TicDatFactory(**sch)

        ticDat2 = tdf2.copy_tic_dat(dietData())
        for v in ticDat2.categories.values():
            v["_active"] = True
        ticDat2.categories["fat"]["_active"] = False
        ticDat1 = tdf1.copy_tic_dat(dietData())

        input_set = create_inputset_mock_with_active_hack(tdf2, ticDat2)
        pdf1 = PanDatFactory(**tdf1.schema())
        panDat = pdf1.opalytics.create_pan_dat(input_set, raw_data=True)
        self.assertTrue(tdf1._same_data(pdf1.copy_to_tic_dat(panDat), ticDat1))

        panDatPurged = pdf1.opalytics.create_pan_dat(input_set)
        self.assertFalse(
            tdf1._same_data(pdf1.copy_to_tic_dat(panDatPurged), ticDat1))

        ticDat1.categories.pop("fat")
        tdf1.remove_foreign_key_failures(ticDat1)
        self.assertTrue(
            tdf1._same_data(pdf1.copy_to_tic_dat(panDatPurged), ticDat1))
Example #19
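Rows violating a set_data_type rule (maxNutrition at least 66) are dropped by the opalytics reader, along with their foreign key dependents, unless raw_data=True.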
    def testDietCleaningOpalyticsTwo(self):
        tdf = TicDatFactory(**dietSchema())
        addDietForeignKeys(tdf)
        tdf.set_data_type("categories",
                          "maxNutrition",
                          min=66,
                          inclusive_max=True)
        ticDat = tdf.copy_tic_dat(dietData())

        input_set = create_inputset_mock(tdf, ticDat)
        pdf = PanDatFactory(**dietSchema())
        addDietForeignKeys(pdf)
        pdf.set_data_type("categories",
                          "maxNutrition",
                          min=66,
                          inclusive_max=True)

        panDat = pdf.opalytics.create_pan_dat(input_set, raw_data=True)
        self.assertTrue(tdf._same_data(pdf.copy_to_tic_dat(panDat), ticDat))

        panDatPurged = pdf.opalytics.create_pan_dat(input_set, raw_data=False)
        self.assertFalse(
            tdf._same_data(pdf.copy_to_tic_dat(panDatPurged), ticDat))

        ticDat.categories.pop("fat")
        self.assertFalse(
            tdf._same_data(pdf.copy_to_tic_dat(panDatPurged), ticDat))
        tdf.remove_foreign_key_failures(ticDat)
        self.assertTrue(
            tdf._same_data(pdf.copy_to_tic_dat(panDatPurged), ticDat))
Example #20
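The same cleaning scenario as the previous example, but the rule is expressed with add_data_row_predicate rather than set_data_type.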
    def testDietCleaningOpalytisThree(self):
        tdf = TicDatFactory(**dietSchema())
        tdf.add_data_row_predicate("categories",
                                   lambda row: row["maxNutrition"] >= 66)
        addDietForeignKeys(tdf)
        ticDat = tdf.copy_tic_dat(dietData())

        pdf = PanDatFactory(**tdf.schema())
        pdf.add_data_row_predicate("categories",
                                   lambda row: row["maxNutrition"] >= 66)
        addDietForeignKeys(pdf)

        input_set = create_inputset_mock(tdf, ticDat)

        panDat = pdf.opalytics.create_pan_dat(input_set, raw_data=True)
        self.assertTrue(tdf._same_data(pdf.copy_to_tic_dat(panDat), ticDat))

        panDatPurged = pdf.opalytics.create_pan_dat(input_set, raw_data=False)
        self.assertFalse(
            tdf._same_data(pdf.copy_to_tic_dat(panDatPurged), ticDat))

        ticDat.categories.pop("fat")
        self.assertFalse(
            tdf._same_data(pdf.copy_to_tic_dat(panDatPurged), ticDat))
        tdf.remove_foreign_key_failures(ticDat)
        self.assertTrue(
            tdf._same_data(pdf.copy_to_tic_dat(panDatPurged), ticDat))
Example #21
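Confirms the opalytics reader keeps duplicated rows when raw_data=True and removes them otherwise.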
 def testDupsOpalytics(self):
     if not self.can_run:
         return
     for hack in [True, False]:
         tdf = TicDatFactory(one=[["a"], ["b", "c"]],
                             two=[["a", "b"], ["c"]],
                             three=[["a", "b", "c"], []])
         tdf2 = TicDatFactory(
             **{t: [[], ["a", "b", "c"]]
                for t in tdf.all_tables})
         td = tdf2.TicDat(
             **{
                 t: [[1, 2, 1], [1, 2, 2], [2, 1, 3], [2, 2, 3], [1, 2, 2],
                     ["new", 1, 2]]
                 for t in tdf.all_tables
             })
         inputset = create_inputset_mock(tdf2, td, hack)
         pdf = PanDatFactory(**tdf.schema())
         panDat = pdf.opalytics.create_pan_dat(inputset, raw_data=True)
         self.assertTrue(
             all(len(getattr(panDat, t)) == 6 for t in tdf.all_tables))
         panDat = pdf.opalytics.create_pan_dat(inputset, raw_data=False)
         self.assertTrue(
             all(len(getattr(panDat, t)) < 6 for t in tdf.all_tables))
         td_1 = tdf.TicDat(
             **{
                 t: [[1, 2, 1], [1, 2, 2], [2, 1, 3], [2, 2, 3], [1, 2, 2],
                     ["new", 1, 2]]
                 for t in tdf.all_tables
             })
         td_2 = pdf.copy_to_tic_dat(panDat)
         self.assertTrue(
             all(
                 set(getattr(td_1, t)) == set(getattr(td_2, t))
                 for t in tdf.all_tables))
Example #22
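Rows whose cData4 value fails a strings_allowed=['d'] data type are removed by the opalytics reader when cleaning is enabled.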
    def testSillyCleaningOpalyticsOne(self):
        tdf = TicDatFactory(**sillyMeSchema())
        tdf.set_data_type("c",
                          "cData4",
                          number_allowed=False,
                          strings_allowed=['d'])
        ticDat = tdf.TicDat(**sillyMeData())

        input_set = create_inputset_mock(tdf, ticDat)

        pdf = PanDatFactory(**sillyMeSchema())
        pdf.set_data_type("c",
                          "cData4",
                          number_allowed=False,
                          strings_allowed=['d'])

        panDat = pdf.opalytics.create_pan_dat(input_set, raw_data=True)
        self.assertTrue(tdf._same_data(pdf.copy_to_tic_dat(panDat), ticDat))

        panDatPurged = pdf.opalytics.create_pan_dat(input_set, raw_data=False)
        self.assertFalse(
            tdf._same_data(pdf.copy_to_tic_dat(panDatPurged), ticDat))

        ticDat.c.pop()
        ticDat.c.pop(0)
        self.assertTrue(
            tdf._same_data(pdf.copy_to_tic_dat(panDatPurged), ticDat))
Example #23
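Uses the max_failures argument of find_data_row_failures to cap how many failing rows (and how many tables) are reported.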
 def test_data_row_max_failures(self):
     pdf = PanDatFactory(table_one=[["Field"], []],
                         table_two=[[], ["Field"]])
     for t in ["table_one", "table_two"]:
         pdf.set_data_type(t, "Field")
     for table, dts in pdf.data_types.items():
         for field, dt in dts.items():
             if table == "table_one":
                 pdf.add_data_row_predicate(
                     table, lambda row: dt.valid_data(row["Field"]))
             else:
                 pdf.add_data_row_predicate(
                     table,
                     lambda row: True
                     if not dt.valid_data(row["Field"]) else "Oops",
                     predicate_failure_response="Error Message")
     dat = pdf.PanDat(table_one=DataFrame(
         {"Field": list(range(1, 11)) + [-_ for _ in range(1, 11)]}),
                      table_two=DataFrame(
                          {"Field": [10.1] * 10 + [-2] * 10}))
     errs = pdf.find_data_row_failures(dat)
     self.assertTrue(
         len(errs) == 2 and all(len(_) == 10 for _ in errs.values()))
     errs = pdf.find_data_row_failures(dat, max_failures=11)
     self.assertTrue(len(errs) == 2)
     self.assertTrue(
         any(len(_) == 10 for _ in errs.values())
         and any(len(_) == 1 for _ in errs.values()))
     errs = pdf.find_data_row_failures(dat, max_failures=10)
     self.assertTrue(
         len(errs) == 1 and all(len(_) == 10 for _ in errs.values()))
     errs = pdf.find_data_row_failures(dat, max_failures=9)
     self.assertTrue(
         len(errs) == 1 and all(len(_) == 9 for _ in errs.values()))
Example #24
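Round trips a table with a nullable string field through every backend, driving the writes and reads through the internal _get_write_function_and_kwargs and _get_dat_object helpers.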
    def test_nullables(self):
        core_path = os.path.join(_scratchDir, "nullables")
        pdf = PanDatFactory(table_with_stuffs=[["field one"], ["field two"]])
        pdf.set_data_type("table_with_stuffs", "field one")
        pdf.set_data_type("table_with_stuffs",
                          "field two",
                          number_allowed=False,
                          strings_allowed='*',
                          nullable=True)
        dat = TicDatFactory(**pdf.schema()).TicDat(
            table_with_stuffs=[[101, "022"], [202, None], [303, "111"]])
        dat = TicDatFactory(**pdf.schema()).copy_to_pandas(
            dat, drop_pk_columns=False)
        self.assertFalse(pdf.find_data_type_failures(dat))

        for attr, path in [["csv", core_path + "_csv"],
                           ["xls", core_path + ".xlsx"],
                           ["sql", core_path + ".db"],
                           ["json", core_path + ".json"]]:
            f_or_d = "directory" if attr == "csv" else "file"
            write_func, write_kwargs = utils._get_write_function_and_kwargs(
                pdf, path, f_or_d)
            write_func(dat, path, **write_kwargs)
            dat_1 = utils._get_dat_object(pdf, "create_pan_dat", path, f_or_d,
                                          False)
            self.assertTrue(
                pdf._same_data(dat, dat_1, nans_are_same_for_data_rows=True))
Example #25
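A helper (apparently nested inside a larger test, since it refers to pandat, pandat_2 and self from the enclosing scope) that compares plain boolean predicates against "Error Message" predicates, including how exceptions raised inside a predicate are handled.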
 def perform_predicate_checks(sch):
     pdf = PanDatFactory(**sch)
     pdf.add_data_row_predicate("foods", lambda row: numericish(row["cost"]) and not isnan(row["cost"]), "cost")
     good_qty = lambda qty : 5 < qty <= 12
     pdf.add_data_row_predicate("nutritionQuantities", lambda row: good_qty(row["qty"]), "qty")
     pdf.add_data_row_predicate("categories",
                                lambda row: row["maxNutrition"] >= row["minNutrition"],
                                "minmax")
     pdf2 = PanDatFactory(**sch)
     def make_error_message_predicate(f, name):
         def error_message_predicate(row):
             rtn = f(row)
             if rtn:
                 return True
             return f"{name} failed!"
         return error_message_predicate
     for t, preds in pdf._data_row_predicates.items():
         for p_name, rpi in preds.items():
             pdf2.add_data_row_predicate(t, make_error_message_predicate(rpi.predicate, p_name),
                                         predicate_name=p_name, predicate_failure_response="Error Message")
     failed = pdf.find_data_row_failures(pandat)
     failed2 = pdf2.find_data_row_failures(pandat)
     self.assertTrue(set(failed) == set(failed2) ==  {('foods', 'cost'),
                                     ('nutritionQuantities', 'qty'), ('categories', 'minmax')})
     self.assertTrue(set(failed['foods', 'cost']["name"]) == set(failed2['foods', 'cost']["name"]) == {'b'})
     for f in [failed, failed2]:
         self.assertTrue(set({(v["food"], v["category"])
                              for v in f['nutritionQuantities', 'qty'].T.to_dict().values()}) ==
                             {('b', '1'), ('a', '2'), ('a', '1'), ('b', '2')})
         self.assertTrue(set(f['categories', 'minmax']["name"]) == {'2'})
     for t, n in failed2:
         self.assertTrue(set(failed2[t, n]["Error Message"]) == {f'{n} failed!'})
     for _pdf in [pdf, pdf2]:
         failed = _pdf.find_data_row_failures(pandat, as_table=False)
         self.assertTrue(4 == failed['nutritionQuantities', 'qty'].value_counts()[True])
         ex = []
         try:
             _pdf.find_data_row_failures(pandat_2)
         except Exception as e:
             ex[:] = [str(e.__class__)]
         self.assertTrue("TypeError" in ex[0])
         failed = _pdf.find_data_row_failures(pandat_2, exception_handling="Handled as Failure")
         self.assertTrue(set(failed['categories', 'minmax']["name"]) == {'2', '3'})
     failed = pdf2.find_data_row_failures(pandat_2, exception_handling="Handled as Failure")
     df = failed['categories', 'minmax']
     err_str = list(df[df['name'] == '3']["Error Message"])[0]
     self.assertTrue(err_str=="Exception<'>=' not supported between instances of 'int' and 'str'>")
Example #26
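Writes data with a one-table schema and reads it back with a factory that also expects a second table, confirming the missing table is tolerated.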
 def test_missing_tables(self):
     core_path = os.path.join(_scratchDir, "missing_tables")
     pdf_1 = PanDatFactory(this=[["Something"], ["Another"]])
     pdf_2 = PanDatFactory(
         **dict(pdf_1.schema(), that=[["What", "Ever"], []]))
     dat = pdf_1.PanDat(this={
         "Something": ["a", "b", "c"],
         "Another": [2, 3, 5]
     })
     for attr, path in [["sql", core_path + ".db"],
                        ["csv", core_path + "_csv"],
                        ["json", core_path + ".json"],
                        ["xls", core_path + ".xlsx"]]:
         func = "write_directory" if attr == "csv" else "write_file"
         getattr(getattr(pdf_1, attr), func)(dat, path)
         dat_1 = getattr(pdf_2, attr).create_pan_dat(path)
         self.assertTrue(pdf_1._same_data(dat, dat_1))
Example #27
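A small helper that registers several parameters, including an untyped one, and rebuilds the PanDatFactory from its full schema via create_from_full_schema.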
 def make_pdf():
     pdf = PanDatFactory(data_table = [["a"], ["b", "c"]],
                         parameters = [["a"], ["b"]])
     pdf.add_parameter("Something", 100, max=100, inclusive_max=True)
     pdf.add_parameter("Another thing", 5, must_be_int=True)
     pdf.add_parameter("Untyped thing", "whatever", enforce_type_rules=False)
     pdf.add_parameter("Last", 'boo', number_allowed=False, strings_allowed='*')
     return PanDatFactory.create_from_full_schema(pdf.schema(True))
Example #28
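Declares datetime data types and parameters and round trips them through every backend; the re-read frames are not identical to the originals, but they still pass the data type and data row checks, and the values come back as datetime-like objects or nulls.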
    def test_datetime(self):
        core_path = os.path.join(_scratchDir, "parameters")
        pdf = PanDatFactory(table_with_stuffs=[["field one"], ["field two"]],
                            parameters=[["a"], ["b"]])
        pdf.add_parameter("p1", "Dec 15 1970", datetime=True)
        pdf.add_parameter("p2", None, datetime=True, nullable=True)
        pdf.set_data_type("table_with_stuffs", "field one", datetime=True)
        pdf.set_data_type("table_with_stuffs",
                          "field two",
                          datetime=True,
                          nullable=True)
        dat = TicDatFactory(**pdf.schema()).TicDat(
            table_with_stuffs=[[dateutil.parser.parse("July 11 1972"), None],
                               [
                                   datetime.datetime.now(),
                                   dateutil.parser.parse("Sept 11 2011")
                               ]],
            parameters=[["p1", "7/11/1911"], ["p2", None]])
        dat = TicDatFactory(**pdf.schema()).copy_to_pandas(
            dat, drop_pk_columns=False)
        self.assertFalse(
            pdf.find_data_type_failures(dat)
            or pdf.find_data_row_failures(dat))

        for attr, path in [["csv", core_path + "_csv"],
                           ["xls", core_path + ".xlsx"],
                           ["sql", core_path + ".db"],
                           ["json", core_path + ".json"]]:
            func = "write_directory" if attr == "csv" else "write_file"
            getattr(getattr(pdf, attr), func)(dat, path)
            dat_1 = getattr(pdf, attr).create_pan_dat(path)
            self.assertFalse(pdf._same_data(dat, dat_1))
            self.assertFalse(
                pdf.find_data_type_failures(dat_1)
                or pdf.find_data_row_failures(dat_1))
            dat_1 = pdf.copy_to_tic_dat(dat_1)
            self.assertTrue(set(dat_1.parameters) == {'p1', 'p2'})
            self.assertTrue(
                isinstance(dat_1.parameters["p1"]["b"],
                           (datetime.datetime, numpy.datetime64))
                and not pd.isnull(dat_1.parameters["p1"]["b"]))
            self.assertTrue(pd.isnull(dat_1.parameters["p2"]["b"]))
            self.assertTrue(
                all(
                    isinstance(_, (datetime.datetime,
                                   numpy.datetime64)) and not pd.isnull(_)
                    for _ in dat_1.table_with_stuffs))
            self.assertTrue(
                all(
                    isinstance(_, (datetime.datetime, numpy.datetime64))
                    or _ is None or utils.safe_apply(math.isnan)(_)
                    for v in dat_1.table_with_stuffs.values()
                    for _ in v.values()))
            self.assertTrue({
                pd.isnull(_)
                for v in dat_1.table_with_stuffs.values() for _ in v.values()
            } == {True, False})
Example #29
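Round trips netflow data through the opalytics connector for each hack / raw_data combination, including after a node row with no data fields is added.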
    def testNetflowOpalytics(self):
        if not self.can_run:
            return
        for hack, raw_data in list(itertools.product(*(([True, False], ) *
                                                       2))):
            tdf = TicDatFactory(**netflowSchema())
            ticDat = tdf.copy_tic_dat(netflowData())
            inputset = create_inputset_mock(tdf, ticDat, hack)
            pdf = PanDatFactory(**tdf.schema())
            panDat = pdf.opalytics.create_pan_dat(inputset, raw_data=raw_data)
            self.assertTrue(tdf._same_data(ticDat,
                                           pdf.copy_to_tic_dat(panDat)))

            ticDat.nodes[12] = {}
            inputset = create_inputset_mock(tdf, ticDat, hack)
            pdf = PanDatFactory(**tdf.schema())
            panDat = pdf.opalytics.create_pan_dat(inputset, raw_data=raw_data)
            self.assertTrue(tdf._same_data(ticDat,
                                           pdf.copy_to_tic_dat(panDat)))
Example #30
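CSV directory round trips with case_space_table_names=True, and for the netflow data also with a custom separator (sep=":").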
    def testCsvSpacey(self):
        if not self.can_run:
            return
        self.assertTrue(pandatio.sql,
                        "this unit test requires SQLite installed")

        tdf = TicDatFactory(**spacesSchema())
        pdf = PanDatFactory(**spacesSchema())
        ticDat = tdf.TicDat(
            **{
                "a_table": {
                    1: [1, 2, "3"],
                    22.2: (12, 0.12, "something"),
                    0.23: (11, 12, "thirt")
                },
                "b_table": {
                    (1, 2, "foo"): 1,
                    (1012.22, 4, "0012"): 12
                },
                "c_table": (("this", 2, 3, 4), ("that", 102.212, 3, 5.5),
                            ("another", 5, 12.5, 24))
            })
        panDat = pan_dat_maker(spacesSchema(), ticDat)
        dirPath = os.path.join(_scratchDir, "spaces_2_csv")
        pdf.csv.write_directory(panDat, dirPath, case_space_table_names=True)
        panDat2 = pdf.csv.create_pan_dat(dirPath)
        self.assertTrue(pdf._same_data(panDat, panDat2))

        tdf = TicDatFactory(**netflowSchema())
        pdf = PanDatFactory(**netflowSchema())
        ticDat = tdf.freeze_me(
            tdf.TicDat(
                **
                {t: getattr(netflowData(), t)
                 for t in tdf.primary_key_fields}))
        panDat = pan_dat_maker(netflowSchema(), ticDat)
        dirPath = os.path.join(_scratchDir, "spaces_2_2_csv")
        pdf.csv.write_directory(panDat,
                                dirPath,
                                case_space_table_names=True,
                                sep=":")
        panDat2 = pdf.csv.create_pan_dat(dirPath, sep=":")
        self.assertTrue(pdf._same_data(panDat, panDat2))