Пример #1
0
 def test_parameters(self):
     """Round-trip a parameters table through the sql, csv, json and xls writers.

     Two data sets are checked. In both, "Something" is float("inf"); the
     "Different" parameter (declared string-only) is "inf" in the first and
     "05701" in the second. Each data set is written with every format and
     whatever is read back must be the same data that was written.
     """
     pdf = PanDatFactory(parameters=[["Key"], ["Value"]])
     pdf.add_parameter("Something", 100)
     pdf.add_parameter("Different",
                       'boo',
                       number_allowed=False,
                       strings_allowed='*')

     def check_round_trips(scratch_name, different_value, targets):
         # Build the pandas data once, then write/read it with each format.
         base = os.path.join(_scratchDir, scratch_name)
         tdf = TicDatFactory(**pdf.schema())
         tic_dat = tdf.TicDat(parameters=[["Something", float("inf")],
                                          ["Different", different_value]])
         pan_dat = tdf.copy_to_pandas(tic_dat, drop_pk_columns=False)
         for fmt, suffix in targets:
             target = base + suffix
             # csv goes to a directory; every other format to a single file
             writer_name = "write_file"
             if fmt == "csv":
                 writer_name = "write_directory"
             getattr(getattr(pdf, fmt), writer_name)(pan_dat, target)
             reloaded = getattr(pdf, fmt).create_pan_dat(target)
             self.assertTrue(pdf._same_data(pan_dat, reloaded))

     check_round_trips("parameters", "inf",
                       [("sql", ".db"), ("csv", "_csv"),
                        ("json", ".json"), ("xls", ".xlsx")])
     check_round_trips("parameters_two", "05701",
                       [("sql", ".db"), ("csv", "_csv"),
                        ("xls", ".xlsx"), ("json", ".json")])
Пример #2
0
    def test_nullables(self):
        """Round-trip a table with a nullable string field through every format.

        "field two" is string-only and nullable; one of the three rows holds
        None there. The data must have no data type failures, and after each
        write/read cycle must compare equal when nans in data rows are
        treated as the same.
        """
        pdf = PanDatFactory(table_with_stuffs=[["field one"], ["field two"]])
        pdf.set_data_type("table_with_stuffs", "field one")
        pdf.set_data_type("table_with_stuffs",
                          "field two",
                          nullable=True,
                          strings_allowed='*',
                          number_allowed=False)

        tdf = TicDatFactory(**pdf.schema())
        tic_dat = tdf.TicDat(
            table_with_stuffs=[[101, "022"], [202, None], [303, "111"]])
        pan_dat = tdf.copy_to_pandas(tic_dat, drop_pk_columns=False)
        self.assertFalse(pdf.find_data_type_failures(pan_dat))

        base = os.path.join(_scratchDir, "nullables")
        targets = (("csv", "_csv"), ("xls", ".xlsx"), ("sql", ".db"),
                   ("json", ".json"))
        for fmt, suffix in targets:
            path = base + suffix
            # csv is handled as a directory, everything else as a file
            kind = "file"
            if fmt == "csv":
                kind = "directory"
            write_func, write_kwargs = utils._get_write_function_and_kwargs(
                pdf, path, kind)
            write_func(pan_dat, path, **write_kwargs)
            reloaded = utils._get_dat_object(pdf, "create_pan_dat", path,
                                             kind, False)
            unchanged = pdf._same_data(pan_dat,
                                       reloaded,
                                       nans_are_same_for_data_rows=True)
            self.assertTrue(unchanged)
Пример #3
0
    def testXToManyTwo(self):
        """Check foreign key failure detection/removal for three child tables.

        The same parent/child schema is built twice: first with foreign keys
        on F1, F2 and F3, then on F1 and F3 only. In each case clean data
        has no failures, three deliberately bad child rows produce three
        failing foreign keys, and removing the failures restores the
        original data.
        """
        rows = [[1, 2, 3], [1, 2.1, 3], [4, 5, 6], [4, 5.1, 6], [7, 8, 9]]
        children = ["child_one", "child_two", "child_three"]

        def make_schema(linked_fields):
            # Every child references the parent on the same-named fields.
            schema = PanDatFactory(parent=[["F1", "F2"], ["F3"]],
                                   child_one=[["F1", "F2", "F3"], []],
                                   child_two=[["F1", "F2"], ["F3"]],
                                   child_three=[[], ["F1", "F2", "F3"]])
            for child in children:
                schema.add_foreign_key(child, "parent",
                                       [[f, f] for f in linked_fields])
            return schema

        def check_removal(schema, child_two_value):
            tdf = TicDatFactory(**schema.schema())
            dat = tdf.TicDat(parent=rows,
                             child_one=rows,
                             child_two=rows,
                             child_three=rows)
            table_sizes = [len(getattr(dat, t)) for t in schema.all_tables]
            self.assertTrue(all(size == 5 for size in table_sizes))
            clean = schema.copy_pan_dat(copy_to_pandas_with_reset(tdf, dat))
            self.assertFalse(schema.find_foreign_key_failures(clean))

            # one bad row per child table
            dat.child_one[1, 2, 4] = {}
            dat.child_two[1, 2.2] = child_two_value
            dat.child_three.append([1, 2, 4])
            dirty = schema.copy_pan_dat(copy_to_pandas_with_reset(tdf, dat))
            failures = schema.find_foreign_key_failures(dirty)
            self.assertTrue(len(failures) == 3)

            schema.remove_foreign_key_failures(dirty)
            self.assertFalse(schema.find_foreign_key_failures(dirty))
            self.assertTrue(schema._same_data(clean, dirty))

        three_field_schema = make_schema(["F1", "F2", "F3"])
        cardinalities = {
            fk.cardinality
            for fk in three_field_schema.foreign_keys
        }
        self.assertTrue(cardinalities == {"one-to-one", "many-to-one"})
        check_removal(three_field_schema, 3)

        check_removal(make_schema(["F1", "F3"]), 4)
Пример #4
0
    def test_datetime(self):
        """Check that datetime fields and parameters survive a write/read cycle.

        A table whose key and data field are both datetime typed (the data
        field nullable), plus two datetime parameters (one of them null), is
        written with the csv, xls, sql and json writers. What is read back
        must have no data type or data row failures, and every non-null
        value must come back as a datetime.
        """
        # Scratch name unique to this test: test_parameters already writes
        # under "parameters", and sharing that base name would let the two
        # tests overwrite or pick up each other's files.
        core_path = os.path.join(_scratchDir, "datetime")
        pdf = PanDatFactory(table_with_stuffs=[["field one"], ["field two"]],
                            parameters=[["a"], ["b"]])
        pdf.add_parameter("p1", "Dec 15 1970", datetime=True)
        pdf.add_parameter("p2", None, datetime=True, nullable=True)
        pdf.set_data_type("table_with_stuffs", "field one", datetime=True)
        pdf.set_data_type("table_with_stuffs",
                          "field two",
                          datetime=True,
                          nullable=True)
        dat = TicDatFactory(**pdf.schema()).TicDat(
            table_with_stuffs=[[dateutil.parser.parse("July 11 1972"), None],
                               [
                                   datetime.datetime.now(),
                                   dateutil.parser.parse("Sept 11 2011")
                               ]],
            parameters=[["p1", "7/11/1911"], ["p2", None]])
        dat = TicDatFactory(**pdf.schema()).copy_to_pandas(
            dat, drop_pk_columns=False)
        self.assertFalse(
            pdf.find_data_type_failures(dat)
            or pdf.find_data_row_failures(dat))

        datetime_types = (datetime.datetime, numpy.datetime64)
        for attr, path in [["csv", core_path + "_csv"],
                           ["xls", core_path + ".xlsx"],
                           ["sql", core_path + ".db"],
                           ["json", core_path + ".json"]]:
            # csv is written as a directory, every other format as one file
            func = "write_directory" if attr == "csv" else "write_file"
            getattr(getattr(pdf, attr), func)(dat, path)
            dat_1 = getattr(pdf, attr).create_pan_dat(path)
            # NOTE(review): the reloaded data is expected to differ from what
            # was written - presumably because "p1" goes out as the string
            # "7/11/1911" and comes back as a datetime; confirm.
            self.assertFalse(pdf._same_data(dat, dat_1))
            self.assertFalse(
                pdf.find_data_type_failures(dat_1)
                or pdf.find_data_row_failures(dat_1))

            tic_dat_1 = pdf.copy_to_tic_dat(dat_1)
            self.assertTrue(set(tic_dat_1.parameters) == {'p1', 'p2'})
            p1_value = tic_dat_1.parameters["p1"]["b"]
            self.assertTrue(
                isinstance(p1_value, datetime_types)
                and not pd.isnull(p1_value))
            self.assertTrue(pd.isnull(tic_dat_1.parameters["p2"]["b"]))

            # Iterating the table yields its keys ("field one"): all must be
            # real, non-null datetimes.
            self.assertTrue(
                all(
                    isinstance(key, datetime_types) and not pd.isnull(key)
                    for key in tic_dat_1.table_with_stuffs))

            # Data values ("field two") are either datetimes or null.
            data_values = [
                value for row in tic_dat_1.table_with_stuffs.values()
                for value in row.values()
            ]
            self.assertTrue(
                all(
                    isinstance(value, datetime_types) or value is None
                    or utils.safe_apply(math.isnan)(value)
                    for value in data_values))
            # ... and both a null and a non-null value must be present.
            self.assertTrue(
                {pd.isnull(value) for value in data_values} == {True, False})
Пример #5
0
 def testDietWithInfFlagging(self):
     """Check the infinity io flag when writing and reading the diet data.

     The diet data is written with a factory whose infinity io flag is
     999999999. Reading it back with that factory, or with a clone of it,
     gives the same data. Reading it with a fresh factory built from the
     bare schema gives 999999999 for the protein maxNutrition; once that
     entry is set back to infinity the data matches again.
     """
     diet_pdf = PanDatFactory(**dietSchema())
     addDietDataTypes(diet_pdf)
     tdf = TicDatFactory(**dietSchema())
     dat = tdf.copy_to_pandas(tdf.copy_tic_dat(dietData()),
                              drop_pk_columns=False)
     diet_pdf.set_infinity_io_flag(999999999)

     base = os.path.join(_scratchDir, "diet_with_inf_flagging")
     targets = [("sql", base + ".db"), ("csv", base + "_csv"),
                ("json", base + ".json"), ("xls", base + ".xlsx")]

     # write everything first, then read each target back
     for fmt, target in targets:
         io_object = getattr(diet_pdf, fmt)
         if fmt == "csv":
             io_object.write_directory(dat, target)
         else:
             io_object.write_file(dat, target)

     for fmt, target in targets:
         # factories carrying the flag: the original, then a clone of it
         for get_factory in (lambda: diet_pdf, diet_pdf.clone):
             flagged_pdf = get_factory()
             reloaded = getattr(flagged_pdf, fmt).create_pan_dat(target)
             self.assertTrue(
                 flagged_pdf._same_data(dat, reloaded, epsilon=1e-5))

         # a factory rebuilt from the plain schema sees the raw flag value
         plain_pdf = PanDatFactory(**diet_pdf.schema())
         raw = getattr(plain_pdf, fmt).create_pan_dat(target)
         self.assertFalse(plain_pdf._same_data(dat, raw, epsilon=1e-5))
         is_protein = raw.categories["name"] == "protein"
         protein_max = list(raw.categories[is_protein]["maxNutrition"])[0]
         self.assertTrue(protein_max == 999999999)
         raw.categories.loc[is_protein, "maxNutrition"] = float("inf")
         self.assertTrue(plain_pdf._same_data(dat, raw, epsilon=1e-5))
Пример #6
0
 def make_pdf():
     """Build a PanDatFactory with four parameters and return a rebuilt copy.

     The factory is not returned directly: it is recreated from its full
     schema via PanDatFactory.create_from_full_schema.
     """
     builder = PanDatFactory(data_table = [["a"], ["b", "c"]],
                             parameters = [["a"], ["b"]])
     # (name, default value, keyword rules) for each parameter, in order
     parameter_specs = (
         ("Something", 100, {"max": 100, "inclusive_max": True}),
         ("Another thing", 5, {"must_be_int": True}),
         ("Untyped thing", "whatever", {"enforce_type_rules": False}),
         ("Last", 'boo', {"number_allowed": False, "strings_allowed": '*'}),
     )
     for name, default, rules in parameter_specs:
         builder.add_parameter(name, default, **rules)
     full_schema = builder.schema(True)
     return PanDatFactory.create_from_full_schema(full_schema)
Пример #7
0
    def testIssue45(self):
        """Check how numeric-looking strings come back from xlsx and csv files.

        With no data type set, strings such as "022" are read back as
        numbers, so the string data matches the numeric data after a round
        trip. With field "a" declared string-only, the key column is read
        back as strings instead.
        """
        untyped_pdf = PanDatFactory(data=[["a"], ["b"]])
        tdf = TicDatFactory(**untyped_pdf.schema())

        def as_pan_dat(rows):
            return tdf.copy_to_pandas(tdf.TicDat(data=rows),
                                      drop_pk_columns=False)

        def string_key_factory():
            # same schema, but field "a" only accepts strings
            factory = PanDatFactory(data=[["a"], ["b"]])
            factory.set_data_type("data",
                                  "a",
                                  number_allowed=False,
                                  strings_allowed='*')
            return factory

        dat_nums = as_pan_dat([[1, 2], [3, 4], [22, 44]])
        dat_strs = as_pan_dat([["1", "2"], ["3", "4"], ["022", "0044"]])
        dat_mixed = as_pan_dat([["1", 2], ["3", 4], ["022", 44]])

        # ---- xlsx ----
        xls_files = [
            os.path.join(_scratchDir, name)
            for name in ("dat_nums.xlsx", "dat_strs.xlsx")
        ]
        for source, target in zip((dat_nums, dat_strs), xls_files):
            untyped_pdf.xls.write_file(source, target)
        nums_back, strs_back = [
            untyped_pdf.xls.create_pan_dat(f) for f in xls_files
        ]
        self.assertTrue(untyped_pdf._same_data(dat_nums, nums_back))
        # pandas pushes the string data to numeric here
        self.assertFalse(untyped_pdf._same_data(dat_strs, strs_back))
        self.assertTrue(untyped_pdf._same_data(dat_nums, strs_back))

        typed_pdf = string_key_factory()
        nums_back, strs_back = [
            typed_pdf.xls.create_pan_dat(f) for f in xls_files
        ]
        self.assertFalse(typed_pdf._same_data(dat_nums, nums_back))
        self.assertFalse(typed_pdf._same_data(dat_strs, strs_back))
        self.assertFalse(typed_pdf._same_data(nums_back, dat_mixed))
        self.assertTrue(typed_pdf._same_data(strs_back, dat_mixed))

        # ---- csv ----
        untyped_pdf = PanDatFactory(data=[["a"], ["b"]])
        csv_dirs = [
            os.path.join(_scratchDir, name)
            for name in ("dat_nums_csv", "dat_strs_csv")
        ]
        for source, target in zip((dat_nums, dat_strs), csv_dirs):
            untyped_pdf.csv.write_directory(source, target)
        nums_back, strs_back = [
            untyped_pdf.csv.create_pan_dat(d) for d in csv_dirs
        ]
        self.assertTrue(untyped_pdf._same_data(dat_nums, nums_back))
        # pandas pushes the string data to numeric here
        self.assertFalse(untyped_pdf._same_data(dat_strs, strs_back))
        self.assertTrue(untyped_pdf._same_data(dat_nums, strs_back))

        typed_pdf = string_key_factory()
        nums_back, strs_back = [
            typed_pdf.csv.create_pan_dat(d) for d in csv_dirs
        ]
        self.assertFalse(typed_pdf._same_data(dat_nums, nums_back))
        self.assertFalse(typed_pdf._same_data(dat_strs, strs_back))
        self.assertFalse(typed_pdf._same_data(nums_back, strs_back))
        self.assertTrue(typed_pdf._same_data(strs_back, dat_mixed))
Пример #8
0
    def testAdditionalFKs(self):
        """Check foreign key failure removal when children reference two parents.

        pt3, pt4 and pt5 each reference pt1 on F1 and pt2 on F2. Rows that
        point at missing parents are added, must be reported as failures,
        and must disappear again when the failures are removed.
        """
        pdf = PanDatFactory(pt1=[["F1"], []],
                            pt2=[["F2"], []],
                            pt3=[["F1", "F2"], []],
                            pt4=[["F1"], ["F2"]],
                            pt5=[[], ["F1", "F2"]])
        for child in ("pt3", "pt4", "pt5"):
            for parent, field in (("pt1", "F1"), ("pt2", "F2")):
                pdf.add_foreign_key(child, parent, [field, field])
        tdf = TicDatFactory(**pdf.schema())

        def to_pan_dat(tic_dat):
            # convert to a PanDat and insist it has no duplicate rows
            converted = pdf.copy_pan_dat(
                copy_to_pandas_with_reset(tdf, tic_dat))
            self.assertFalse(pdf.find_duplicates(converted))
            return converted

        ticDat = tdf.TicDat(pt1=[1, 2, 3, 4], pt2=[5, 6, 7, 8])
        for f1 in range(1, 5):
            for f2 in range(5, 9):
                ticDat.pt3[f1, f2] = {}
                ticDat.pt4[f1] = f2
                ticDat.pt5.append((f1, f2))
        origDat = tdf.copy_tic_dat(ticDat, freeze_it=True)
        orig_lens = {t: len(getattr(origDat, t)) for t in tdf.all_tables}
        self.assertFalse(pdf.find_foreign_key_failures(to_pan_dat(origDat)))

        def add_orphans(include_pt5):
            # rows whose F1 or F2 value has no matching parent row
            ticDat.pt3["no", 6] = ticDat.pt3[1, "no"] = {}
            ticDat.pt4["no"] = 6
            ticDat.pt4["nono"] = 6.01
            if include_pt5:
                ticDat.pt5.append(("no", 6))
                ticDat.pt5.append((1, "no"))

        # first pass: bad rows in pt3 and pt4 only
        add_orphans(include_pt5=False)
        panDat = to_pan_dat(ticDat)
        fails1 = pdf.find_foreign_key_failures(panDat)
        self.assertTrue(fails1)
        pdf.remove_foreign_key_failures(panDat)
        self.assertFalse(pdf.find_foreign_key_failures(panDat))
        self.assertTrue(pdf._same_data(panDat, to_pan_dat(origDat)))

        # second pass: bad rows in pt5 as well, so strictly more failures
        add_orphans(include_pt5=True)
        panDat = to_pan_dat(ticDat)
        fails2 = pdf.find_foreign_key_failures(panDat)
        self.assertTrue(set(fails1) < set(fails2))
        pdf.remove_foreign_key_failures(panDat)
        self.assertFalse(pdf.find_foreign_key_failures(panDat))
        final_lens = {t: len(getattr(panDat, t)) for t in tdf.all_tables}
        self.assertTrue(final_lens == orig_lens)
Пример #9
0
 def test_missing_tables(self):
     """Read files that lack one of the reading factory's tables.

     Data holding only the "this" table is written by a one-table factory
     and read back by a factory that also declares a "that" table. The
     result must still match the original as far as the one-table factory
     is concerned.
     """
     pdf_1 = PanDatFactory(this=[["Something"], ["Another"]])
     wider_schema = dict(pdf_1.schema(), that=[["What", "Ever"], []])
     pdf_2 = PanDatFactory(**wider_schema)
     dat = pdf_1.PanDat(this={
         "Something": ["a", "b", "c"],
         "Another": [2, 3, 5]
     })

     base = os.path.join(_scratchDir, "missing_tables")
     for fmt, suffix in (("sql", ".db"), ("csv", "_csv"),
                         ("json", ".json"), ("xls", ".xlsx")):
         target = base + suffix
         writer = getattr(pdf_1, fmt)
         # csv is written as a directory, the other formats as one file
         if fmt == "csv":
             writer.write_directory(dat, target)
         else:
             writer.write_file(dat, target)
         reloaded = getattr(pdf_2, fmt).create_pan_dat(target)
         self.assertTrue(pdf_1._same_data(dat, reloaded))
Пример #10
0
    def testBasicFKs(self):
        """Exercise foreign key failure detection and removal.

        A plants/lines/products schema with several foreign keys is built
        and checked three times: on a plain clone of the factory, on the
        factory itself, and on a clone made with an explicit list of all
        its tables. Each pass adds rows that break specific foreign keys,
        checks exactly those keys are reported, and checks that removing
        the failures restores the good data.
        """
        for cloning in [True, False, "*"]:

            def clone_me_maybe(x):
                # The table list comes from the factory being cloned, so
                # this helper does not rely on a name that is only bound
                # later in the loop body.
                if not cloning:
                    return x
                return x.clone(x.all_tables if cloning == "*" else None)

            pdf = PanDatFactory(plants = [["name"], ["stuff", "otherstuff"]],
                                lines = [["name"], ["plant", "weird stuff"]],
                                line_descriptor = [["name"], ["booger"]],
                                products = [["name"],["gover"]],
                                production = [["line", "product"], ["min", "max"]],
                                pureTestingTable = [[], ["line", "plant", "product", "something"]],
                                extraProduction = [["line", "product"], ["extramin", "extramax"]],
                                weirdProduction = [["line1", "line2", "product"], ["weirdmin", "weirdmax"]])
            pdf.add_foreign_key("production", "lines", ("line", "name"))
            pdf.add_foreign_key("production", "products", ("product", "name"))
            pdf.add_foreign_key("lines", "plants", ("plant", "name"))
            pdf.add_foreign_key("line_descriptor", "lines", ("name", "name"))
            # every pureTestingTable field except "something" references the
            # table named after it ("line" -> "lines", and so on)
            for f in set(pdf.data_fields["pureTestingTable"]).difference({"something"}):
                pdf.add_foreign_key("pureTestingTable", "%ss"%f, (f,"name"))
            pdf.add_foreign_key("extraProduction", "production", (("line", "line"), ("product","product")))
            pdf.add_foreign_key("weirdProduction", "production", (("line1", "line"), ("product","product")))
            pdf.add_foreign_key("weirdProduction", "extraProduction", (("line2","line"), ("product","product")))
            self._testPdfReproduction(pdf)
            pdf = clone_me_maybe(pdf)

            tdf = TicDatFactory(**pdf.schema())
            goodDat = tdf.TicDat()
            goodDat.plants["Cleveland"] = ["this", "that"]
            goodDat.plants["Newark"]["otherstuff"] =1
            goodDat.products["widgets"] = goodDat.products["gadgets"] = "shizzle"

            for i, plant in enumerate(goodDat.plants):
                goodDat.lines[i]["plant"] = plant

            # "product" rather than "pd", which would shadow the usual
            # pandas alias
            for i, (line, product) in enumerate(itertools.product(goodDat.lines, goodDat.products)):
                goodDat.production[line, product] = {"min":1, "max":10+i}

            badDat1 = tdf.copy_tic_dat(goodDat)
            badDat1.production["notaline", "widgets"] = [0,1]
            badDat2 = tdf.copy_tic_dat(badDat1)

            def pan_dat_(_):
                # convert to a PanDat and insist it has no duplicate rows
                rtn = pdf.copy_pan_dat(copy_to_pandas_with_reset(tdf, _))
                self.assertFalse(pdf.find_duplicates(rtn))
                return rtn
            fk, fkm = ForeignKey, ForeignKeyMapping
            fk_fails1 = pdf.find_foreign_key_failures(pan_dat_(badDat1))
            fk_fails2 = pdf.find_foreign_key_failures(pan_dat_(badDat2))

            self.assertTrue(set(fk_fails1) == set(fk_fails2) ==
                            {fk('production', 'lines', fkm('line', 'name'), 'many-to-one')})
            self.assertTrue(set(pdf.find_foreign_key_failures(pan_dat_(badDat1), verbosity="Low")) ==
                            set(pdf.find_foreign_key_failures(pan_dat_(badDat2), verbosity="Low")) ==
                             {('production', 'lines', ('line', 'name'))})
            for row_fails in [next(iter(_.values())) for _ in [fk_fails1, fk_fails2]]:
                self.assertTrue(set(row_fails["line"]) == {"notaline"} and set(row_fails["product"]) == {"widgets"})

            # adding the missing line moves the failure to lines -> plants
            badDat1.lines["notaline"]["plant"] = badDat2.lines["notaline"]["plant"] = "notnewark"
            fk_fails1 = pdf.find_foreign_key_failures(pan_dat_(badDat1))
            fk_fails2 = pdf.find_foreign_key_failures(pan_dat_(badDat2))
            self.assertTrue(set(fk_fails1) == set(fk_fails2) ==
                            {fk('lines', 'plants', fkm('plant', 'name'), 'many-to-one')})
            for row_fails in [next(iter(_.values())) for _ in [fk_fails1, fk_fails2]]:
                self.assertTrue(set(row_fails["name"]) == {"notaline"} and set(row_fails["plant"]) == {"notnewark"})

            for bad in [badDat1, badDat2]:
                bad_pan = pdf.remove_foreign_key_failures(pan_dat_(bad))
                self.assertFalse(pdf.find_foreign_key_failures(bad_pan))
                self.assertTrue(pdf._same_data(bad_pan, pan_dat_(goodDat)))

            line_count = len(goodDat.lines)
            for i, plant in enumerate(list(goodDat.plants.keys()) + list(goodDat.plants.keys())):
                goodDat.lines[i+line_count]["plant"] = plant
            # Give every other line a descriptor. The index must come from
            # this loop; reusing the previous loop's final index would make
            # the odd/even filter a constant.
            for i, line in enumerate(goodDat.lines):
                if i%2:
                    goodDat.line_descriptor[line] = i+10

            for i,(line,plant,product) in enumerate(sorted(itertools.product(goodDat.lines, goodDat.plants, goodDat.products))):
                goodDat.pureTestingTable.append((line,plant,product,i))
            self.assertFalse(pdf.find_foreign_key_failures(pan_dat_(goodDat)))
            badDat = tdf.copy_tic_dat(goodDat)
            badDat.pureTestingTable.append(("j", "u", "nk", "ay"))
            fk_fails = pdf.find_foreign_key_failures(pan_dat_(badDat))
            self.assertTrue(set(fk_fails) ==
                {fk('pureTestingTable', 'plants', fkm('plant', 'name'), 'many-to-one'),
                 fk('pureTestingTable', 'products', fkm('product', 'name'), 'many-to-one'),
                 fk('pureTestingTable', 'lines', fkm('line', 'name'), 'many-to-one')})

            # each failure frame's first row is the junk row that was added
            for df in fk_fails.values():
                df = df.T
                c = df.columns[0]
                self.assertTrue({'ay', 'j', 'nk', 'u'} == set(df[c]))
Пример #11
0
    def testXToMany(self):
        """Check foreign key failures for many-to-many and one-to-many keys.

        First a roster/positions/innings schema whose position_constraints
        table references the other three on non-key fields (many-to-many),
        then a two-table schema with a one-to-many key. In both cases bad
        rows are added, the reported failures are checked, and removing
        them must restore the original data.
        """
        def assert_removal_restores(schema, original, modified):
            schema.remove_foreign_key_failures(modified)
            self.assertFalse(schema.find_foreign_key_failures(modified))
            self.assertTrue(schema._same_data(original, modified))

        # ---- many-to-many ----
        input_schema = PanDatFactory(
            roster=[["Name"],
                    ["Grade", "Arrival Inning", "Departure Inning",
                     "Min Innings Played", "Max Innings Played"]],
            positions=[["Position"],
                       ["Position Importance", "Position Group",
                        "Consecutive Innings Only"]],
            innings=[["Inning"], ["Inning Group"]],
            position_constraints=[["Position Group", "Inning Group", "Grade"],
                                  ["Min Players", "Max Players"]])
        for parent, field in (("roster", "Grade"),
                              ("positions", "Position Group"),
                              ("innings", "Inning Group")):
            input_schema.add_foreign_key("position_constraints", parent,
                                         [field, field])
        cardinalities = {fk.cardinality for fk in input_schema.foreign_keys}
        self.assertTrue(cardinalities == {"many-to-many"})

        tdf = TicDatFactory(**input_schema.schema())
        dat = tdf.TicDat()
        players = ["bob", "joe", "fred", "alice", "lisa", "joean", "ginny"]
        for i, player in enumerate(players):
            dat.roster[player]["Grade"] = (i%3)+1
        dat.roster["dummy"]["Grade"]  = "whatevers"
        spots = ["pitcher", "catcher", "1b", "2b", "ss", "3b", "lf", "cf", "rf"]
        for i, spot in enumerate(spots):
            dat.positions[spot]["Position Group"] = "PG %s"%((i%4)+1)
        for inning in range(1, 10):
            if inning < 7:
                dat.innings[inning]["Inning Group"] = "before stretch"
            else:
                dat.innings[inning]["Inning Group"] = "after stretch"
        dat.innings[0] ={}
        position_groups = ["PG %s"%i for i in range(1,5)]
        inning_groups = ["before stretch", "after stretch"]
        for pg, ig, g in itertools.product(position_groups, inning_groups,
                                           [1, 2, 3]):
            dat.position_constraints[pg, ig, g] = {}

        orig_pan_dat = input_schema.copy_pan_dat(
            copy_to_pandas_with_reset(tdf, dat))
        self.assertFalse(input_schema.find_foreign_key_failures(orig_pan_dat))

        dat.position_constraints["no", "no", "no"] = dat.position_constraints[1, 2, 3] = {}
        new_pan_dat = input_schema.copy_pan_dat(
            copy_to_pandas_with_reset(tdf, dat))
        self.assertFalse(input_schema._same_data(orig_pan_dat, new_pan_dat))

        # the three reporting styles must agree on failures per foreign key
        fk_fails = input_schema.find_foreign_key_failures(new_pan_dat)
        fk_fails_2 = input_schema.find_foreign_key_failures(
            new_pan_dat, verbosity="Low")
        fk_fails_3 = input_schema.find_foreign_key_failures(
            new_pan_dat, verbosity="Low", as_table=False)
        full_counts = {tuple(k)[:2] + (tuple(k[2]),): len(v)
                       for k, v in fk_fails.items()}
        low_counts = {k: len(v) for k, v in fk_fails_2.items()}
        flag_counts = {k: v.count(True) for k, v in fk_fails_3.items()}
        expected = {
            ('position_constraints', 'innings', ("Inning Group", "Inning Group")): 2,
            ('position_constraints', 'positions', ("Position Group", "Position Group")): 2,
            ('position_constraints', 'roster', ("Grade", "Grade")): 1}
        self.assertTrue(full_counts == low_counts == flag_counts == expected)
        assert_removal_restores(input_schema, orig_pan_dat, new_pan_dat)

        # ---- one-to-many ----
        input_schema = PanDatFactory(table_one=[["One", "Two"], []],
                                     table_two=[["One"], ["Two"]])
        input_schema.add_foreign_key("table_two", "table_one", ["One", "One"])
        cardinalities = {fk.cardinality for fk in input_schema.foreign_keys}
        self.assertTrue(cardinalities == {"one-to-many"})

        tdf = TicDatFactory(**input_schema.schema())
        dat = tdf.TicDat(table_one = [[1,2], [3,4], [5,6], [7,8]],
                         table_two = {1:2, 3:4, 5:6})
        orig_pan_dat = input_schema.copy_pan_dat(
            copy_to_pandas_with_reset(tdf, dat))
        self.assertFalse(input_schema.find_foreign_key_failures(orig_pan_dat))

        dat.table_two[9]=10
        new_pan_dat = input_schema.copy_pan_dat(
            copy_to_pandas_with_reset(tdf, dat))
        fk_fails = input_schema.find_foreign_key_failures(new_pan_dat)
        table_pair_counts = {tuple(k)[:2]:len(v) for k,v in fk_fails.items()}
        self.assertTrue(table_pair_counts == {('table_two', 'table_one'): 1})
        assert_removal_restores(input_schema, orig_pan_dat, new_pan_dat)
Пример #12
0
    def testRoundTrips(self):
        """Check TicDat -> PanDat -> TicDat conversions.

        Covers the diet and netflow models, a frame with duplicated primary
        keys, and tables holding None in key fields and in data fields.
        Does nothing when self.canRun is false.
        """
        if not self.canRun:
            return

        def check_model(tdf, make_data):
            # freeze the model data, push it through pandas, and compare
            frozen = tdf.freeze_me(
                tdf.TicDat(**{
                    t: getattr(make_data(), t)
                    for t in tdf.primary_key_fields
                }))
            full_schema = tdf.schema(include_ancillary_info=True)
            model_pdf = PanDatFactory.create_from_full_schema(full_schema)
            as_pandas = tdf.copy_to_pandas(frozen, drop_pk_columns=False)
            self.assertTrue(model_pdf.good_pan_dat_object(as_pandas))
            back = model_pdf.copy_to_tic_dat(as_pandas)
            self.assertTrue(tdf._same_data(frozen, back))

        diet_tdf = TicDatFactory(**dietSchema())
        diet_tdf.enable_foreign_key_links()
        check_model(diet_tdf, dietData)

        netflow_tdf = TicDatFactory(**netflowSchema())
        netflow_tdf.enable_foreign_key_links()
        addNetflowForeignKeys(netflow_tdf)
        check_model(netflow_tdf, netflowData)

        # duplicated (a, b) keys: two frame rows share a key, and the
        # converted TicDat ends up one row shorter than the frame
        pdf = PanDatFactory(table=[["a", "b"], ["c"]])
        frame = utils.DataFrame({
            "a": [1, 2, 1, 1],
            "b": [10, 10, 10, 11],
            "c": [101, 102, 103, 104]
        })
        pan_dat = pdf.PanDat(table=frame)
        duplicates = pdf.find_duplicates(pan_dat, keep=False)
        self.assertTrue(len(duplicates["table"]) == 2)
        tic_dat = pdf.copy_to_tic_dat(pan_dat)
        self.assertTrue(len(tic_dat.table) == len(pan_dat.table) - 1)

        # None inside a primary key field does not survive the round trip
        tdf = TicDatFactory(**pdf.schema())
        tic_dat = tdf.TicDat(table=[[1, 2, 3], [None, 2, 3], [2, 1, None]])
        self.assertTrue(len(tic_dat.table) == 3)
        via_pandas = pdf.copy_to_tic_dat(
            tdf.copy_to_pandas(tic_dat, drop_pk_columns=False))
        self.assertFalse(tdf._same_data(tic_dat, via_pandas))
        nan = float("nan")
        with_nans = tdf.TicDat(table=[[1, 2, 3], [nan, 2, 3], [2, 1, nan]])
        # this fails because _same_data isn't smart enough to check against
        # nan in the keys, because float("nan") != float("nan")
        self.assertFalse(tdf._same_data(with_nans, via_pandas))

        # None only in data fields
        pdf = PanDatFactory(table=[["a"], ["b", "c"]])
        tdf = TicDatFactory(**pdf.schema())
        tic_dat = tdf.TicDat(table=[[1, 2, 3], [2, None, 3], [2, 1, None]])
        via_pandas = pdf.copy_to_tic_dat(
            tdf.copy_to_pandas(tic_dat, drop_pk_columns=False))
        self.assertFalse(tdf._same_data(tic_dat, via_pandas))
        nan = float("nan")
        with_nans = tdf.TicDat(table=[[1, 2, 3], [2, nan, 3], [2, 1, nan]])
        # with nans_are_same_for_data_rows=True, _same_data does treat nan
        # values in data rows as equal
        nan_aware_match = tdf._same_data(with_nans,
                                         via_pandas,
                                         nans_are_same_for_data_rows=True)
        self.assertTrue(nan_aware_match)
Пример #13
0
    def testDefaultAdd(self):
        """Check reading files that lack a field the reading factory declares.

        The diet data is written as xlsx, sql, csv and json. A factory whose
        "foods" table has an additional "extra" field must raise when
        reading any of them, unless fill_missing_fields=True, in which case
        "extra" is filled with 0, or with 13 once that is set as the
        field's default value. Does nothing when self.can_run is false.
        """
        if not self.can_run:
            return
        tdf = TicDatFactory(**dietSchema())
        pdf = PanDatFactory(**dietSchema())
        ticDat = tdf.freeze_me(
            tdf.TicDat(**{
                t: getattr(dietData(), t)
                for t in tdf.primary_key_fields
            }))
        panDat = pan_dat_maker(dietSchema(), ticDat)

        xlsFilePath = os.path.join(_scratchDir, "diet_add.xlsx")
        pdf.xls.write_file(panDat, xlsFilePath)
        sqlFilePath = os.path.join(_scratchDir, "diet_add.sql")
        pdf.sql.write_file(panDat, sqlFilePath)
        csvDirPath = os.path.join(_scratchDir, "diet_add_csv")
        pdf.csv.write_directory(panDat,
                                csvDirPath,
                                case_space_table_names=True)

        # same schema, but "foods" gains an "extra" data field
        extended_schema = {}
        for table, (pk_fields, data_fields) in dietSchema().items():
            if table == "foods":
                extended_schema[table] = [pk_fields,
                                          list(data_fields) + ["extra"]]
            else:
                extended_schema[table] = [pk_fields, data_fields]
        pdf2 = PanDatFactory(**extended_schema)

        # without fill_missing_fields every reader must complain
        failing_reads = (
            lambda: pdf2.xls.create_pan_dat(xlsFilePath),
            lambda: pdf2.sql.create_pan_dat(sqlFilePath),
            lambda: pdf2.csv.create_pan_dat(csvDirPath),
            lambda: pdf2.json.create_pan_dat(pdf.json.write_file(panDat, "")),
        )
        for read in failing_reads:
            message = self.firesException(read)
            self.assertTrue("missing" in message and "extra" in message)

        def check_filled(factory, expected_fill):
            # (format, source getter, extra _same_data keywords); the json
            # source is produced on demand by pdf.json.write_file
            attempts = (
                ("sql", lambda: sqlFilePath, {}),
                ("xls", lambda: xlsFilePath, {}),
                ("csv", lambda: csvDirPath, {}),
                ("json", lambda: pdf.json.write_file(panDat, ""),
                 {"epsilon": 1e-5}),
            )
            for fmt, get_source, compare_kwargs in attempts:
                filled = getattr(factory, fmt).create_pan_dat(
                    get_source(), fill_missing_fields=True)
                self.assertTrue(set(filled.foods["extra"]) == {expected_fill})
                # once the filled column is dropped the data must match
                filled.foods.drop("extra", axis=1, inplace=True)
                self.assertTrue(
                    pdf._same_data(panDat, filled, **compare_kwargs))

        check_filled(pdf2, 0)

        pdf3 = PanDatFactory(**pdf2.schema())
        pdf3.set_default_value("foods", "extra", 13)
        check_filled(pdf3, 13)