Example #1
    def _test_generic_copy(self, ticDat, tdf, skip_tables=None):
        assert all(tdf.primary_key_fields.get(t) for t in tdf.all_tables)
        path = makeCleanDir(os.path.join(_scratchDir, "generic_copy"))
        replace_name = lambda f: "name_" if f == "name" else f
        clean_tdf = TicDatFactory(
            **{
                t: [list(map(replace_name, pks)), dfs]
                for t, (pks, dfs) in tdf.schema().items()
            })

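        # every table not in skip_tables becomes a generic ('*') table; skipped tables keep their real schema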
        temp_tdf = TicDatFactory(
            **{
                t: v if t in (skip_tables or []) else '*'
                for t, v in clean_tdf.schema().items()
            })
        temp_dat = temp_tdf.TicDat(
            **{t: getattr(ticDat, t)
               for t in (skip_tables or [])})
        for t in temp_tdf.generic_tables:
            setattr(
                temp_dat, t,
                getattr(
                    clean_tdf.copy_to_pandas(ticDat, drop_pk_columns=False),
                    t))

        temp_tdf.sql.write_db_data(temp_dat, os.path.join(path, "f.db"))
        temp_tdf.sql.write_sql_file(temp_dat,
                                    os.path.join(path, "f1.sql"),
                                    include_schema=False)
        temp_tdf.sql.write_sql_file(temp_dat,
                                    os.path.join(path, "f2.sql"),
                                    include_schema=True)

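        # round-trip through a SQLite .db plus schema-less and schema-ful .sql dumps, then rebuild a conventional TicDat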
        for file_name, includes_schema in [("f.db", False), ("f1.sql", False),
                                           ("f2.sql", True)]:
            file_path = os.path.join(path, file_name)
            if file_path.endswith(".db"):
                self.assertFalse(temp_tdf.sql.find_duplicates(file_path))
                read_dat = temp_tdf.sql.create_tic_dat(file_path)
            else:
                read_dat = temp_tdf.sql.create_tic_dat_from_sql(
                    file_path, includes_schema)
            generic_free_dat, _ = utils.create_generic_free(read_dat, temp_tdf)
            check_dat = clean_tdf.TicDat()
            for t in temp_tdf.generic_tables:
                for r in getattr(generic_free_dat, t):
                    pks = clean_tdf.primary_key_fields[t]
                    getattr(check_dat, t)[r[pks[0]] if len(pks) == 1 else tuple(r[_] for _ in pks)] = \
                        {df:r[df] for df in clean_tdf.data_fields.get(t, [])}
            for t in (skip_tables or []):
                for k, v in getattr(generic_free_dat, t).items():
                    getattr(check_dat, t)[k] = v
            self.assertTrue(
                clean_tdf._same_data(check_dat,
                                     clean_tdf.copy_tic_dat(ticDat)))
Example #2
    def testDiet(self):
        if not self.can_run:
            return
        for verbose in [True, False]:
            tdf = TicDatFactory(**dietSchema())
            ticDat = tdf.freeze_me(
                tdf.TicDat(**{
                    t: getattr(dietData(), t)
                    for t in tdf.primary_key_fields
                }))
            writePath = os.path.join(
                makeCleanDir(os.path.join(_scratchDir, "diet")), "file.json")
            tdf.json.write_file(ticDat, writePath, verbose=verbose)
            self.assertFalse(tdf.json.find_duplicates(writePath))
            jsonTicDat = tdf.json.create_tic_dat(writePath)
            self.assertTrue(tdf._same_data(ticDat, jsonTicDat))

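            # create_tic_dat returns mutable data by default; re-reading with freeze_it=True makes the same edit raise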
            def change():
                jsonTicDat.categories["calories"]["minNutrition"] = 12

            self.assertFalse(firesException(change))
            self.assertFalse(tdf._same_data(ticDat, jsonTicDat))
            jsonTicDat = tdf.json.create_tic_dat(writePath, freeze_it=True)
            self.assertTrue(firesException(change))
            self.assertTrue(tdf._same_data(ticDat, jsonTicDat))

        tdf2 = TicDatFactory(**dietSchemaWeirdCase())
        dat2 = copyDataDietWeirdCase(ticDat)
        tdf2.json.write_file(dat2,
                             writePath,
                             allow_overwrite=True,
                             verbose=verbose)
        jsonTicDat2 = tdf.json.create_tic_dat(writePath, freeze_it=True)
        self.assertTrue(tdf._same_data(ticDat, jsonTicDat2))

        tdf3 = TicDatFactory(**dietSchemaWeirdCase2())
        dat3 = copyDataDietWeirdCase2(ticDat)
        tdf3.json.write_file(dat3,
                             writePath,
                             allow_overwrite=True,
                             verbose=verbose)
        with open(writePath, "r") as f:
            jdict = json.load(f)
        jdict["nutrition quantities"] = jdict["nutrition_quantities"]
        del jdict["nutrition_quantities"]
        with open(writePath, "w") as f:
            json.dump(jdict, f)
        jsonDat3 = tdf3.json.create_tic_dat(writePath)
        self.assertTrue(tdf3._same_data(dat3, jsonDat3))
        jdict["nutrition_quantities"] = jdict["nutrition quantities"]
        with open(writePath, "w") as f:
            json.dump(jdict, f)
        self.assertTrue(
            self.firesException(lambda: tdf3.json.create_tic_dat(writePath)))
Example #3
    def testSilly(self):
        if not self.can_run:
            return
        tdf = TicDatFactory(**sillyMeSchema())
        ticDat = tdf.TicDat(**sillyMeData())
        schema2 = sillyMeSchema()
        schema2["b"][0] = ("bField2", "bField1", "bField3")
        schema3 = sillyMeSchema()
        schema3["a"][1] = ("aData2", "aData3", "aData1")
        schema4 = sillyMeSchema()
        schema4["a"][1] = ("aData1", "aData3")
        schema5 = sillyMeSchema()
        _tuple = lambda x: tuple(x) if utils.containerish(x) else (x,)
        for t in ("a", "b"):
            schema5[t][1] = _tuple(schema5[t][1]) + _tuple(schema5[t][0])
        schema5["a"][0], schema5["b"][0] = (), []
        schema6 = sillyMeSchema()
        schema6["d"] = [["dField"], ()]

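        # schema2 reorders b's primary keys, schema3 reorders a's data fields, schema4 drops one of a's data fields,
        # schema5 turns every field into a data field (keyless), and schema6 adds a table d the file won't contain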
        tdf2, tdf3, tdf4, tdf5, tdf6 = (TicDatFactory(**x) for x in (schema2, schema3, schema4, schema5, schema6))
        tdf5.set_generator_tables(("a","c"))
        tdf5 = tdf5.clone()
        filePath = os.path.join(_scratchDir, "silly.db")
        tdf.sql.write_db_data(ticDat, filePath)
        self.assertFalse(tdf.sql.find_duplicates(filePath))

        ticDat2 = tdf2.sql.create_tic_dat(filePath)
        self.assertFalse(tdf._same_data(ticDat, ticDat2))

        ticDat3 = tdf3.sql.create_tic_dat(filePath)
        self.assertTrue(tdf._same_data(ticDat, ticDat3))

        ticDat4 = tdf4.sql.create_tic_dat(filePath)
        for t in ["a","b"]:
            for k,v in getattr(ticDat4, t).items() :
                for _k, _v in v.items() :
                    self.assertTrue(getattr(ticDat, t)[k][_k] == _v)
                if set(v) == set(getattr(ticDat, t)[k]) :
                    self.assertTrue(t == "b")
                else :
                    self.assertTrue(t == "a")

        ticDat5 = tdf5.sql.create_tic_dat(filePath)
        self.assertTrue(tdf5._same_data(tdf._keyless(ticDat), ticDat5))
        self.assertTrue(callable(ticDat5.a) and callable(ticDat5.c) and not callable(ticDat5.b))

        self.assertTrue("table d" in self.firesException(lambda  : tdf6.sql.create_tic_dat(filePath)))
        ticDat.a["theboger"] = (1, None, 12)
        if am_on_windows:
            filePath = filePath.replace("silly.db", "silly_2.db") # working around issue opalytics/opalytics-ticdat#153
        tdf.sql.write_db_data(ticDat, makeCleanPath(filePath))
        ticDatNone = tdf.sql.create_tic_dat(filePath, freeze_it=True)
        self.assertTrue(tdf._same_data(ticDat, ticDatNone))
        self.assertTrue(ticDatNone.a["theboger"]["aData2"] is None)
Example #4
    def testMissingTable(self):
        if not self.can_run:
            return
        tdf = TicDatFactory(**dietSchema())
        ticDat = tdf.freeze_me(tdf.copy_tic_dat(dietData()))
        inputset = create_inputset_mock(tdf, ticDat)

        tdf2 = TicDatFactory(**(dict(dietSchema(), missing_table=[["a"],["b"]])))
        ticDat2 = tdf2.opalytics.create_tic_dat(inputset)
        self.assertTrue(tdf._same_data(ticDat, ticDat2))
        self.assertFalse(ticDat2.missing_table)
Example #5
 def testDups(self):
     if not self.can_run:
         return
     for hack, raw_data in product([True, False], repeat=2):
         tdf = TicDatFactory(one = [["a"],["b", "c"]],
                             two = [["a", "b"],["c"]],
                             three = [["a", "b", "c"],[]])
         tdf2 = TicDatFactory(**{t:[[],["a", "b", "c"]] for t in tdf.all_tables})
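         # tdf2 has no primary keys, so it can hold the duplicate rows that tdf's keyed schema will flag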
         td = tdf2.TicDat(**{t:[[1, 2, 1], [1, 2, 2], [2, 1, 3], [2, 2, 3], [1, 2, 2], ["new", 1, 2]]
                             for t in tdf.all_tables})
         dups = tdf.opalytics.find_duplicates(create_inputset_mock(tdf2, td, hack), raw_data=raw_data)
         self.assertTrue(dups == {'three': {(1, 2, 2): 2}, 'two': {(1, 2): 3}, 'one': {1: 3, 2: 2}})
Example #6
 def test_missing_tables(self):
     path = os.path.join(_scratchDir, "missing")
     tdf_1 = TicDatFactory(this=[["Something"], ["Another"]])
     tdf_2 = TicDatFactory(
         **dict(tdf_1.schema(), that=[["What", "Ever"], []]))
     dat = tdf_1.TicDat(this=[["a", 2], ["b", 3], ["c", 5]])
     tdf_1.sql.write_sql_file(dat, path + ".sql")
     sql_dat = tdf_2.sql.create_tic_dat_from_sql(path + ".sql")
     self.assertTrue(tdf_1._same_data(dat, sql_dat))
     tdf_1.sql.write_db_data(dat, path + ".db")
     sql_dat = tdf_2.sql.create_tic_dat(path + ".db")
     self.assertTrue(tdf_1._same_data(dat, sql_dat))
Example #7
    def testBooleansAndNulls(self):
        tdf = TicDatFactory(table=[["field one"], ["field two"]])
        dat = tdf.TicDat(table=[[None, 100], [200, True], [False, 300],
                                [300, None], [400, False]])
        file_one = os.path.join(_scratchDir, "boolDefaults_1.json")
        file_two = os.path.join(_scratchDir, "boolDefaults_2.json")
        tdf.json.write_file(dat, file_one, verbose=True)
        tdf.json.write_file(dat, file_two, verbose=False)
        dat_1 = tdf.json.create_tic_dat(file_one)
        dat_2 = tdf.json.create_tic_dat(file_two)
        self.assertTrue(tdf._same_data(dat, dat_1))
        self.assertTrue(tdf._same_data(dat, dat_2))

        tdf = TicDatFactory(table=[["field one"], ["field two"]])
        for f in ["field one", "field two"]:
            tdf.set_data_type("table", f, max=float("inf"), inclusive_max=True)
        tdf.set_infinity_io_flag(None)
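        # with infinity_io_flag None, file nulls read back as +/-inf when the field's data type allows an infinite value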
        dat_inf = tdf.TicDat(
            table=[[float("inf"), 100], [200, True], [False, 300],
                   [300, float("inf")], [400, False]])
        dat_1 = tdf.json.create_tic_dat(file_one)
        dat_2 = tdf.json.create_tic_dat(file_two)
        self.assertTrue(tdf._same_data(dat_inf, dat_1))
        self.assertTrue(tdf._same_data(dat_inf, dat_2))
        tdf.json.write_file(dat_inf,
                            file_one,
                            verbose=True,
                            allow_overwrite=True)
        tdf.json.write_file(dat_inf,
                            file_two,
                            verbose=False,
                            allow_overwrite=True)
        dat_1 = tdf.json.create_tic_dat(file_one)
        dat_2 = tdf.json.create_tic_dat(file_two)
        self.assertTrue(tdf._same_data(dat_inf, dat_1))
        self.assertTrue(tdf._same_data(dat_inf, dat_2))

        tdf = TicDatFactory(table=[["field one"], ["field two"]])
        for f in ["field one", "field two"]:
            tdf.set_data_type("table",
                              f,
                              min=-float("inf"),
                              inclusive_min=True)
        tdf.set_infinity_io_flag(None)
        dat_1 = tdf.json.create_tic_dat(file_one)
        dat_2 = tdf.json.create_tic_dat(file_two)
        self.assertFalse(tdf._same_data(dat_inf, dat_1))
        self.assertFalse(tdf._same_data(dat_inf, dat_2))
        dat_inf = tdf.TicDat(
            table=[[float("-inf"), 100], [200, True], [False, 300],
                   [300, -float("inf")], [400, False]])
        self.assertTrue(tdf._same_data(dat_inf, dat_1))
        self.assertTrue(tdf._same_data(dat_inf, dat_2))
Example #8
    def testDataTypes(self):
        if not self.canRun:
            return
        tdf = TicDatFactory(**dietSchema())
        pdf = PanDatFactory(**dietSchema())

        ticdat = tdf.TicDat()
        ticdat.foods["a"] = 12
        ticdat.foods["b"] = None
        ticdat.categories["1"] = {"maxNutrition":100, "minNutrition":40}
        ticdat.categories["2"] = [10,20]
        for f, p in itertools.product(ticdat.foods, ticdat.categories):
            ticdat.nutritionQuantities[f,p] = 5
        ticdat.nutritionQuantities['a', 2] = 12
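        # note: the int key makes ('a', 2) a distinct row from the ('a', '2') created by the product above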

        pandat = pdf.copy_pan_dat(copy_to_pandas_with_reset(tdf, ticdat))

        self.assertFalse(pdf.find_data_type_failures(pandat))

        pdf = PanDatFactory(**dietSchema())
        pdf.set_data_type("foods", "cost", nullable=False)
        pdf.set_data_type("nutritionQuantities", "qty", min=5, inclusive_min=False, max=12, inclusive_max=True)
        failed = pdf.find_data_type_failures(pandat)
        self.assertTrue(set(failed) == {('foods', 'cost'), ('nutritionQuantities', 'qty')})
        self.assertTrue(set(failed['foods', 'cost']["name"]) == {'b'})
        self.assertTrue({(v["food"], v["category"])
                         for v in failed['nutritionQuantities', 'qty'].T.to_dict().values()} ==
                        {('b', '1'), ('a', '2'), ('a', '1'), ('b', '2')})

        failed = pdf.find_data_type_failures(pandat, as_table=False)
        self.assertTrue(4 == failed['nutritionQuantities', 'qty'].value_counts()[True])

        tdf = TicDatFactory(**netflowSchema())
        tdf.enable_foreign_key_links()
        addNetflowForeignKeys(tdf)
        pdf = PanDatFactory(**netflowSchema())
        ticdat = tdf.copy_tic_dat(netflowData())
        for n in ticdat.nodes["Detroit"].arcs_source:
            ticdat.arcs["Detroit", n] = n
        pandat = pdf.copy_pan_dat(copy_to_pandas_with_reset(tdf, ticdat))
        self.assertFalse(pdf.find_data_type_failures(pandat))

        pdf = PanDatFactory(**netflowSchema())
        pdf.set_data_type("arcs", "capacity", strings_allowed="*")
        self.assertFalse(pdf.find_data_type_failures(pandat))

        pdf = PanDatFactory(**netflowSchema())
        pdf.set_data_type("arcs", "capacity", strings_allowed=["Boston", "Seattle", "lumberjack"])
        failed = pdf.find_data_type_failures(pandat)
        self.assertTrue(set(failed) == {('arcs', 'capacity')})
        self.assertTrue({(v["source"], v["destination"])
                         for v in failed['arcs', 'capacity'].T.to_dict().values()} == {("Detroit", "New York")})
Example #9
 def testDups(self):
     if not self.can_run:
         return
     tdf = TicDatFactory(one = [["a"],["b", "c"]],
                         two = [["a", "b"],["c"]],
                         three = [["a", "b", "c"],[]])
     tdf2 = TicDatFactory(**{t:[[],["a", "b", "c"]] for t in tdf.all_tables})
     td = tdf2.TicDat(**{t:[[1, 2, 1], [1, 2, 2], [2, 1, 3], [2, 2, 3], [1, 2, 2], ["new", 1, 2]]
                         for t in tdf.all_tables})
     f = makeCleanPath(os.path.join(_scratchDir, "testDups.db"))
     tdf2.sql.write_db_data(td, f)
     dups = tdf.sql.find_duplicates(f)
     self.assertTrue(dups ==  {'three': {(1, 2, 2): 2}, 'two': {(1, 2): 3}, 'one': {1: 3, 2: 2}})
Example #10
    def testIntHandling(self):
        if not self.can_run:
            return
        tdf = TicDatFactory(boger=[["the"], ["big", "boger"]],
                            moger=[["the", "big"], ["boger"]],
                            woger=[[], ["the", "big", "boger"]])
        for t in ["boger", "moger", "woger"]:
            tdf.set_data_type(t, "big", must_be_int=True)
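        # must_be_int tolerates whole-number floats like 1.0; the xls round trip below coerces them to true ints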
        dat = tdf.TicDat(
            boger={1: [1.0, "t"], "b": [12, 11.1], 12.1: [14.0, 15.0]},
            moger={(1, 1.0): "t", ("b", 12): 11.1, (12.1, 14.0): 15.0},
            woger=[(1, 1.0, "t"), ("b", 12, 11.1), (12.1, 14.0, 15.0)])
        filePath = os.path.join(_scratchDir, "intHandling.xls")
        tdf.xls.write_file(dat, filePath)
        dat2 = tdf.xls.create_tic_dat(filePath)

        tdf3 = TicDatFactory(boger=[["the"], ["big", "boger"]],
                             moger=[["the", "big"], ["boger"]],
                             woger=[[], ["the", "big", "boger"]])
        dat3 = tdf3.xls.create_tic_dat(filePath)
        self.assertFalse(
            any(map(tdf.find_data_type_failures, [dat, dat2, dat3])))
        self.assertTrue(all(tdf._same_data(dat, _) for _ in [dat2, dat3]))

        self.assertFalse(
            all(
                isinstance(r["big"], int)
                for r in list(dat.boger.values()) + list(dat.woger)))
        self.assertTrue(
            all(
                isinstance(r["big"], int)
                for r in list(dat2.boger.values()) + list(dat2.woger)))
        self.assertFalse(
            any(
                isinstance(r["big"], int)
                for r in list(dat3.boger.values()) + list(dat3.woger)))
        self.assertTrue(
            all(isinstance(_.woger[1]["big"], int) for _ in [dat, dat2]))

        self.assertFalse(all(isinstance(k[-1], int) for k in dat.moger))
        self.assertTrue(any(isinstance(k[-1], int) for k in dat.moger))
        self.assertTrue(all(isinstance(k[-1], int) for k in dat2.moger))
        self.assertFalse(any(isinstance(k[-1], int) for k in dat3.moger))
Example #11
    def testSqlSpaceyTwo(self):
        if not self.can_run:
            return
        self.assertTrue(pandatio.sql,
                        "this unit test requires SQLite installed")

        tdf = TicDatFactory(**spacesSchema())
        pdf = PanDatFactory(**spacesSchema())
        ticDat = tdf.TicDat(
            **{
                "a_table": {
                    1: [1, 2, "3"],
                    22.2: (12, 0.12, "something"),
                    0.23: (11, 12, "thirt")
                },
                "b_table": {
                    (1, 2, "foo"): 1,
                    (1012.22, 4, "0012"): 12
                },
                "c_table": (("this", 2, 3, 4), ("that", 102.212, 3, 5.5),
                            ("another", 5, 12.5, 24))
            })
        panDat = pan_dat_maker(spacesSchema(), ticDat)
        ext = ".db"
        filePath = os.path.join(_scratchDir, "spaces_2%s" % ext)
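        # write and read through an open connection (db_file_path=None) rather than a file path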
        with pandatio.sql.connect(filePath) as con:
            pdf.sql.write_file(panDat,
                               db_file_path=None,
                               con=con,
                               case_space_table_names=True)
        with pandatio.sql.connect(filePath) as con:
            panDat2 = pdf.sql.create_pan_dat(db_file_path=None, con=con)
        self.assertTrue(pdf._same_data(panDat, panDat2))

        tdf = TicDatFactory(**netflowSchema())
        pdf = PanDatFactory(**netflowSchema())
        ticDat = tdf.freeze_me(
            tdf.TicDat(
                **
                {t: getattr(netflowData(), t)
                 for t in tdf.primary_key_fields}))
        panDat = pan_dat_maker(netflowSchema(), ticDat)
        filePath = os.path.join(_scratchDir, "spaces_2_2%s" % ext)
        with pandatio.sql.connect(filePath) as con:
            pdf.sql.write_file(panDat,
                               db_file_path="",
                               con=con,
                               case_space_table_names=True)
        with pandatio.sql.connect(filePath) as con:
            panDat2 = pdf.sql.create_pan_dat(None, con)
        self.assertTrue(pdf._same_data(panDat, panDat2))
Example #12
 def testDietWithInfFlagging(self):
     tdf = TicDatFactory(**dietSchema())
     dat = tdf.copy_tic_dat(dietData())
     tdf.set_infinity_io_flag(999999999)
     path = os.path.join(_scratchDir, "dietInfFlag")
     tdf.csv.write_directory(dat, path)
     dat_1 = tdf.csv.create_tic_dat(path)
     self.assertTrue(tdf._same_data(dat, dat_1))
     tdf = tdf.clone()
     dat_1 = tdf.csv.create_tic_dat(path)
     self.assertTrue(tdf._same_data(dat, dat_1))
     tdf = TicDatFactory(**dietSchema())
     dat_1 = tdf.csv.create_tic_dat(path)
     self.assertFalse(tdf._same_data(dat, dat_1))
Example #13
    def testNetflow(self):
        if not self.can_run:
            return
        tdf = TicDatFactory(**netflowSchema())
        ticDat = tdf.TicDat(
            **{t: getattr(netflowData(), t)
               for t in tdf.primary_key_fields})
        self._test_generic_copy(ticDat, tdf)
        self._test_generic_copy(ticDat, tdf, ["arcs", "nodes"])
        dirPath = os.path.join(_scratchDir, "netflow")
        tdf.csv.write_directory(ticDat, dirPath)
        csvTicDat = tdf.csv.create_tic_dat(dirPath, freeze_it=True)
        self.assertFalse(tdf.csv.find_duplicates(dirPath))
        self.assertTrue(tdf._same_data(ticDat, csvTicDat))
        csvTicDat = tdf.csv.create_tic_dat(dirPath,
                                           freeze_it=True,
                                           headers_present=False)
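        # the header row gets read as data, so the round trip no longer matches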
        self.assertFalse(tdf._same_data(ticDat, csvTicDat))
        tdf.csv.write_directory(ticDat,
                                dirPath,
                                write_header=False,
                                allow_overwrite=True)
        self.assertTrue(
            self.firesException(
                lambda: tdf.csv.create_tic_dat(dirPath, freeze_it=True)))
        csvTicDat = tdf.csv.create_tic_dat(dirPath,
                                           headers_present=False,
                                           freeze_it=True)
        self.assertTrue(tdf._same_data(ticDat, csvTicDat))

        # the casting to floats is controlled by data types and default values
        ticDat.nodes[12] = {}
        tdf.csv.write_directory(ticDat, dirPath, allow_overwrite=True)
        csvTicDat = tdf.csv.create_tic_dat(dirPath, freeze_it=True)
        self.assertFalse(tdf._same_data(ticDat, csvTicDat))
        tdf2 = TicDatFactory(**netflowSchema())
        tdf2.set_data_type("nodes",
                           "name",
                           strings_allowed='*',
                           number_allowed=True)
        csvTicDat = tdf2.csv.create_tic_dat(dirPath, freeze_it=True)
        self.assertTrue(tdf._same_data(ticDat, csvTicDat))

        del ticDat.nodes[12]
        ticDat.nodes['12'] = {}
        self.assertTrue(
            firesException(lambda: tdf.csv.write_directory(ticDat, dirPath)))
        tdf.csv.write_directory(ticDat, dirPath, allow_overwrite=True)
        csvTicDat = tdf.csv.create_tic_dat(dirPath, freeze_it=True)
        self.assertTrue(tdf._same_data(ticDat, csvTicDat))
Example #14
    def testXlsSimple(self):
        if not self.can_run:
            return
        tdf = TicDatFactory(**dietSchema())
        pdf = PanDatFactory(**dietSchema())
        ticDat = tdf.freeze_me(
            tdf.TicDat(
                **{t: getattr(dietData(), t)
                   for t in tdf.primary_key_fields}))
        panDat = pan_dat_maker(dietSchema(), ticDat)
        filePath = os.path.join(_scratchDir, "diet.xlsx")
        pdf.xls.write_file(panDat, filePath)
        xlsPanDat = pdf.xls.create_pan_dat(filePath)
        self.assertTrue(pdf._same_data(panDat, xlsPanDat))

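        # a factory that omits nutritionQuantities can still read the full workbook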
        pdf_shrunk = PanDatFactory(**{
            k: v
            for k, v in dietSchema().items() if k != "nutritionQuantities"
        })
        self.assertTrue(len(pdf_shrunk.all_tables) == len(pdf.all_tables) - 1)
        xlsPanDatShrunk = pdf_shrunk.xls.create_pan_dat(filePath)
        self.assertTrue(pdf_shrunk._same_data(panDat, xlsPanDatShrunk))
        filePathShrunk = os.path.join(_scratchDir, "diet_shrunk.xlsx")
        self.assertTrue(
            self.firesException(
                lambda: pdf.xls.create_pan_dat(filePathShrunk)))
        pdf_shrunk.xls.write_file(panDat, filePathShrunk)
        xlsPanDatShrunk = pdf.xls.create_pan_dat(filePathShrunk)
        self.assertTrue(pdf_shrunk._same_data(panDat, xlsPanDatShrunk))

        pdf2 = PanDatFactory(**{t: '*' for t in pdf.all_tables})
        pdf2.xls.write_file(panDat, filePath)
        xlsPanDat = pdf2.xls.create_pan_dat(filePath)
        self.assertTrue(pdf._same_data(panDat, xlsPanDat))

        tdf = TicDatFactory(**netflowSchema())
        pdf = PanDatFactory(**netflowSchema())
        ticDat = tdf.freeze_me(
            tdf.TicDat(
                **
                {t: getattr(netflowData(), t)
                 for t in tdf.primary_key_fields}))
        panDat = pan_dat_maker(netflowSchema(), ticDat)
        filePath = os.path.join(_scratchDir, "netflow.xlsx")
        pdf.xls.write_file(panDat, filePath)
        panDat2 = pdf.xls.create_pan_dat(filePath)
        self.assertTrue(pdf._same_data(panDat, panDat2))
        pdf2 = PanDatFactory(**{t: '*' for t in pdf.all_tables})
        xlsPanDat = pdf2.xls.create_pan_dat(filePath)
        self.assertTrue(pdf._same_data(panDat, xlsPanDat))
Example #15
 def testDups(self):
     if not _can_accdb_unit_test:
         return
     tdf = TicDatFactory(one = [["a"],["b", "c"]],
                         two = [["a", "b"],["c"]],
                         three = [["a", "b", "c"],[]])
     tdf2 = TicDatFactory(**{t:[[],["a", "b", "c"]] for t in tdf.all_tables})
     td = tdf2.TicDat(**{t:[[1, 2, 1], [1, 2, 2], [2, 1, 3], [2, 2, 3], [1, 2, 2], [11, 1, 2]]
                         for t in tdf.all_tables})
     f = makeCleanPath(os.path.join(_scratchDir, "testDups.accdb"))
     tdf2.mdb.write_file(td, f)
     #shutil.copy(f, "dups.accdb") #uncomment to make readonly test file as .accdb
     dups = tdf.mdb.find_duplicates(f)
     self.assertTrue(dups ==  {'three': {(1, 2, 2): 2}, 'two': {(1, 2): 3}, 'one': {1: 3, 2: 2}})
Example #16
    def testCsvSimple(self):
        if not self.can_run:
            return
        tdf = TicDatFactory(**dietSchema())
        pdf = PanDatFactory(**dietSchema())
        ticDat = tdf.freeze_me(
            tdf.TicDat(
                **{t: getattr(dietData(), t)
                   for t in tdf.primary_key_fields}))
        panDat = pan_dat_maker(dietSchema(), ticDat)
        dirPath = os.path.join(_scratchDir, "diet_csv")
        pdf.csv.write_directory(panDat, dirPath)
        panDat2 = pdf.csv.create_pan_dat(dirPath)
        self.assertTrue(pdf._same_data(panDat, panDat2))
        pdf2 = PanDatFactory(**{t: '*' for t in pdf.all_tables})
        panDat2 = pdf2.csv.create_pan_dat(dirPath)
        self.assertTrue(pdf._same_data(panDat, panDat2))

        tdf = TicDatFactory(**netflowSchema())
        pdf = PanDatFactory(**netflowSchema())
        ticDat = tdf.freeze_me(
            tdf.TicDat(
                **
                {t: getattr(netflowData(), t)
                 for t in tdf.primary_key_fields}))
        panDat = pan_dat_maker(netflowSchema(), ticDat)
        dirPath = os.path.join(_scratchDir, "netflow_csv")
        pdf.csv.write_directory(panDat, dirPath)
        panDat2 = pdf.csv.create_pan_dat(dirPath)
        self.assertTrue(pdf._same_data(panDat, panDat2))
        pdf2 = PanDatFactory(**{t: '*' for t in pdf.all_tables})
        pdf2.csv.write_directory(panDat, dirPath)
        panDat2 = pdf2.csv.create_pan_dat(dirPath)
        self.assertTrue(pdf._same_data(panDat, panDat2))

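        # re-run the diet round trip with European-style decimal commas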
        tdf = TicDatFactory(**dietSchema())
        pdf = PanDatFactory(**dietSchema())
        ticDat = tdf.freeze_me(
            tdf.TicDat(
                **{t: getattr(dietData(), t)
                   for t in tdf.primary_key_fields}))
        panDat = pan_dat_maker(dietSchema(), ticDat)
        dirPath = os.path.join(_scratchDir, "diet_csv")
        pdf.csv.write_directory(panDat, dirPath, decimal=",")
        panDat2 = pdf.csv.create_pan_dat(dirPath)
        self.assertFalse(pdf._same_data(panDat, panDat2))
        panDat2 = pdf.csv.create_pan_dat(dirPath, decimal=",")
        self.assertTrue(pdf._same_data(panDat, panDat2))
Example #17
 def testSortedTables(self):
     test1 = TicDatFactory(table3=[["PK3", "FK1", "FK2"], ["Val D"]],
                           table2=[["PK2"], ["Val A", "Val B"]],
                           table1=[["PK1"], ["Val C"]])
     test1.add_foreign_key("table3", "table1", ["FK1", "PK1"])
     test1.add_foreign_key("table3", "table2", ["FK2", "PK2"])
     self.assertTrue(tlingo._sorted_tables(test1)[-1] == 'table3')
Example #18
 def testSimplest(self):
     if not _can_unit_test:
         return
     tdf = TicDatFactory(simple_table=[["pk1"], ["df1", "df2"]])
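     # simplest.accdb is a pre-existing fixture file; this test only reads it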
     dat = tdf.mdb.create_tic_dat("simplest.accdb")
     self.assertTrue(
         len(dat.simple_table) == 3 and dat.simple_table[3]["df2"] == 2)
Example #19
    def testSpacey(self):
        if not _can_unit_test:
            return
        tdf = TicDatFactory(**spacesSchema())
        spacesData = {
            "a_table": {
                1: {
                    "a Data 3": 3,
                    "a Data 2": 2,
                    "a Data 1": 1
                },
                22: (1.1, 12, 12),
                0.23: (11, 12, 11)
            },
            "b_table": {
                ("1", "2", "3"): 1,
                ("a", "b", "b"): 12
            },
            "c_table": (("1", "2", "3", 4), {
                "c Data 4": 55,
                "c Data 2": "b",
                "c Data 3": "c",
                "c Data 1": "a"
            }, ("a", "b", "12", 24))
        }

        dat = tdf.TicDat(**spacesData)
        filePath = "spaces.accdb"
        self.assertFalse(tdf.mdb.find_duplicates(filePath))
        dat2 = tdf.mdb.create_tic_dat(filePath, freeze_it=True)
        self.assertTrue(tdf._same_data(dat, dat2))
Example #20
    def testDiet(self):
        if not _can_accdb_unit_test:
            return
        tdf = TicDatFactory(**dietSchema())
        ticDat = tdf.freeze_me(
            tdf.TicDat(
                **{t: getattr(dietData(), t)
                   for t in tdf.primary_key_fields}))
        filePath = makeCleanPath(os.path.join(_scratchDir, "diet.accdb"))
        tdf.mdb.write_file(ticDat, filePath)
        #shutil.copy(filePath, "diet.accdb") #uncomment to make readonly test file as .accdb
        self.assertFalse(tdf.mdb.find_duplicates(filePath))
        accdbTicDat = tdf.mdb.create_tic_dat(filePath)
        self.assertTrue(tdf._same_data(ticDat, accdbTicDat))

        def changeit():
            accdbTicDat.categories["calories"]["minNutrition"] = 12

        changeit()
        self.assertFalse(tdf._same_data(ticDat, accdbTicDat))

        self.assertTrue(
            self.firesException(lambda: tdf.mdb.write_file(ticDat, filePath)))
        tdf.mdb.write_file(ticDat, filePath, allow_overwrite=True)
        accdbTicDat = tdf.mdb.create_tic_dat(filePath, freeze_it=True)
        self.assertTrue(tdf._same_data(ticDat, accdbTicDat))
        self.assertTrue(self.firesException(changeit))
        self.assertTrue(tdf._same_data(ticDat, accdbTicDat))
Example #21
    def testSpacey(self):
        if not _can_accdb_unit_test:
            return
        tdf = TicDatFactory(**spacesSchema())
        spacesData = {
            "a_table": {1: {"a Data 3": 3, "a Data 2": 2, "a Data 1": 1},
                        22: (1.1, 12, 12), 0.23: (11, 12, 11)},
            "b_table": {("1", "2", "3"): 1, ("a", "b", "b"): 12},
            "c_table": (("1", "2", "3", 4),
                        {"c Data 4": 55, "c Data 2": "b", "c Data 3": "c", "c Data 1": "a"},
                        ("a", "b", "12", 24))}

        dat = tdf.TicDat(**spacesData)
        filePath = makeCleanPath(os.path.join(_scratchDir, "spacey.accdb"))
        tdf.mdb.write_schema(filePath, a_table = {"a Field":"double"},
                                       c_table = {"c Data 1":"text", "c Data 2":"text",
                                                  "c Data 3":"text", "c Data 4":"int"})
        tdf.mdb.write_file(dat, filePath)
        self.assertFalse(tdf.mdb.find_duplicates(filePath))
        dat2 = tdf.mdb.create_tic_dat(filePath, freeze_it=True)
        self.assertTrue(tdf._same_data(dat,dat2))

        with py.connect(_connection_str(filePath)) as con:
            for t in tdf.all_tables:
                con.cursor().execute("SELECT * INTO [%s] FROM %s"%(t.replace("_", " "), t)).commit()
                con.cursor().execute("DROP TABLE %s"%t).commit()
        #shutil.copy(filePath, "spaces.accdb") #uncomment to make readonly test file as .accdb
        dat3 = tdf.mdb.create_tic_dat(filePath, freeze_it=True)
        self.assertTrue(tdf._same_data(dat, dat3))
Example #22
    def testNetflow(self):
        if not self.canRun:
            return
        tdf = TicDatFactory(**netflowSchema())
        tdf.enable_foreign_key_links()
        addNetflowForeignKeys(tdf)
        oldDat = tdf.freeze_me(tdf.TicDat(**{t:getattr(netflowData(),t) for t in tdf.primary_key_fields}))
        self._test_generic_free_copy(oldDat, tdf)
        self._test_generic_free_copy(oldDat, tdf, ["arcs", "nodes"])
        ticDat = tdf.copy_to_pandas(oldDat, ["arcs", "cost"])
        self.assertTrue(all(hasattr(ticDat, t) == (t in ["arcs", "cost"]) for t in tdf.all_tables))
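        # sloc is ticdat's safe .loc for Series: multi-index slices that match nothing return empty instead of raising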
        self.assertTrue(len(ticDat.arcs.capacity.sloc["Boston",:]) == len(oldDat.nodes["Boston"].arcs_source) == 0)
        self.assertTrue(len(ticDat.arcs.capacity.sloc[:,"Boston"]) == len(oldDat.nodes["Boston"].arcs_destination) == 2)
        self.assertTrue(all(ticDat.arcs.capacity.sloc[:,"Boston"][src] == r["capacity"]
                            for src, r in oldDat.nodes["Boston"].arcs_destination.items()))
        ticDat = tdf.copy_to_pandas(oldDat, drop_pk_columns=True)
        rebornTicDat = tdf.TicDat(**{t:getattr(ticDat, t) for t in tdf.all_tables})
        # because we have single pk field tables, dropping the pk columns is problematic
        self.assertFalse(tdf._same_data(rebornTicDat, oldDat))

        # but with the default argument all is well
        ticDat = tdf.copy_to_pandas(oldDat)
        rebornTicDat = tdf.TicDat(**{t:getattr(ticDat, t) for t in tdf.all_tables})
        self.assertTrue(tdf._same_data(rebornTicDat, oldDat))
        self.assertTrue(set(ticDat.inflow.columns) == {"quantity"})
        self.assertTrue(set(ticDat.nodes.columns) == {"name"})
Example #23
 def testCaseSpaceTableNames(self):
     tdf = TicDatFactory(table_one=[["a"], ["b", "c"]],
                         table_two=[["this", "that"], []])
     dir_path = os.path.join(_scratchDir, "case_space")
     dat = tdf.TicDat(table_one=[['a', 2, 3], ['b', 5, 6]],
                      table_two=[["a", "b"], ["c", "d"], ["x", "z"]])
     tdf.csv.write_directory(dat,
                             makeCleanDir(dir_path),
                             case_space_table_names=True)
     self.assertTrue(
         all(
             os.path.exists(os.path.join(dir_path, _ + ".csv"))
             for _ in ["Table One", "Table Two"]))
     self.assertFalse(
         any(
             os.path.exists(os.path.join(dir_path, _ + ".csv"))
             for _ in ["table_one", "table_two"]))
     self.assertTrue(tdf._same_data(dat, tdf.csv.create_tic_dat(dir_path)))
     tdf.csv.write_directory(dat,
                             makeCleanDir(dir_path),
                             case_space_table_names=False)
     self.assertFalse(
         any(
             os.path.exists(os.path.join(dir_path, _ + ".csv"))
             for _ in ["Table One", "Table Two"]))
     self.assertTrue(
         all(
             os.path.exists(os.path.join(dir_path, _ + ".csv"))
             for _ in ["table_one", "table_two"]))
     self.assertTrue(tdf._same_data(dat, tdf.csv.create_tic_dat(dir_path)))
Example #24
    def testDietCleaningOpalyticsThree(self):
        tdf = TicDatFactory(**dietSchema())
        tdf.add_data_row_predicate("categories",
                                   lambda row: row["maxNutrition"] >= 66)
        addDietForeignKeys(tdf)
        ticDat = tdf.copy_tic_dat(dietData())

        pdf = PanDatFactory(**tdf.schema())
        pdf.add_data_row_predicate("categories",
                                   lambda row: row["maxNutrition"] >= 66)
        addDietForeignKeys(pdf)
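        # pdf mirrors tdf's predicate and foreign keys, so cleaning behaves identically for TicDat and PanDat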

        input_set = create_inputset_mock(tdf, ticDat)

        panDat = pdf.opalytics.create_pan_dat(input_set, raw_data=True)
        self.assertTrue(tdf._same_data(pdf.copy_to_tic_dat(panDat), ticDat))

        panDatPurged = pdf.opalytics.create_pan_dat(input_set, raw_data=False)
        self.assertFalse(
            tdf._same_data(pdf.copy_to_tic_dat(panDatPurged), ticDat))

        ticDat.categories.pop("fat")
        self.assertFalse(
            tdf._same_data(pdf.copy_to_tic_dat(panDatPurged), ticDat))
        tdf.remove_foreign_key_failures(ticDat)
        self.assertTrue(
            tdf._same_data(pdf.copy_to_tic_dat(panDatPurged), ticDat))
Example #25
 def testDietWithInfFlagging(self):
     diet_pdf = PanDatFactory(**dietSchema())
     addDietDataTypes(diet_pdf)
     tdf = TicDatFactory(**dietSchema())
     dat = tdf.copy_to_pandas(tdf.copy_tic_dat(dietData()),
                              drop_pk_columns=False)
     diet_pdf.set_infinity_io_flag(999999999)
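     # 999999999 stands in for +/-inf in every file written below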
     core_path = os.path.join(_scratchDir, "diet_with_inf_flagging")
     diet_pdf.sql.write_file(dat, core_path + ".db")
     diet_pdf.csv.write_directory(dat, core_path + "_csv")
     diet_pdf.json.write_file(dat, core_path + ".json")
     diet_pdf.xls.write_file(dat, core_path + ".xlsx")
     for attr, f in [["sql", core_path + ".db"],
                     ["csv", core_path + "_csv"],
                     ["json", core_path + ".json"],
                     ["xls", core_path + ".xlsx"]]:
         dat_1 = getattr(diet_pdf, attr).create_pan_dat(f)
         self.assertTrue(diet_pdf._same_data(dat, dat_1, epsilon=1e-5))
         pdf = diet_pdf.clone()
         dat_1 = getattr(pdf, attr).create_pan_dat(f)
         self.assertTrue(pdf._same_data(dat, dat_1, epsilon=1e-5))
         pdf = PanDatFactory(**diet_pdf.schema())
         dat_1 = getattr(pdf, attr).create_pan_dat(f)
         self.assertFalse(pdf._same_data(dat, dat_1, epsilon=1e-5))
         protein = dat_1.categories["name"] == "protein"
         self.assertTrue(
             list(dat_1.categories[protein]["maxNutrition"])[0] ==
             999999999)
         dat_1.categories.loc[protein, "maxNutrition"] = float("inf")
         self.assertTrue(pdf._same_data(dat, dat_1, epsilon=1e-5))
Example #26
    def testDenormalizedErrors(self):
        if not self.canRun:
            return
        c = clean_denormalization_errors
        f = utils.find_denormalized_sub_table_failures
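        # shorthands: f finds denormalized sub-table failures, c normalizes them for comparison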
        tdf = TicDatFactory(**spacesSchema())
        dat = tdf.TicDat(**spacesData())
        p = lambda: tdf.copy_to_pandas(dat, drop_pk_columns=False).b_table
        self.assertFalse(f(p(), "b Field 1", ("b Field 2", "b Field 3")))
        dat.b_table[2, 2, 3] = "boger"
        self.assertFalse(f(p(), "b Field 1", ("b Field 2", "b Field 3")))
        chk = f(p(), "b Field 2", ("b Field 1", "b Field 3"))
        self.assertTrue(c(chk) == {2: {'b Field 1': {1, 2}}})
        dat.b_table[2, 2, 4] = "boger"
        dat.b_table[1, 'b', 'b'] = "boger"
        chk = f(p(), ["b Field 2"], ("b Field 1", "b Field 3", "b Data"))
        self.assertTrue(c(chk) == c({2: {'b Field 3': (3, 4), 'b Data': (1, 'boger'), 'b Field 1': (1, 2)},
                                     'b': {'b Data': ('boger', 12), 'b Field 1': ('a', 1)}}))

        ex = self.firesException(lambda: f(p(), ["b Data"], "wtf"))
        self.assertTrue("wtf isn't a column" in ex)

        p = lambda: tdf.copy_to_pandas(dat, drop_pk_columns=False).c_table
        chk = f(p(), pk_fields=["c Data 1", "c Data 2"], data_fields=["c Data 3", "c Data 4"])
        self.assertTrue(c(chk) == {('a', 'b'): {'c Data 3': {'c', 12}, 'c Data 4': {24, 'd'}}})
        dat.c_table.append((1, 2, 3, 4))
        dat.c_table.append((1, 2, 1, 4))
        dat.c_table.append((1, 2, 1, 5))
        dat.c_table.append((1, 2, 3, 6))
        chk = f(p(), pk_fields=["c Data 1", "c Data 2"], data_fields=["c Data 3", "c Data 4"])
        self.assertTrue(c(chk) == {('a', 'b'): {'c Data 3': {'c', 12}, 'c Data 4': {24, 'd'}},
                                   (1, 2): {'c Data 3': {3, 1}, 'c Data 4': {4, 5, 6}}})
Example #27
    def testSillyCleaningOpalyticsOne(self):
        tdf = TicDatFactory(**sillyMeSchema())
        tdf.set_data_type("c",
                          "cData4",
                          number_allowed=False,
                          strings_allowed=['d'])
        ticDat = tdf.TicDat(**sillyMeData())

        input_set = create_inputset_mock(tdf, ticDat)

        pdf = PanDatFactory(**sillyMeSchema())
        pdf.set_data_type("c",
                          "cData4",
                          number_allowed=False,
                          strings_allowed=['d'])

        panDat = pdf.opalytics.create_pan_dat(input_set, raw_data=True)
        self.assertTrue(tdf._same_data(pdf.copy_to_tic_dat(panDat), ticDat))

        panDatPurged = pdf.opalytics.create_pan_dat(input_set, raw_data=False)
        self.assertFalse(
            tdf._same_data(pdf.copy_to_tic_dat(panDatPurged), ticDat))

        ticDat.c.pop()
        ticDat.c.pop(0)
        self.assertTrue(
            tdf._same_data(pdf.copy_to_tic_dat(panDatPurged), ticDat))
Example #28
    def testDateTime(self):
        tdf = TicDatFactory(table_with_stuffs=[["field one"], ["field two"]],
                            parameters=[["a"], ["b"]])
        tdf.add_parameter("p1", "Dec 15 1970", datetime=True)
        tdf.add_parameter("p2", None, datetime=True, nullable=True)
        tdf.set_data_type("table_with_stuffs", "field one", datetime=True)
        tdf.set_data_type("table_with_stuffs",
                          "field two",
                          datetime=True,
                          nullable=True)
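        # datetime=True fields accept datetime objects and strings that dateutil can parse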

        dat = tdf.TicDat(table_with_stuffs=[[
            "July 11 1972", None
        ], [datetime.datetime.now(),
            dateutil.parser.parse("Sept 11 2011")]],
                         parameters=[["p1", "7/11/1911"], ["p2", None]])
        self.assertFalse(
            tdf.find_data_type_failures(dat)
            or tdf.find_data_row_failures(dat))

        file_one = os.path.join(_scratchDir, "datetime.json")
        tdf.json.write_file(dat, file_one)
        dat_1 = tdf.json.create_tic_dat(file_one)
        self.assertFalse(tdf._same_data(dat, dat_1))
        self.assertTrue(
            isinstance(dat_1.parameters["p1"]["b"], datetime.datetime))
        self.assertTrue(
            all(
                isinstance(_, datetime.datetime)
                for _ in dat_1.table_with_stuffs))
        self.assertTrue(
            all(
                isinstance(_, datetime.datetime) or _ is None
                for v in dat_1.table_with_stuffs.values() for _ in v.values()))
Example #29
 def testColumnsWithoutData(self):
     tdf = TicDatFactory(data=[["a"], ["b"]])
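     # exercise both the .xls and .xlsx code paths with an empty TicDat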
     for x in ["", "x"]:
         file = os.path.join(_scratchDir, "no_data.xls" + x)
         tdf.xls.write_file(tdf.TicDat(), file)
         dat = tdf.xls.create_tic_dat(file)
         self.assertFalse(dat._len_dict())
Example #30
    def testNetflow(self):
        if not self.can_run:
            return
        tdf = TicDatFactory(**netflowSchema())
        ticDat = tdf.TicDat(**{t:getattr(netflowData(),t) for t in tdf.primary_key_fields})
        self._test_generic_copy(ticDat, tdf)
        self._test_generic_copy(ticDat, tdf, ["arcs", "nodes"])
        dirPath = os.path.join(_scratchDir, "netflow")
        tdf.csv.write_directory(ticDat, dirPath)
        csvTicDat = tdf.csv.create_tic_dat(dirPath, freeze_it=True)
        self.assertFalse(tdf.csv.find_duplicates(dirPath))
        self.assertTrue(tdf._same_data(ticDat, csvTicDat))
        csvTicDat = tdf.csv.create_tic_dat(dirPath, freeze_it=True, headers_present=False)
        self.assertFalse(tdf._same_data(ticDat, csvTicDat))
        tdf.csv.write_directory(ticDat, dirPath, write_header=False, allow_overwrite=True)
        self.assertTrue(self.firesException(lambda: tdf.csv.create_tic_dat(dirPath, freeze_it=True)))
        csvTicDat = tdf.csv.create_tic_dat(dirPath, headers_present=False, freeze_it=True)
        self.assertTrue(tdf._same_data(ticDat, csvTicDat))

        ticDat.nodes[12] = {}
        tdf.csv.write_directory(ticDat, dirPath, allow_overwrite=True)
        csvTicDat = tdf.csv.create_tic_dat(dirPath, freeze_it=True)
        self.assertTrue(tdf._same_data(ticDat, csvTicDat))

        # minor flaw - strings that are floatable get turned into floats when reading csvs
        del ticDat.nodes[12]
        ticDat.nodes['12'] = {}
        self.assertTrue(firesException(lambda: tdf.csv.write_directory(ticDat, dirPath)))
        tdf.csv.write_directory(ticDat, dirPath, allow_overwrite=True)
        csvTicDat = tdf.csv.create_tic_dat(dirPath, freeze_it=True)
        self.assertFalse(tdf._same_data(ticDat, csvTicDat))