Example #1
0
    def test_syncdf_dtypes_and_index(self, mock_updatedf, mock_insertdf, db):
        '''Tests whether syncdf (and the underlying DbManager) preserve the
        dataframe index and the column dtypes'''
        d2006 = datetime(2006, 1, 1)
        d2007 = datetime(2007, 12, 31)
        d2008 = datetime(2008, 3, 14)
        d2009 = datetime(2009, 9, 25)
        df_data = [{'id': 1, 'name': 'a', 'time': d2006}, {'id': 2, 'name': 'a', 'time': d2008},
                   {'id': 3, 'name': 'a', 'time': None}]
        self.init_db(db.session, df_data)

        # Sync a dataframe with a new item N= [{'name': 'b', 'time': d2006}]
        # Index is preserved but only because the rows
        # to insert and to update are contiguous, e.g.
        # [row2beInserted1, ..., row2beInsertedN, row2beUpdated1, .., row2beUpdatedM]
        d = pd.DataFrame(df_data + [{'name': 'b', 'time': d2006}])
        inserted, not_inserted, updated, not_updated, d2 = \
            syncdf(d, db.session, [Customer.name, Customer.time], Customer.id,
                   buf_size=1)
        # indices are preserved:
        assert (d.index == d2.index).all()
        # column dtypes not, as d2 is casted to the SQL type:
        assert d2['id'].dtype == np.int64
        # d's 'id' has a NaN (the new row), hence pandas upcast the column to float:
        assert d['id'].dtype == np.float64

        # Now a row to be inserted sits in between two rows to be updated.
        # Index is NOT preserved because the rows
        # to insert and to update are NOT contiguous, i.e.
        # [row2beUpdated1, row2beInserted, row2beUpdated2]
        df_data = [{'id': 1, 'name': 'a', 'time': d2006},
                   {'name': 'xyz', 'time': None},
                   {'name': 'a', 'time': d2008},
                   ]
        self.init_db(db.session, df_data)

        d = pd.DataFrame(df_data)
        # The middle row ('xyz') does not exist on the db and will be inserted;
        # the first and last rows already exist. Hence rows to insert and rows
        # to update are interleaved and the index order is lost:
        inserted, not_inserted, updated, not_updated, d2 = \
            syncdf(d, db.session, [Customer.name, Customer.time], Customer.id,
                   buf_size=1)

        # indices are NOT preserved (insert/update rows are not contiguous):
        assert not (d.index == d2.index).all()
        # column dtypes not, as d2 is casted to the SQL type:
        assert d2['id'].dtype == np.int64
        assert d['id'].dtype == np.float64

        # Now try with DbManager (used in the underlying syncdf) and show that
        # indices are not preserved if we add several dataframes sequentially
        # (because we use a pd.concat):
        dbm = DbManager(db.session, Customer.id, update=True, buf_size=100,
                        return_df=True)
        for _ in range(len(d)):
            dbm.add(d[_:_+1])
        dbm.close()
        idx1 = d.index
        idx2 = dbm.dataframe.index
        assert len(idx1) == len(idx2)
        assert not (idx1.values == idx2.values).all()
Example #2
0
 def test_syncdf_3(self, db):
     '''Tests syncdf with a dataframe whose 'id' column is pre-set for every
     row (to 45, a pkey value not on the db)'''
     d2006 = datetime(2006, 1, 1)
     d2007 = datetime(2007, 12, 31)
     d2008 = datetime(2008, 3, 14)
     d2009 = datetime(2009, 9, 25)
     self.init_db(db.session,
                  [{'id': 1, 'name': 'a', 'time': d2006}, {'id': 2, 'name': 'a', 'time': d2008},
                   {'id': 3, 'name': 'a', 'time': None}])
     d = pd.DataFrame([{'id': 45, 'name': 'a', 'time': None},
                       {'id': 45, 'name': 'b', 'time': d2006},
                       {'id': 45, 'name': 'c', 'time': d2009},
                       {'id': 45, 'name': 'a', 'time': None},
                       {'id': 45, 'name': 'a', 'time': d2006}])
     # The 1st, 4th and last rows already exist on the db (by (name, time)).
     # The 1st and 4th rows are duplicates of each other, and as drop
     # duplicates is True by default, both will be REMOVED.
     # We remain with the 2nd, 3rd and 5th rows. As update is False, they will
     # always be returned:
     inserted, not_inserted, updated, not_updated, d2 = \
         syncdf(d, db.session, [Customer.name, Customer.time], Customer.id)
     # {'id': 45, 'name': 'b', 'time': d2006} and
     # {'id': 45, 'name': 'c', 'time': d2009} are still in d2 because they have
     # no match on the table and a non-null id => their id (45) is left as-is.
     # {'id': 45, 'name': 'a', 'time': d2006} matches the db row with pkey 1 =>
     # its id is set to 1:
     expected_ids = [45, 45, 1]
     assert array_equal(d2['id'], expected_ids)
Example #3
0
 def test_syncdf_2_no_drop_dupes(self, db):
     """Same as `test_syncdf_2` but with drop duplicates False
     (`keep_duplicates=True`): duplicated rows are kept and synced"""
     d2006 = datetime(2006, 1, 1)
     d2008 = datetime(2008, 3, 14)
     self.init_db(db.session,
                  [{'id': 1, 'name': 'a', 'time': d2006}, {'id': 2, 'name': 'a', 'time': d2008},
                   {'id': 3, 'name': 'a', 'time': None}])
     # max pkey value currently on the db, used below to predict the
     # autoincremented ids (must be read AFTER init_db):
     mxx = _get_max(db.session, Customer.id)
     d = pd.DataFrame([{'name': 'a', 'time': None}, {'name': 'b', 'time': d2006},
                       {'name': 'b', 'time': d2006}, {'name': 'a', 'time': None},
                       {'name': 'a', 'time': d2006}])
     # {'name': 'a', 'time': None} will NOT be dropped (drop duplicates is False
     # here), thus when fetching the pkey 'id' both rows will get 3 (see above).
     # {'name': 'b', 'time': d2006} will NOT be dropped as well, and the two
     # rows will get two different (autoincremented) ids because they have no
     # counterpart saved on the db.
     # {'name': 'a', 'time': d2006} will get the 'id' 1 (see above):
     inserted, not_inserted, updated, not_updated, d2 = \
         syncdf(d, db.session, [Customer.name, Customer.time], Customer.id,
                keep_duplicates=True)
     # the returned dataframe, as it does NOT update, has the instances with an
     # already set id first (those that would be updated) and then the rest:
     assert array_equal(d2['id'], [3, 3, 1, mxx+1, mxx+2])
Example #4
0
    def test_syncdf_2(self, db):
        """Same as `test_syncdf` but the second non-existing item is conflicting with
        the first added"""
        d2006 = datetime(2006, 1, 1)
        d2008 = datetime(2008, 3, 14)
        self.init_db(db.session,
                     [{'id': 1, 'name': 'a', 'time': d2006}, {'id': 2, 'name': 'a', 'time': d2008},
                      {'id': 3, 'name': 'a', 'time': None}])
        d = pd.DataFrame([{'name': 'a', 'time': None}, {'name': 'b', 'time': d2006},
                          {'name': 'b', 'time': d2006}, {'name': 'a', 'time': None},
                          {'name': 'a', 'time': d2006}])
        # {'name': 'a', 'time': None} appears twice and will be dropped (drop
        # duplicates is True by default); the two conflicting
        # {'name': 'b', 'time': d2006} rows will be dropped as duplicates as
        # well. Only the last row remains, which already exists on the db with
        # pkey 1:
        expected_ids = [1]
        inserted, not_inserted, updated, not_updated, d2 = \
            syncdf(d, db.session, [Customer.name, Customer.time], Customer.id)
        assert array_equal(d2['id'], expected_ids)
        # i.e. inserted == 0: the single returned row was already existing
        assert inserted == len(d2)-1
Example #5
0
def dbsyncdf(dataframe,
             session,
             matching_columns,
             autoincrement_pkey_col,
             update=False,
             buf_size=10,
             keep_duplicates=False,
             return_df=True,
             cols_to_print_on_err=None):
    """Run `syncdf` with db-error logging callbacks, log the outcome and
    return the synchronized dataframe.

    :raise: :class:`FailedDownload` if the returned dataframe is empty
        (no row saved)
    """
    exc_logger = DbExcLogger(cols_to_print_on_err)

    sync_result = syncdf(dataframe, session, matching_columns,
                         autoincrement_pkey_col, update, buf_size,
                         keep_duplicates,
                         exc_logger.failed_insert,  # duplicates callback
                         exc_logger.failed_insert,  # insert-error callback
                         exc_logger.failed_update)  # update-error callback
    inserted, not_inserted, updated, not_updated, synced_df = sync_result

    table = autoincrement_pkey_col.class_
    if synced_df.empty:
        err_msg = formatmsg(
            "No row saved to table '%s'" % table.__tablename__,
            "unknown error, check log for details and db connection")
        raise FailedDownload(err_msg)
    dblog(table, inserted, not_inserted, updated, not_updated)
    return synced_df
Example #6
0
    def test_syncdf(self, mock_updatedf, mock_insertdf, db):
        '''Tests the basic syncdf behavior: insert of new rows, pkey fetching
        of existing rows, and the `update` argument'''
        d2006 = datetime(2006, 1, 1)
        d2007 = datetime(2007, 12, 31)
        d2008 = datetime(2008, 3, 14)
        d2009 = datetime(2009, 9, 25)
        self.init_db(db.session,
                     [{'id': 1, 'name': 'a', 'time': d2006}, {'id': 2, 'name': 'a', 'time': d2008},
                      {'id': 3, 'name': 'a', 'time': None}])
        # max pkey value currently on the db (read AFTER init_db):
        mxx = _get_max(db.session, Customer.id)

        d = pd.DataFrame([{'name': 'a', 'time': None}, {'name': 'b', 'time': d2006},
                          {'name': 'c', 'time': d2009}, {'name': 'a', 'time': None},
                          {'name': 'a', 'time': d2006}])
        # {'name': 'a', 'time': None} will be dropped (drop duplicates is True by default)
        # from the other three elements, the last is already existing and will not be added
        # the other two will have an autoincremented id:
        expected_ids = [1, mxx+1, mxx+2]
        inserted, not_inserted, updated, not_updated, d2 = \
            syncdf(d, db.session, [Customer.name, Customer.time], Customer.id)

        assert array_equal(d2['id'], expected_ids)
        assert inserted == len(d2)-1  # assert one is already existing
        assert updated == not_updated == 0
        assert mock_insertdf.call_count == 1

        # now try with update=True. Pass the dataframe d and change the name to 'x' for all rows
        d.loc[:, ['name']] = 'x'
        # Drop duplicates is still true, so duplicates under [Customer.name, Customer.time]
        # will be dropped: This means only {'name': 'x', 'time': d2009}
        # (previously {'name': 'c', 'time': d2009}) will be inserted
        inserted, not_inserted, updated, not_updated, d2 = \
            syncdf(d, db.session, [Customer.name, Customer.time], Customer.id, update=['name'])
        # 6 is the next autoincremented pkey (5 rows are on the db at this point):
        assert array_equal(d2['id'], [6])
        assert inserted == 1  # the single remaining row was inserted
        assert updated == not_updated == 0

        # same as above, but fetch data from the db:
        d = dbquery2df(db.session.query(Customer.id, Customer.name, Customer.time))
        # snapshot the instances currently named 'x' on the db (for the check below):
        dbase = dbquery2df(db.session.query(Customer.id,
                                            Customer.name,
                                            Customer.time).filter(Customer.name == 'x'))
        d.loc[:, ['name']] = 'x'
        inserted, not_inserted, updated, not_updated, d2 = \
            syncdf(d, db.session, [Customer.name, Customer.time], Customer.id, update=['name'])
        # assert we returned all expected instances
        assert d.drop_duplicates(subset=['name', 'time'], keep=False).equals(d2)
        # assert returned values
        assert (inserted, not_inserted, updated, not_updated) == (0, 0, 2, 0)
        # assert on the db we have the previously marked instances with 'x', PLUS the
        # added now
        dbase2 = dbquery2df(db.session.query(Customer.id,
                                             Customer.name,
                                             Customer.time).filter(Customer.name == 'x'))
        # and now assert it:
        oldids_with_x_as_name = dbase['id'].values.tolist()
        newids_with_x_as_name = d2['id'].values.tolist()
        currentids_with_x_as_name = dbase2['id'].values.tolist()
        assert sorted(oldids_with_x_as_name+newids_with_x_as_name) == \
            sorted(currentids_with_x_as_name)

        # same as above, but with drop_duplicates=False
        d = dbquery2df(db.session.query(Customer.id, Customer.name, Customer.time))
        # snapshot the instances named 'w' on the db (none expected yet):
        dbase = dbquery2df(db.session.query(Customer.id,
                                            Customer.name,
                                            Customer.time).filter(Customer.name == 'w'))
        d.loc[:, ['name']] = 'w'
        inserted, not_inserted, updated, not_updated, d2 = \
            syncdf(d, db.session, [Customer.name, Customer.time], Customer.id, update=['name'],
                   keep_duplicates=True)
        # we should have the same result of the database, as we updated ALL instances:
        # BUT: fetchsetpkeys changed to float the dtype of id, so:
        assert not d.equals(d2)
        # this should work
        d['id'] = d['id'].astype(int)
        assert d.equals(d2)
        # assert returned values
        assert (inserted, not_inserted, updated, not_updated) == (0, 0, len(d), 0)
Example #7
0
    def test_syncdf_several_combinations(self, mock_updatedf, mock_insertdf, db):
        '''Tests syncdf over the cartesian product of keep_duplicates x update
        x five input dataframes (existing / non-existing / duplicated rows)'''
        dt = datetime(2006, 1, 1, 12, 31, 7, 456789)
        EXISTING_ID = 1
        # the single row pre-existing on the db for every combination below:
        db_df = [{'id': EXISTING_ID, 'name': 'a', 'time': None}]
        dfs = [
                # all rows existing:
                pd.DataFrame([{'name': 'a', 'time': None}]),
                # no row existing:
                pd.DataFrame([{'name': 'b', 'time': None}]),
                # some exist, some not
                pd.DataFrame([{'name': 'a', 'time': None}, {'name': 'b', 'time': dt}]),
                # some exist (inserted twice), some not:
                pd.DataFrame([{'name': 'a', 'time': None}, {'name': 'a', 'time': None},
                              {'name': 'b', 'time': dt}]),
                # some exist, some not (inserted twice):
                pd.DataFrame([{'name': 'a', 'time': None}, {'name': 'b', 'time': dt},
                              {'name': 'b', 'time': dt}]),
            ]

        for keep_dup in [False, True]:
            for update in [True, False]:
                for i, d in enumerate(dfs):
                    # re-initialize db every time:
                    db.session.query(Customer).delete()
                    self.init_db(db.session, db_df)
                    assert len(db.session.query(Customer.id).all()) == len(db_df)

                    inserted, not_inserted, updated, not_updated, d2 = \
                        syncdf(d.copy(), db.session, [Customer.name, Customer.time], Customer.id,
                               keep_duplicates=keep_dup, update=update)

                    # SYNC 1 ROW WHICH EXISTS ON THE DB
                    if i == 0:
                        if update:
                            assert (inserted, not_inserted, updated, not_updated) == (0, 0, 1, 0)
                        else:
                            assert (inserted, not_inserted, updated, not_updated) == (0, 0, 0, 0)
                        assert len(d2) == 1 and d2['id'].iloc[0] == EXISTING_ID

                    # SYNC 1 ROW WHICH DOES NOT EXIST ON THE DB
                    elif i == 1:
                        assert (inserted, not_inserted, updated, not_updated) == (1, 0, 0, 0)
                        assert len(d2) == 1 and d2['id'].iloc[0] == EXISTING_ID + 1

                    # SYNC 2 ROWS: ONE EXISTS ON THE DB THE OTHER NOT
                    elif i == 2:
                        if update:
                            assert (inserted, not_inserted, updated, not_updated) == (1, 0, 1, 0)
                        else:
                            assert (inserted, not_inserted, updated, not_updated) == (1, 0, 0, 0)
                        assert len(d2) == 2 and sorted(d2['id'].values.tolist()) == \
                            [EXISTING_ID, EXISTING_ID+1]

                    # SYNC 3 ROWS: TWO EXISTS AND ARE THE SAME (DUPLICATED),
                    # THE OTHER DOES NOT EXIST ON THE DB
                    elif i == 3:
                        if not keep_dup:  # drop duplicates (all)
                            # rows duplicates  will be dropped,
                            # it remains only the non-existing row, thus this is the
                            # same as no row existing (i==1)
                            assert (inserted, not_inserted, updated, not_updated) == (1, 0, 0, 0)
                            assert len(d2) == 1 and d2['id'].iloc[0] == EXISTING_ID + 1
                        else:
                            if update:
                                # rows duplicates will NOT be dropped. They exist, hence
                                # their id will be fetched and
                                # they will NOT be inserted, thus they will be updated.
                                # The other row will be inserted (not updated)
                                assert (inserted, not_inserted, updated, not_updated) == \
                                    (1, 0, 2, 0)
                            else:
                                # rows duplicates will NOT be dropped, Their id will be fetched and
                                # they will NOT be inserted. The other row will be inserted
                                assert (inserted, not_inserted, updated, not_updated) == \
                                    (1, 0, 0, 0)

                            assert len(d2) == 3 and sorted(d2['id'].values.tolist()) == \
                                [EXISTING_ID, EXISTING_ID, EXISTING_ID+1]

                    # SYNC 3 ROWS: ONE EXIST ON THE DB, TWO DO NOT EXISTS AND ARE THE SAME (DUPLICATED)
                    elif i == 4:
                        if not keep_dup:  # drop duplicates (all)
                            if update:
                                # rows duplicates will be dropped,  it remains only the already
                                # existing row, thus this is the same as all row existing (i==0)
                                assert (inserted, not_inserted, updated, not_updated) == \
                                    (0, 0, 1, 0)
                            else:
                                # rows duplicates will be dropped,
                                # it remains only the already existing row,
                                # thus this is the same as all row existing (i==0)
                                assert (inserted, not_inserted, updated, not_updated) == \
                                    (0, 0, 0, 0)
                            assert len(d2) == 1 and d2['id'].iloc[0] == EXISTING_ID
                        else:
                            if update:
                                # rows duplicates will NOT be dropped. They do not exist,
                                # hence their id will not be fetched thus it will be assigned
                                # incrementally. The other row will be updated (it exists)
                                assert (inserted, not_inserted, updated, not_updated) == \
                                    (2, 0, 1, 0)
                            else:
                                # rows duplicates will NOT be dropped, Their id will not be
                                # fetched thus it will be assigned incrementally
                                assert (inserted, not_inserted, updated, not_updated) == \
                                    (2, 0, 0, 0)
                            assert len(d2) == 3 and sorted(d2['id'].values.tolist()) == \
                                [EXISTING_ID, EXISTING_ID+1, EXISTING_ID+2]
Example #8
0
    def test_syncdf_several_combinations_constraints(self, mock_updatedf, mock_insertdf, db):
        '''Tests syncdf with rows that violate a db constraint (NULL name),
        checking counts and that the error callbacks fire'''
        dt = datetime(2006, 1, 1, 12, 31, 7, 456789)
        EXISTING_ID = 1
        # the single row pre-existing on the db for every combination below:
        db_df = [{'id': EXISTING_ID, 'name': 'a', 'time': None}]
        dfs = [
                # non existing, violates constraint (name NULL):
                pd.DataFrame([{'name': None, 'time': None}]),
                # existing, violates constraint (name NULL):
                pd.DataFrame([{'id': 1, 'name': None, 'time': None}]),
                # non existing, violates constraint (name NULL) and non existing (OK)
                pd.DataFrame([{'name': None, 'time': None}, {'name': 'x', 'time': None}]),
                # existing, violates constraint (name NULL)  and non existing (OK)
                pd.DataFrame([{'id': 1, 'name': None, 'time': None}, {'name': 'x', 'time': None}]),
                # non existing, violates constraint (name NULL) and existing (OK)
                pd.DataFrame([{'name': None, 'time': None}, {'name': 'a', 'time': None}]),
                # existing, violates constraint (name NULL)  and existing (OK)
                pd.DataFrame([{'id': 1, 'name': None, 'time': None}, {'name': 'a', 'time': None}]),
            ]

        # stupid hack to access these vars from within functions below
        # Python 3 has better ways, we still need python2 compatibility:
        inserterr_callcount = [0]
        updateerr_callcount = [0]

        def onerri(dataframe, exc):
            # insert-error callback: count the calls
            assert isinstance(dataframe, pd.DataFrame)
            assert isinstance(exc, SQLAlchemyError)
            inserterr_callcount[0] += 1

        def onerru(dataframe, exc):
            # update-error callback: count the calls
            assert isinstance(dataframe, pd.DataFrame)
            assert isinstance(exc, SQLAlchemyError)
            updateerr_callcount[0] += 1

        # NOTE(review): only update=True and buf_size=1 are exercised, so the
        # `buf_size != 1` branches below are currently dead — confirm whether
        # more loop values were intended:
        for update in [True]:
            for buf_size in [1]:
                for i, d in enumerate(dfs):
                    # re-initialize db every time:
                    inserterr_callcount[0] = 0
                    updateerr_callcount[0] = 0
                    db.session.query(Customer).delete()
                    self.init_db(db.session, db_df)
                    assert len(db.session.query(Customer.id).all()) == len(db_df)

                    inserted, not_inserted, updated, not_updated, d2 = \
                        syncdf(d.copy(), db.session, [Customer.name, Customer.time], Customer.id,
                               buf_size=buf_size, update=update, oninsert_err_callback=onerri,
                               onupdate_err_callback=onerru)

                    # callbacks must have fired iff some row was not inserted/updated:
                    if not_inserted:
                        assert inserterr_callcount[0] > 0
                    else:
                        assert inserterr_callcount[0] == 0
                    if not_updated:
                        assert updateerr_callcount[0] > 0
                    else:
                        assert updateerr_callcount[0] == 0

                    # SYNC 1 ROW WHICH DOES NOT EXIST AND VIOLATES CONSTRAINTS
                    if i == 0:
                        assert (inserted, not_inserted, updated, not_updated) == (0, 1, 0, 0)
                        assert len(d2) == 0

                    # SYNC 1 ROW WHICH EXISTS AND VIOLATES CONSTRAINTS
                    elif i == 1:
                        if update:
                            assert (inserted, not_inserted, updated, not_updated) == (0, 0, 0, 1)
                            assert len(d2) == 0
                        else:
                            assert (inserted, not_inserted, updated, not_updated) == (0, 0, 0, 0)
                            assert len(d2) == 1 and d2['id'].iloc[0] == EXISTING_ID

                    # SYNC 2 ROWS: ONE DOES NOT EXIST (AND VIOLATES CONSTRAINTS),
                    # THE OTHER DOES NOT EXIST (AND IS OK)
                    elif i == 2:
                        if buf_size != 1:
                            # bad instances fall in the same chunk and thus are not inserted
                            assert (inserted, not_inserted, updated, not_updated) == (0, 2, 0, 0)
                            assert len(d2) == 0
                        else:
                            # with buf_size=1 each row is its own chunk: only the bad
                            # row fails, the good one is inserted
                            assert (inserted, not_inserted, updated, not_updated) == (1, 1, 0, 0)
                            # the only instance has not EXISTING_ID+1 (that was discarded) but
                            # EXISTING_ID+2:
                            assert len(d2) == 1 and d2['id'].iloc[0] == EXISTING_ID+2

                    # SYNC 2 ROWS: ONE EXISTS (AND VIOLATES CONSTRAINTS),
                    # THE OTHER DOES NOT EXIST (AND IS OK)
                    elif i == 3:
                        if update:
                            assert (inserted, not_inserted, updated, not_updated) == (1, 0, 0, 1)
                            assert len(d2) == 1 and d2['id'].iloc[0] == EXISTING_ID + 1
                        else:
                            assert (inserted, not_inserted, updated, not_updated) == (1, 0, 0, 0)
                            assert len(d2) == 2 and sorted(d2['id'].values.tolist()) == \
                                [EXISTING_ID, EXISTING_ID+1]

                    # SYNC 2 ROWS: ONE DOES NOT EXIST (AND VIOLATES CONSTRAINTS),
                    # THE OTHER EXISTS (AND IS OK)
                    elif i == 4:
                        if update:
                            assert (inserted, not_inserted, updated, not_updated) == (0, 1, 1, 0)
                        else:
                            assert (inserted, not_inserted, updated, not_updated) == (0, 1, 0, 0)
                        assert len(d2) == 1 and d2['id'].iloc[0] == EXISTING_ID

                    # SYNC 2 ROWS: ONE EXISTS (AND VIOLATES CONSTRAINTS),
                    # THE OTHER EXISTS (AND IS OK)
                    elif i == 5:
                        if update:
                            if buf_size != 1:
                                # both not updated (same update chunk, which fails):
                                assert (inserted, not_inserted, updated, not_updated) == \
                                    (0, 0, 0, 2)
                                assert len(d2) == 0
                            else:
                                # the first updated, the second not
                                assert (inserted, not_inserted, updated, not_updated) == \
                                    (0, 0, 1, 1)
                                assert len(d2) == 1 and d2['id'].iloc[0] == EXISTING_ID
                        else:
                            assert (inserted, not_inserted, updated, not_updated) == \
                                (0, 0, 0, 0)
                            assert len(d2) == 2 and sorted(d2['id'].values.tolist()) == \
                                [EXISTING_ID, EXISTING_ID]