示例#1
0
def _save_changes_to_db(data):
    """
    Persists category edits found in data to the transactions in the database

    Only edits that touch exactly the CATEGORY and CATEGORY_ID columns are supported.  For each changed row:
        * CATEGORY is None: the accepted category is removed from the transaction
        * CATEGORY_ID is None: the category was entered manually, so a new Category in the ACCEPTED_CATEGORY scheme
                               is created and attached
        * otherwise: a suggestion was accepted, so the existing Category with that id is attached

    Args:
        data: Table data, passed to _get_changed_rows to extract the rows that changed

    Raises:
        dash.exceptions.PreventUpdate: If there are no changed rows
        ValueError: If a row changed anything other than the category columns

    Returns:
        None
    """
    changes = _get_changed_rows(data)
    if not changes:
        raise dash.exceptions.PreventUpdate("No changes to save")
    trx_ids = [c['id'] for c in changes]
    trxs = get_transactions_by_id(ids=trx_ids)

    allowed_changes = {CATEGORY, CATEGORY_ID}
    # NOTE(review): zip() pairs by position, so this assumes get_transactions_by_id returns exactly one transaction
    # per requested id, in the requested order - confirm against that helper
    for c, trx in zip(changes, trxs):
        these_changes = _get_changed_columns(c[CHANGED_COLUMN])

        if these_changes != allowed_changes:
            raise ValueError(
                "Invalid change - only changes to category column supported")

        if c[CATEGORY] is None:
            # Manually deleted category.  Remove existing accepted category and move on
            trx.category = None
        elif c[CATEGORY_ID] is None:
            # Manually entered - create new Category and attach
            trx.category = Category(scheme=ACCEPTED_CATEGORY, category=c[CATEGORY])
        else:
            # Accepted a suggestion.  Reuse this category by attaching to .category.
            trx.category = get_category_by_id(id=c[CATEGORY_ID])

    # To commit these objects, which came from a different session that I no longer have, merge them into a new
    # session. (from https://stackoverflow.com/a/47663833/5394584)
    # Is there a better pattern I could use for passing objects that avoids this?  Start here if current solution
    # gives problems:
    # https://stackoverflow.com/questions/48218065/programmingerror-sqlite-objects-created-in-a-thread-can-only-be-used-in-that-sa
    s = create_session()
    try:
        trxs_to_commit = [s.merge(trx) for trx in trxs]
        s.add_all(trxs_to_commit)
        s.commit()
    finally:
        # Always release the session, even if the merge or commit fails
        s.close()
示例#2
0
def get_unique_transaction_categories_as_string(category_type='all') -> List[str]:
    """
    Returns list of all unique category names as string category labels

    Args:
        category_type (str): all: returns all unique category names from the category table
                             accepted: returns only category names of categories whose id is referenced by
                                       Transaction.category_id (the "accepted" categories)

    Raises:
        ValueError: If category_type is not one of the supported values

    Returns:
        List of unique category names
    """
    # Validate before touching the database so a bad argument never opens (and leaks) a session
    if category_type not in ('all', 'accepted'):
        raise ValueError(f"Unsupported category_type '{category_type}'")

    s = create_session()
    try:
        q = s.query(Category.category)
        if category_type == 'accepted':
            # Filter with a subquery rather than fetching every category_id into Python and sending them back as
            # a literal IN list
            q = q.filter(Category.id.in_(s.query(Transaction.category_id)))
        categories = q.distinct().all()
    finally:
        s.close()

    # Category is just the first element of the tuples returned
    return [x[0] for x in categories]
示例#3
0
def classify_db_by_lookup(label_file, classify_if_not_null=False):
    """
    Currently disabled: always raises ValueError before doing any work

    The statements after the raise are unreachable.  They are the previous implementation, which classified
    transactions with a LookupClassifier built from label_file and committed the ones that received a category.

    Args:
        label_file: Passed to LookupClassifier.from_csv
        classify_if_not_null (bool): If True, all transactions are candidates; otherwise only transactions without
                                     a category

    Raises:
        ValueError: Always
    """
    raise ValueError("Need to review this.  I think this does not fit the current workflow.  Should make suggested clf")

    # ---- Unreachable from here on ----
    lookup_clf = LookupClassifier.from_csv(label_file)

    candidates = get_transactions() if classify_if_not_null else get_transactions_without_category()
    predicted = lookup_clf.predict([candidate.description for candidate in candidates])

    # The transactions are disconnected from the db, so collect the edited ones and "add" them afterwards.
    # Skipping transactions with no prediction is intended to avoid recommitting unchanged rows.
    changed = []
    for candidate, predicted_category in zip(candidates, predicted):
        if predicted_category is None:
            continue
        candidate.category = predicted_category
        changed.append(candidate)

    # Update anything we changed
    # NOTE(review): this session is never closed
    if changed:
        session = create_session()
        session.add_all(changed)
        session.commit()
示例#4
0
def db_with_unclassified(db_init):
    """
    Populates the database with transactions that mostly have no category

    Adds one transaction with category "something", one uncategorized transaction described "NoCatAvail0", and one
    uncategorized transaction per entry of LABELED_TRANSACTIONS (using the entry's "Description").

    Args:
        db_init: Not used in the body; presumably a fixture that prepares the database first - TODO confirm

    Returns:
        None
    """
    # TODO: Update this to use label table
    trxs = [
        Transaction(description="Ignore0", category="something"),
        Transaction(description="NoCatAvail0"),
    ]
    trxs.extend(Transaction(description=labeled_trx["Description"]) for labeled_trx in LABELED_TRANSACTIONS)

    s = create_session()
    try:
        s.add_all(trxs)
        s.commit()
    finally:
        # Release the session so it is not left open after the data is written
        s.close()
示例#5
0
def get_categories(scheme=None):
    """
    Returns all category entries, optionally filtered

    Args:
        scheme (str): Scheme name to match.  If falsy (None or ""), no filter is applied

    Returns:
        List of Category instances
    """
    s = create_session()
    try:
        q = s.query(Category)
        if scheme:
            q = q.filter(Category.scheme == scheme)
        # Run the query while the session is still open; closing first would execute it on a re-opened session
        # that nothing ever releases
        return q.all()
    finally:
        s.close()
示例#6
0
def db_with_desc_cat(db_init):
    """
    Populates the database with one transaction per entry of LABELED_TRANSACTIONS, each with a suggested category

    For every entry, a Category in scheme "from_test" is built from the entry's 'category' value and appended to the
    transaction's suggested categories.  If the entry has a truthy "accept_category", that same Category is also set
    as the transaction's category.  All other keys of the entry are passed to Transaction as keyword arguments.

    Args:
        db_init: Not used in the body; presumably a fixture that prepares the database first - TODO confirm

    Returns:
        None
    """
    trxs = []
    for labeled_trx in LABELED_TRANSACTIONS:
        print(f"labeled_trx = {labeled_trx}")
        category = Category(category=str(labeled_trx['category']), scheme="from_test")

        # Everything except the category bookkeeping keys maps directly onto Transaction attributes
        kwargs = {k: v for k, v in labeled_trx.items() if k not in ["category", "accept_category"]}
        trx = Transaction(**kwargs)

        trx.categories_suggested.append(category)
        if labeled_trx.get("accept_category"):
            trx.category = category

        trxs.append(trx)

    s = create_session()
    try:
        s.add_all(trxs)
        s.commit()
    finally:
        # Release the session so it is not left open after the data is written
        s.close()
示例#7
0
def get_category_by_id(id: int) -> Category:
    """
    Returns the category with a given id

    Args:
        id: Value to match against Category.id

    Raises:
        ValueError: If the query does not return exactly one category

    Returns:
        Category
    """
    s = create_session()
    try:
        categories = s.query(Category).filter(Category.id == id).all()
    finally:
        # Release the session on every path, like the other getters do
        s.close()

    if len(categories) != 1:
        raise ValueError(f"Could not find category with id={id}")
    return categories[0]
示例#8
0
def test_lookup_classifier(db_with_unclassified, create_label_file):
    """
    Checks that classify_db_by_lookup categorizes the uncategorized transactions without overwriting existing ones

    Expects 3 uncategorized transactions before classification and 1 after, and that the first transaction still has
    category "something".
    """
    print(f"db_with_unclassified = {db_with_unclassified}")
    print(f"create_label_file = {create_label_file}")

    session = create_session()

    # Sanity check the starting state before classifying: 3 transactions should be uncategorized
    n_uncategorized_before = len(get_transactions_without_category())
    assert n_uncategorized_before == 3

    classify_db_by_lookup(create_label_file)

    n_uncategorized_after = len(get_transactions_without_category())
    assert n_uncategorized_after == 1

    # Spot check that the pre-existing category was not overwritten
    # TODO: Update this to use category table
    first_trx = session.query(Transaction).all()[0]
    assert first_trx.category == "something"
示例#9
0
def accept_current_chosen_categories(scheme="accepted"):
    """
    Moves all categories used by Transaction.category to the "accepted" scheme, removing any stale "accepted" categories

    Stale "accepted" categories are any categories in the accepted scheme now that are no longer attached to a
    Transaction.category

    Args:
        scheme (str): Scheme name for the "accepted" scheme

    Side Effects:
        Removes all categories in scheme that are no longer accepted
        Modifies all categories that are accepted to now be in the accepted scheme
        Prints progress messages to stdout

    Returns:
        None
    """
    accepted_categories = get_accepted_categories()
    print(f"len(accepted_categories) = {len(accepted_categories)}")

    s = create_session()
    try:
        # Update anything that is accepted to the accepted scheme.  The categories came from another session, so
        # merge them into this one
        for c in accepted_categories:
            c.scheme = scheme
            s.merge(c)
        s.commit()
        print("updated accepted schemes")

        # Remove anything that is in the accepted scheme but is no longer accepted.
        # NULL category_ids must be excluded from the subquery: in SQL, "x NOT IN (..., NULL)" is never true, so a
        # single uncategorized transaction would otherwise prevent every stale category from being found
        category_ids_in_use = (s.query(Transaction.category_id)
                                .filter(Transaction.category_id.isnot(None)))
        stale_accepted = (s.query(Category)
                           .filter(Category.scheme == scheme)
                           .filter(Category.id.notin_(category_ids_in_use))
                           .all())
        print(f"len(stale_accepted) = {len(stale_accepted)}")
        for c in stale_accepted:
            s.delete(c)
        s.commit()
        print("Deleted stale")
    finally:
        # Always release the session, even if one of the commits fails
        s.close()
示例#10
0
def get_transactions(return_type='list', lazy=False, filters=tuple()) -> List[Transaction]:
    """
    Returns all transactions as specified type

    # TODO: Consolidate other get's to this one?

    Args:
        return_type (str): One of:
                            list: returns as [Transaction]
                            df: returns as pd.DataFrame with one row per transaction and all attributes as columns
        lazy (bool): If True, lazily load transactions (thus information about linked categories will not be available).
                     If False, eagerly load the category objects as well using joinedload
        filters: Iterable of arguments to pass to query.filter, such as (transaction.category.is_(None),)

    Raises:
        ValueError: If return_type is not one of the supported values (raised before the database is queried)

    Returns:
        See return_type
    """
    # Validate up front so an invalid argument does not cost a full table query first
    if return_type not in ('list', 'df'):
        raise ValueError(f"Invalid return_type {return_type}")

    s = create_session()
    try:
        q = s.query(Transaction).filter(*filters)
        if not lazy:
            # Load the related categories now, because they cannot be loaded once the session is closed
            q = q.options(
                joinedload(Transaction.category),
                joinedload(Transaction.categories_suggested),
            )
        trxs = q.all()
    finally:
        s.close()

    if return_type == 'df':
        trxs = transactions_to_dataframe(trxs)
    return trxs
示例#11
0
def add_transactions_from_dataframe(df: pd.DataFrame, accept_category: bool = False):
    """
    Puts rows of df to the transactions db, with any categories added as suggested categories in the category table

    Optionally, can accept the categories and attach them to transactions as the selected category.  Otherwise, accepted
    category is left blank

    TODO: This is directly tied to the transaction extractor (implicitly linked by assuming the df naming conventions),
          should this just be a method on that class?  Or, I should move the nomenclature definition somewhere central

    Args:
        df (pd.DataFrame): Pandas dataframe with rows of transactions
        accept_category (bool): If True, any categories in the DataFrame will also be "accepted" on the committed
                                transactions

    Returns:
        None
    """
    transactions = dataframe_to_transactions(df, accept_category)

    s = create_session()
    try:
        s.add_all(transactions)
        s.commit()
    finally:
        # Release the session even if the commit fails (e.g. rows rejected by the database)
        s.close()
示例#12
0
def get_transaction_by_id(transaction_id) -> Transaction:
    """
    Returns the first transaction whose id equals transaction_id

    Args:
        transaction_id: Value to match against Transaction.id

    Returns:
        The first result of the query (Query.first() gives None when there is no match)
    """
    session = create_session()
    matching = session.query(Transaction).filter(Transaction.id == transaction_id)
    found = matching.first()
    session.close()
    return found
示例#13
0
def get_transactions_without_category() -> List[Transaction]:
    """
    Returns all transactions for which Transaction.category is not set

    Returns:
        List of Transaction instances
    """
    session = create_session()
    uncategorized_query = session.query(Transaction).filter(Transaction.category.is_(None))
    uncategorized = uncategorized_query.all()
    session.close()
    return uncategorized
示例#14
0
def classify_by_model(scheme, clf, if_scheme_exists="replace", n_classifications_per_trx=3):
    """
    Classifies transactions in the DB, putting these classifications into the DB as suggested categories

    Args:
        scheme (str): Scheme name to use for suggested categories
        clf: classification model that has a .predict method for turning descriptions into categories
        if_scheme_exists (str): One of:
                                    replace: Removes all existing classifications with scheme==scheme after the new
                                             classifications have been committed
                                    raise: Raises a ValueError if any categories already exist with this scheme
                                    ignore: Does nothing (these transactions will then be "added" to the existing ones
                                            in the same scheme)
        n_classifications_per_trx (int): Maximum number suggested categories to create per transactions.  Must be 1
                                         unless clf is a CommonUsageClassifier

    Side Effects:
        db suggested categories table is updated

    Raises:
        ValueError: If if_scheme_exists is not a supported value, or is 'raise' and the scheme is already in use
        NotImplementedError: If n_classifications_per_trx != 1 for a clf that is not a CommonUsageClassifier

    Returns:
        None
    """
    if if_scheme_exists == 'raise':
        if get_categories(scheme=scheme):
            raise ValueError("Scheme already in use")
        suggested_to_delete = []
    elif if_scheme_exists == 'ignore':
        suggested_to_delete = []
    elif if_scheme_exists == 'replace':
        # Get all suggested categories already in this scheme.  If the classification goes successfully, we will remove
        # these records
        # TODO: Delay this till after clf but before committing new categories?
        suggested_to_delete = [category.id for category in get_categories(scheme=scheme)]
    else:
        raise ValueError(f"Invalid value for if_scheme_exists '{if_scheme_exists}'")

    trxs = get_transactions()

    df_trxs = transactions_to_dataframe(trxs)

    # TODO: Need to change the CommonUsageClassifier to use the sklearn standards...
    if isinstance(clf, CommonUsageClassifier):
        predictions = clf.predict(df_trxs['description'], n=n_classifications_per_trx)
    else:
        if n_classifications_per_trx != 1:
            raise NotImplementedError("n_classifications_per_trx for general models not yet implemented")
        predictions = clf.predict(df_trxs['description'])
        predictions = pd.DataFrame(predictions, index=df_trxs['description'])
    category_objs_by_trx = predictions_to_category_objs(predictions, scheme=scheme)

    for trx, this_category_list in zip(trxs, category_objs_by_trx):
        trx.categories_suggested.extend(this_category_list)

    s = create_session()
    try:
        # This requires a separate insert per category, but bulk_save_objects() doesn't work across relationships.  To
        # speed this up, I'd have to build my own logic for inserting things separately as bulk ops and then updating
        # them accordingly(?).  But thankfully the scale I need now is fine as is.
        s.add_all(trxs)
        s.commit()

        # Remove old categories that shouldn't be there anymore
        if suggested_to_delete:
            # Delete, avoiding the ORM because it'll do each delete separately!
            delete_q = Category.__table__.delete().where(Category.id.in_(suggested_to_delete))
            s.execute(delete_q)
            s.commit()
    finally:
        # Always release the session, even if a commit or the delete fails
        s.close()
示例#15
0
def get_accepted_categories():
    """
    Returns every Category whose id is referenced by a Transaction.category_id

    Returns:
        List of Category instances
    """
    session = create_session()
    category_ids_in_use = session.query(Transaction.category_id)
    in_use = session.query(Category).filter(Category.id.in_(category_ids_in_use)).all()
    session.close()
    return in_use