Пример #1
0
 def test_load_training_data_use_existing(self):
     """Check that existing entries are returned when no training data is given.

     ``ml.load_training_data`` is called with ``training_data=None`` and the
     fixture entries as ``existing_entries``; the result must compare equal
     to those entries.
     """
     # The last dotted component of the unittest id is the test method name.
     test_name = self.id().split('.')[-1]
     logger.info("Running Test Case: {id}".format(id=test_name))

     entries = self.test_data
     loaded = ml.load_training_data(training_data=None,
                                    existing_entries=entries)
     self.assertEqual(entries, loaded)
Пример #2
0
    def enhance_transactions(self):
        """Train an account classifier and annotate the imported transactions.

        The training data is loaded via ``ml.load_training_data`` and expanded
        into ``ml.TxnPostingAccount`` items: one item per ordered pair of
        postings in a transaction whose accounts differ.  A linear SVC is then
        fitted on narration, reference-account, (optionally) payee and
        day-of-month features, using the labels produced by
        ``ml.GetPostingAccount``.

        Depending on ``self.predict_second_posting`` and
        ``self.suggest_accounts``, predicted postings and ranked account
        suggestions are attached to the imported transactions.

        Returns:
            ``self.imported_transactions`` unchanged when no model could be
            trained (fewer than two training items); otherwise the annotated
            ``self.transactions``.
        """
        # load training data
        self.training_data = ml.load_training_data(
            self.training_data,
            filter_training_data_by_account=self.
            filter_training_data_by_account,
            existing_entries=self.existing_entries)

        # convert training data to a list of TxnPostingAccounts
        self.converted_training_data = [
            ml.TxnPostingAccount(t, p, pRef.account)
            for t in self.training_data for pRef in t.postings
            for p in t.postings if p.account != pRef.account
        ]

        # train the machine learning model
        self._trained = False
        if not self.converted_training_data:
            logger.warning("Cannot train the machine learning model "
                           "because the training data is empty.")
        elif len(self.converted_training_data) < 2:
            logger.warning(
                "Cannot train the machine learning model "
                "because the training data consists of less than two elements."
            )
        else:

            def text_pipeline(step_name, extractor):
                # 1- to 3-gram counts over whatever `extractor` produces
                return Pipeline([
                    (step_name, extractor),
                    ('vect', CountVectorizer(ngram_range=(1, 3))),
                ])

            transformers = [
                ('narration',
                 text_pipeline('getNarration', ml.GetNarration())),
                ('account',
                 text_pipeline('getReferencePostingAccount',
                               ml.GetReferencePostingAccount())),
            ]
            transformer_weights = {'narration': 0.8, 'account': 0.8}

            # The payee feature is only added when the training items carry
            # more than one distinct payee.
            distinct_payees = {
                item.txn.payee for item in self.converted_training_data
            }
            if len(distinct_payees) > 1:
                transformers.append(
                    ('payee', text_pipeline('getPayee', ml.GetPayee())))
                transformer_weights['payee'] = 0.5

            transformers.append((
                'dayOfMonth',
                Pipeline([
                    ('getDayOfMonth', ml.GetDayOfMonth()),
                    ('caster',
                     ml.ArrayCaster()),  # need for issue with data shape
                ])))
            transformer_weights['dayOfMonth'] = 0.1

            self.pipeline = Pipeline([
                ('union',
                 FeatureUnion(transformer_list=transformers,
                              transformer_weights=transformer_weights)),
                ('svc', SVC(kernel='linear')),
            ])
            logger.debug("About to train the machine learning model...")
            self.pipeline.fit(
                self.converted_training_data,
                ml.GetPostingAccount().transform(self.converted_training_data))
            logger.info("Finished training the machine learning model.")
            self._trained = True

        if not self._trained:
            logger.warning(
                "Cannot generate predictions or suggestions "
                "because there is no trained machine learning model.")
            return self.imported_transactions

        # predict missing second postings
        self.transactions = self.imported_transactions
        if self.predict_second_posting:
            logger.debug(
                "About to generate predictions for missing second postings...")
            predicted_accounts = self.pipeline.predict(
                self.imported_transactions)
            self.transactions = [
                ml.add_posting_to_transaction(txn, account)
                for txn, account in zip(self.transactions, predicted_accounts)
            ]
            logger.debug(
                "Finished adding predicted accounts to the transactions to be imported."
            )

        # suggest accounts that are likely involved in the transaction
        if self.suggest_accounts:
            # get values from the SVC decision function
            logger.debug(
                "About to generate suggestions about related accounts...")
            decision_values = self.pipeline.decision_function(
                self.imported_transactions)

            # With exactly two classes, SVC returns a single signed score per
            # sample (positive favours classes_[1]) instead of one score per
            # class.  Expand it so that every row lines up with classes_;
            # otherwise zipping a scalar with classes_ raises TypeError.
            if getattr(decision_values, 'ndim', 2) == 1:
                decision_values = [(-score, score)
                                   for score in decision_values]

            # pair each score with its class label (i.e., account name) and
            # rank the labels by descending score; sorting on the score only
            # keeps ties in classes_ order
            class_labels = self.pipeline.classes_
            suggestions = [[
                account for _, account in sorted(zip(scores, class_labels),
                                                 key=lambda pair: pair[0],
                                                 reverse=True)
            ] for scores in decision_values]

            # add the suggested accounts to each transaction:
            self.transactions = [
                ml.add_suggested_accounts_to_transaction(txn, ranked)
                for txn, ranked in zip(self.transactions, suggestions)
            ]
            logger.debug(
                "Finished adding suggested accounts to the transactions to be imported."
            )

        return self.transactions
Пример #3
0
    def enhance_transactions(self):
        """Train a payee classifier and annotate the imported transactions.

        The training data is loaded via ``ml.load_training_data``.  A linear
        SVC is fitted on narration, payee and day-of-month features, using the
        labels produced by ``ml.GetPayee``.

        Depending on ``self.predict_payees`` and ``self.suggest_payees``,
        predicted payees and ranked payee suggestions are attached to the
        imported transactions.

        Returns:
            ``self.imported_transactions`` unchanged when no model could be
            trained (fewer than two training entries); otherwise the annotated
            ``self.transactions``.
        """
        # load training data
        self.training_data = ml.load_training_data(
            self.training_data,
            filter_training_data_by_account=self.
            filter_training_data_by_account,
            existing_entries=self.existing_entries)

        # train the machine learning model
        self._trained = False
        if not self.training_data:
            logger.warning("Cannot train the machine learning model "
                           "because the training data is empty.")
        elif len(self.training_data) < 2:
            logger.warning(
                "Cannot train the machine learning model "
                "because the training data consists of less than two elements."
            )
        else:
            narration_features = Pipeline([
                ('getNarration', ml.GetNarration()),
                ('vect', CountVectorizer(ngram_range=(1, 3))),
            ])
            # any existing payee, if one exists
            payee_features = Pipeline([
                ('getPayee', ml.GetPayee()),
                ('vect', CountVectorizer(ngram_range=(1, 3))),
            ])
            day_of_month_features = Pipeline([
                ('getDayOfMonth', ml.GetDayOfMonth()),
                ('caster', ml.ArrayCaster()),  # need for issue with data shape
            ])

            self.pipeline = Pipeline([
                ('union',
                 FeatureUnion(
                     transformer_list=[
                         ('narration', narration_features),
                         ('payee', payee_features),
                         ('dayOfMonth', day_of_month_features),
                     ],
                     transformer_weights={
                         'narration': 0.8,
                         'payee': 0.5,
                         'dayOfMonth': 0.1
                     })),
                ('svc', SVC(kernel='linear')),
            ])
            logger.debug("About to train the machine learning model...")
            self.pipeline.fit(self.training_data,
                              ml.GetPayee().transform(self.training_data))
            logger.info("Finished training the machine learning model.")
            self._trained = True

        if not self._trained:
            logger.warning(
                "Cannot generate predictions or suggestions "
                "because there is no trained machine learning model.")
            return self.imported_transactions

        # predict payees (done exactly once; a second identical pass would
        # only recompute and overwrite the same result)
        self.transactions = self.imported_transactions
        if self.predict_payees:
            logger.debug("About to generate predictions for payees...")
            predicted_payees = self.pipeline.predict(
                self.imported_transactions)
            self.transactions = [
                ml.add_payee_to_transaction(
                    txn, payee, overwrite=self.overwrite_existing_payees)
                for txn, payee in zip(self.transactions, predicted_payees)
            ]
            logger.debug(
                "Finished adding predicted payees to the transactions to be imported."
            )

        # suggest likely payees
        if self.suggest_payees:
            # get values from the SVC decision function
            logger.debug(
                "About to generate suggestions about likely payees...")
            decision_values = self.pipeline.decision_function(
                self.imported_transactions)

            # With exactly two classes, SVC returns a single signed score per
            # sample (positive favours classes_[1]) instead of one score per
            # class.  Expand it so that every row lines up with classes_;
            # otherwise zipping a scalar with classes_ raises TypeError.
            if getattr(decision_values, 'ndim', 2) == 1:
                decision_values = [(-score, score)
                                   for score in decision_values]

            # pair each score with its class label (i.e., payee's name) and
            # rank the labels by descending score; sorting on the score only
            # keeps ties in classes_ order
            class_labels = self.pipeline.classes_
            suggested_payees = [[
                payee for _, payee in sorted(zip(scores, class_labels),
                                             key=lambda pair: pair[0],
                                             reverse=True)
            ] for scores in decision_values]

            # add the suggested payees to each transaction:
            self.transactions = [
                ml.add_suggested_payees_to_transaction(txn, ranked)
                for txn, ranked in zip(self.transactions, suggested_payees)
            ]
            logger.debug(
                "Finished adding suggested payees to the transactions to be imported."
            )

        return self.transactions
Пример #4
0
 def test_load_training_data(self):
     """Check that the sample training file yields exactly one entry.

     The file ``sample_training.beancount`` next to this test module is
     passed to ``ml.load_training_data`` as ``training_data``.
     """
     # The last dotted component of the unittest id is the test method name.
     test_name = self.id().split('.')[-1]
     logger.info("Running Test Case: {id}".format(id=test_name))

     module_dir = os.path.dirname(__file__)
     sample_path = os.path.join(module_dir, 'sample_training.beancount')
     loaded = ml.load_training_data(training_data=sample_path)

     # materialise first: the loader's result is only required to be iterable
     entry_count = len(list(loaded))
     self.assertEqual(1, entry_count)