Example #1
    def __init__(self, train=False, dataset=[]):
        """
            SimilarFinder is used to obtain the most similar cases to a given
            sample. It uses Ski-kit learn's nearest neighbor implementation
            param: train: will train the model
            param: dataset: numpy array of precedent vectors. Ignored if train
                            is set to False
        """
        if not train:
            self.model = Load.load_binary("similarity_model.bin")
            self.case_numbers = Load.load_binary("similarity_case_numbers.bin")
            self.scaler = Load.load_binary("similarity_scaler.bin")
        elif len(dataset) > 0:
            sample_set = [
                np.concatenate([vec['facts_vector'], vec['outcomes_vector']])
                for vec in dataset
            ]

            # cast each concatenated vector to int64 before scaling
            sample_set = [sample.astype(np.int64) for sample in sample_set]

            self.scaler = StandardScaler()
            sample_set = self.scaler.fit_transform(sample_set)
            self.model = NearestNeighbors(n_neighbors=5, metric='euclidean')
            self.model.fit(sample_set)
            self.case_numbers = [vec['name'] for vec in dataset]
            save = Save()
            save.save_binary("similarity_model.bin", self.model)
            save.save_binary("similarity_case_numbers.bin", self.case_numbers)
            save.save_binary("similarity_scaler.bin", self.scaler)
        else:
            raise ValueError('A non-empty dataset is required when train=True')
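Only the constructor is shown in the source. As a hedged sketch of how the trained model might be queried, the get_most_similar method below is an assumption, written to match how Example #12 calls it (returning (case_number, distance) pairs):

    def get_most_similar(self, sample):
        # Hypothetical sketch, not part of the source above: build the same
        # facts+outcomes vector, scale it with the persisted scaler and ask the
        # NearestNeighbors model for the closest precedents.
        query = np.concatenate([sample['facts_vector'], sample['outcomes_vector']])
        query = self.scaler.transform([query.astype(np.int64)])
        distances, indices = self.model.kneighbors(query)
        return [(self.case_numbers[i], d) for i, d in zip(indices[0], distances[0])]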
Example #2
    def weights_to_csv(self):
        """
        Writes all the weights to .csv format
        1) get the facts
        2) for every outcome write the weights
        :return: None
        """
        try:
            if self.model is None:
                self.model = Load.load_binary('multi_class_svm_model.bin')
                self.classifier_labels = Load.load_binary('classifier_labels.bin')
        except BaseException:
            # model binaries are missing or unreadable; nothing to export
            return None

        index = TagPrecedents().get_intent_index()
        fact_header = [" "]
        for header in index['facts_vector']:
            fact_header.append(header[1])

        with open('weights.csv', 'w') as outcsv:
            writer = csv.writer(outcsv)
            writer.writerow(fact_header)

            for i in range(len(self.model.estimators_)):
                outcome_list = [self.classifier_labels[i]]
                estimator = self.model.estimators_[i]
                try:
                    weights = estimator.coef_[0]
                    for j in range(len(weights)):
                        outcome_list.append(weights[j])
                    writer.writerow(outcome_list)
                except AttributeError:
                    pass
        Log.write('Weights saved to .csv')
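The loop above writes one header row of fact labels followed by one row per estimator, each row starting with its classifier label. A small hedged sketch for reading the file back (it assumes the label cell can be treated as an opaque string):

import csv

with open('weights.csv', newline='') as incsv:
    reader = csv.reader(incsv)
    fact_names = next(reader)[1:]   # skip the leading spacer cell
    for row in reader:
        label, weights = row[0], [float(w) for w in row[1:]]
        print(label, dict(zip(fact_names, weights)))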
Example #3
    def test_binarize_model(self):
        binary_directory = Path.binary_directory
        Path.binary_directory = Path.test_mock_precedent_directory

        s = Save()
        model = {'key': 'value'}
        s.save_binary('test_model.bin', model)
        loader = Load()
        new_model = loader.load_binary('test_model.bin')
        self.assertEqual(new_model['key'], 'value')
        os.remove(Path.binary_directory + 'test_model.bin')
        Path.binary_directory = binary_directory
Example #4
    def load(self):
        """
            Loads the regressor's different components
        """
        regressor_name = self.regressor_name
        Log.write("Loading " + '{}_regressor.bin'.format(regressor_name))
        file_path = os.path.join(Path.binary_directory, '{}_regressor.bin'.format(regressor_name))
        regressor = load_model(file_path)
        Log.write('{}_regressor.bin'.format(regressor_name) + " is successfully loaded")
        scaler = Load.load_binary('{}_scaler.bin'.format(regressor_name))
        self.model = AbstractRegressor._create_pipeline(scaler, regressor)
        self.mean_facts_vector = Load.load_binary('model_metrics.bin')['regressor'][regressor_name]['mean_facts_vector']
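load rebuilds a scaler-then-regressor chain via AbstractRegressor._create_pipeline, whose internals are not shown here. As a rough, hedged illustration of the same chaining idea using plain scikit-learn parts (the real code loads a Keras model with load_model):

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression

def create_pipeline_sketch(scaler=None, regressor=None):
    # Illustration only: pipeline.predict(X) scales X before the regressor sees it.
    return Pipeline([
        ('scaler', scaler if scaler is not None else StandardScaler()),
        ('regressor', regressor if regressor is not None else LinearRegression()),
    ])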
Example #5
    def get_ml_statistics():
        """
        Remove unnecessary data for the endpoint.

        :return:
        {
            'data_set':{
                'size': 5000
            },
            'regressor':{
                'regressor name':{
                    'std': 4,
                    'mean': 5,
                    'variance': 42
                }
            },
            'classifier':{
                'classifier name':{
                    'prediction_accuracy': 0.92,
                }
            }
        }
        """
        stat_dict = Load.load_binary('model_metrics.bin')
        for regressor_type in stat_dict['regressor']:
            stat_dict['regressor'][regressor_type].pop('mean_facts_vector',
                                                       None)
        return stat_dict
Example #6
def __dictionary_to_list():
    """

    Converts the binarized precedent_vectors dict to a list format

    precedent_vectors:{
        filename:{
            name: 'AZ-XXXXXXX.txt',
            demands_vector: [...],
            facts_vector: [...],
            outcomes_vector: [...]
        }
    }

    :return: data_list: [{
        name: 'AZ-XXXXXXX.txt',
        demands_vector: [...],
        facts_vector: [...],
        outcomes_vector: [...]
    },
    {
        ...
    }]
    """
    precedent_vector = Load.load_binary("precedent_vectors.bin")
    if precedent_vector is None:
        return []
    Log.write("Formatting data")
    # flatten the dict: the file-name keys are dropped, each value already carries its name
    data_list = list(precedent_vector.values())
    return data_list
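A toy round trip of that conversion (file names and vector contents are invented):

precedent_vectors = {
    'AZ-0000001.txt': {'name': 'AZ-0000001.txt', 'facts_vector': [0, 1], 'outcomes_vector': [1, 0]},
    'AZ-0000002.txt': {'name': 'AZ-0000002.txt', 'facts_vector': [1, 1], 'outcomes_vector': [0, 0]},
}
data_list = list(precedent_vectors.values())
print(data_list[0]['name'])   # AZ-0000001.txt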
Example #7
    def data_metrics(self):
        """
        1) Obtain the fact vectors
        2) Obtain the outcome vectors pertaining to the regressor in question
        3) Collect data metrics
            3.1) mean_facts_vector --> the average of every fact column
            3.2) standard deviation of outcomes
            3.3) variance of outcomes
            3.4) mean of outcomes
        4) persist data into a dictionary which will be binarized

        model_metrics -->
        {
            'data_set':{
                'size': 5000
            },
            'regressor':{
                'regressor name':{
                    'std': 4,
                    'mean': 5,
                    'variance': 42,
                    'mean_facts_vector': [3, 1, 5, 6, 2]
                }
            },
            'classifier':{
                'classifier name':{
                    'prediction_accuracy': 0.92,
                }
            }
        }
        :return: model_metrics
        """

        facts_vector = [x['facts_vector'] for x in self.dataset]
        outcomes_vector = [x['outcomes_vector'][self.outcome_index] for x in self.dataset]

        model_metrics = Load.load_binary('model_metrics.bin')
        if model_metrics is None:
            model_metrics = {
                'regressor': {
                    self.regressor_name: {

                    }
                }
            }
        elif 'regressor' not in model_metrics:
            model_metrics['regressor'] = {}

        self.mean_facts_vector = np.mean(facts_vector, axis=0)
        model_metrics['regressor'][self.regressor_name] = {
            'mean_facts_vector': self.mean_facts_vector,
            'std': np.std(outcomes_vector),
            'variance': np.var(outcomes_vector),
            'mean': np.mean(outcomes_vector)
        }

        return model_metrics
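A small worked example of step 3 on invented data:

import numpy as np

facts_vector = np.array([[1, 0, 2], [3, 0, 4]])
outcomes_vector = np.array([10, 20])

print(np.mean(facts_vector, axis=0))   # [2. 0. 3.]  -> mean_facts_vector
print(np.std(outcomes_vector))         # 5.0
print(np.var(outcomes_vector))         # 25.0
print(np.mean(outcomes_vector))        # 15.0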
Example #8
    def load_classifier_labels():
        """
        The prediction given by the model is a matrix with fewer dimensions
        than the total number of outcomes, because only boolean outcomes
        are kept in the prediction. We therefore have to relabel the columns.

        :return: Dict of classifier labels
            dict:{
                "column 1": <int>,
                "column 2": <int>,
                ...
             }
        """
        return Load.load_binary('classifier_labels.bin')
Example #9
    def predict(self, data):
        """
        1) Predicts an outcome given facts
        2) Predicts probability that prediction is correct
            2.1) The probability x ranges over [0, 1], where x < 0.5 means False
            2.2) The model only returns the probability that an outcome is 1
            2.3) therefore the probability that an outcome is 0 is reported as
                 1 - x when x < 0.5

        :param data: numpy([1, 0, 0, ...])
        :return: np.array([...])
        """
        if self.model is None:
            self.model = Load.load_binary("multi_class_svm_model.bin")
        data = binarize([data], threshold=0)
        probabilities = self.model.predict_proba(data)[0]
        predictions = self.model.predict(data)
        for i in range(len(probabilities)):
            prediction = predictions[0][i]
            if prediction == 0:
                probabilities[i] = 1 - probabilities[i]
            probabilities[i] = format(probabilities[i], '.2f')
        return predictions, probabilities
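A worked illustration of step 2 with invented numbers: predict_proba reports P(outcome == 1), so a column predicted 0 with probability 0.20 is reported with confidence 1 - 0.20 = 0.80.

import numpy as np

predictions = np.array([[0, 1, 0]])
probabilities = np.array([0.20, 0.91, 0.45])   # P(outcome == 1) per column

confidences = np.where(predictions[0] == 0, 1 - probabilities, probabilities)
print(np.round(confidences, 2))   # [0.8  0.91 0.55]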
Example #10
    def get_ordered_weights(self):
        """
        Sort all the facts by importance for every outcome

        1) If the classifier model isn't loaded then load it
        2) Load labels of the outcomes
        3) obtain labels of every fact
        4) for every estimator append all of its fact weights
        5) sort the facts in descending order by weight
        6) do not append facts with a weight of 0
        7) threshold facts using the order of magnitude (log10) of the mean weight
           7.1) any weight with an equal or greater order of magnitude is important
           7.2) the remaining weights mark a fact as unimportant

        ** Custom list for additional_indemnity_money
        :return: {
                    'additional_indemnity_money': {
                        'important_facts': [
                            'asker_is_landlord',
                            'tenant_rent_not_paid_more_3_weeks',
                            'tenant_owes_rent',
                            'tenant_left_without_paying',
                            'not_violent'
                        ],
                        'additional_facts': [
                            ...
                        ]
                    }
                }
        """
        if self.model is None:
            self.model = Load.load_binary('multi_class_svm_model.bin')
            self.classifier_labels = Load.load_binary('classifier_labels.bin')
            self.label_column_index = TagPrecedents().get_intent_index()
        weight_dict = {}

        for i in range(len(self.model.estimators_)):
            outcome_list = []
            estimator = self.model.estimators_[i]
            try:
                weights = estimator.coef_[0]
                for j in range(len(weights)):
                    if weights[j] > 0:
                        outcome_list.append([self.label_column_index['facts_vector'][j][1], weights[j]])

                outcome_list.sort(key=lambda x: abs(x[1]), reverse=True)
                weights = [abs(x[1]) for x in outcome_list]
                mean_power = math.log10(np.mean(np.array(weights)))
                important_facts = [x[0] for x in outcome_list if math.log10(abs(x[1])) >= mean_power]
                additional_facts = [x[0] for x in outcome_list if math.log10(abs(x[1])) < mean_power]
                if self.classifier_labels[i][0] == 'additional_indemnity_money':
                    important_facts.append('tenant_monthly_payment')
                    important_facts.append('tenant_not_paid_lease_timespan')
                    if 'tenant_not_paid_lease_timespan' in additional_facts:
                        additional_facts.remove('tenant_not_paid_lease_timespan')
                    if 'tenant_monthly_payment' in additional_facts:
                        additional_facts.remove('tenant_monthly_payment')
                weight_dict[self.classifier_labels[i][0]] = {}
                weight_dict[self.classifier_labels[i][0]]['important_facts'] = important_facts
                weight_dict[self.classifier_labels[i][0]]['additional_facts'] = additional_facts
            except AttributeError:
                print('Problem with {} prediction'.format(self.classifier_labels[i][0]))
        return weight_dict
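A worked illustration of the logarithmic threshold in step 7, with invented weights:

import math
import numpy as np

weights = [12.0, 3.0, 0.4, 0.05]
mean_power = math.log10(np.mean(np.array(weights)))   # log10(3.8625) ≈ 0.59

important = [w for w in weights if math.log10(w) >= mean_power]    # [12.0]
additional = [w for w in weights if math.log10(w) < mean_power]    # [3.0, 0.4, 0.05]
print(important, additional)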
Example #11
    def __test(self, x_test, y_test):
        """
        1) Tests model
        2) Save the accuracy to the model metrics binary

        model_metrics -->
        {
            'data_set':{
                'size': 5000
            },
            'regressor':{
                'regressor name':{
                    'std': 4,
                    'mean': 5,
                    'variance': 42,
                    'mean_facts_vector': [3, 1, 5, 6, 2]
                }
            },
            'classifier':{
                'classifier name':{
                    'prediction_accuracy': 0.92,
                }
            }
        }

        :param x_test: numpy array
        :param y_test: numpy array
        :return: None
        """
        model_metrics = Load.load_binary('model_metrics.bin')
        if model_metrics is None:
            model_metrics = {
                'classifier': {}
            }
        elif 'classifier' not in model_metrics:
            model_metrics['classifier'] = {}

        model_metrics['data_set'] = {
            'size': len(self.data_set)
        }

        index = TagPrecedents().get_intent_index()['outcomes_vector']
        Log.write("Testing Classifier")
        y_predict = self.model.predict(x_test)
        Log.write("Classifier results:\n")
        for i in range(len(y_predict[0])):
            yp = y_predict[:, [i]]
            yt = y_test[:, [i]]
            accuracy = np.sum(yp == yt) * 100.0 / len(yt)
            column_name = index[self.mlb.classes_[i]][1]
            (precision, recall, f1, _) = precision_recall_fscore_support(yt, yp)
            Log.write('Column: {}'.format(column_name))
            Log.write('Test accuracy: {}%'.format(accuracy))
            Log.write('Precision: {}'.format(precision))
            Log.write('Recall: {}'.format(recall))
            Log.write('F1: {}\n'.format(f1))

            model_metrics['classifier'][column_name] = {
                'prediction_accuracy': accuracy,
            }
        Save().save_binary('model_metrics.bin', model_metrics)
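A small worked example of the per-column accuracy computed above, on invented labels:

import numpy as np

y_test = np.array([[1, 0], [0, 0], [1, 1], [1, 0]])
y_predict = np.array([[1, 0], [1, 0], [1, 1], [0, 0]])

for i in range(y_predict.shape[1]):
    yp = y_predict[:, [i]]
    yt = y_test[:, [i]]
    accuracy = np.sum(yp == yt) * 100.0 / len(yt)
    print('column {}: {}%'.format(i, accuracy))   # column 0: 50.0%, column 1: 100.0%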
Example #12
class MlController:
    indexes = TagPrecedents().get_intent_index()
    classifier_labels = MultiClassSVM.load_classifier_labels()
    classifier_model = MultiClassSVM()
    regression_model = MultiOutputRegression()
    similar_finder = SimilarFinder()
    precedent_vectors = Load.load_binary("precedent_vectors.bin")

    @staticmethod
    def predict_outcome(input_json):
        """
        Makes a prediction based on the input json
        input_json: Dict containing the facts and demands
        The input json must be as follows:
            {
                "facts" : {
                    "fact1": 1 or 0,
                    "fact2": 1 or 0,
                    "fact3": 1 or 0,
                    etc
                }
            }

        It is not necessary to include ALL demands or facts,
        some may be omitted
        returns: a dict containing all the predictions
                 currently, its format is as follows:
                 {
                     "lease_resiliation" : 1 or 0
                 }
        """
        facts_vector = MlController.fact_dict_to_vector(input_json['facts'])
        outcome_vector, probabilities = MlController.classifier_model.predict(
            facts_vector)
        outcome_vector = outcome_vector[0]
        outcome_vector = MlController.regression_model.predict(
            facts_vector, outcome_vector)
        response = MlController.outcome_vector_to_dict(outcome_vector)
        response['probabilities_vector'] = MlController.probability_vector_to_dict(
            probabilities)
        similar_dict = {
            'facts_vector': facts_vector,
            'outcomes_vector': outcome_vector
        }
        response['similar_precedents'] = MlController.format_similar_precedents(
            MlController.similar_finder.get_most_similar(similar_dict))
        return response

    @staticmethod
    def fact_dict_to_vector(input_dict):
        """
        Converts a dictionary to vector form, readable by ML
        input_dict: dictionary containing all facts or demands
                    It is as follows:
                    {
                        "fact 1": <int>,
                        "fact 2": <int>,
                        "fact 3": <int>,
                        ...
                    }
        returns: a vector of integers
        """
        output_vector = np.zeros(len(MlController.indexes['facts_vector']))
        for index, val, data_type in MlController.indexes['facts_vector']:
            if val in input_dict:
                output_vector[index] = int(input_dict[val])
        return output_vector

    @staticmethod
    def outcome_vector_to_dict(outcome_vector):
        return_dict = {}
        for outcome_index in MlController.classifier_labels:
            label = MlController.classifier_labels[outcome_index][0]
            return_dict[label] = str(outcome_vector[outcome_index])
        return {'outcomes_vector': return_dict}

    @staticmethod
    def probability_vector_to_dict(probabilities_vector):
        return_dict = {}
        for outcome_index in MlController.classifier_labels:
            label = MlController.classifier_labels[outcome_index][0]
            return_dict[label] = str(probabilities_vector[outcome_index])
        return return_dict

    @staticmethod
    def fact_vector_to_dict(fact_vector):
        return_dict = {}
        for fact_tuple in MlController.indexes['facts_vector']:
            label = fact_tuple[1]
            return_dict[label] = str(fact_vector[fact_tuple[0]])
        return {'facts': return_dict}

    @staticmethod
    def format_similar_precedents(similarity_list):
        """
        Formats a list such as ["AZ-111111", 1.5] into a list of dicts of the form
        [{"precedent": "AZ-111111","distance": 1.5}]
        :param similarity_list: List of lists of the form ["PRECEDENT_NAME"(string), DISTANCE(number)]
        :return: A formatted list of precedents
        """
        formatted_precedents = []

        for precedent_array in similarity_list:
            precedent_vector = MlController.precedent_vectors[precedent_array[0]]
            precedent = {
                "precedent": precedent_vector['file_number'],
                "distance": precedent_array[1],
                "facts": MlController.fact_vector_to_dict(
                    precedent_vector['facts_vector'])['facts'],
                "outcomes": MlController.outcome_vector_to_dict(
                    precedent_vector['outcomes_vector'])['outcomes_vector'],
            }
            for fact_tuple in MlController.indexes['facts_vector']:
                if fact_tuple[2] == 'bool':
                    precedent['facts'][fact_tuple[1]] = bool(
                        float(precedent['facts'][fact_tuple[1]]))
            for outcome_tuple in MlController.classifier_labels.values():
                if outcome_tuple[1] == 'bool':
                    precedent['outcomes'][outcome_tuple[0]] = bool(
                        float(precedent['outcomes'][outcome_tuple[0]]))
            formatted_precedents.append(precedent)
        return formatted_precedents

    @staticmethod
    def get_weighted_facts():
        """
        :return:
            {
                'additional_indemnity_money': {
                    'important_facts': [
                        ...
                    ],
                    'additional_facts': [
                        ...
                    ]
                }
            }
        """
        return MlController.classifier_model.get_ordered_weights()

    @staticmethod
    def get_anti_facts():
        return {
            'tenant_individual_responsability': 'tenant_group_responsability',
            'tenant_lease_fixed': 'tenant_lease_indeterminate',
            'tenant_rent_not_paid_less_3_weeks':
            'tenant_rent_not_paid_more_3_weeks',
            'not_violent': 'violent'
        }

    @staticmethod
    def get_ml_statistics():
        """
        Remove unnecessary data for the endpoint.

        :return:
        {
            'data_set':{
                'size': 5000
            },
            'regressor':{
                'regressor name':{
                    'std': 4,
                    'mean': 5,
                    'variance': 42
                }
            },
            'classifier':{
                'classifier name':{
                    'prediction_accuracy': 0.92,
                }
            }
        }
        """
        stat_dict = Load.load_binary('model_metrics.bin')
        for regressor_type in stat_dict['regressor']:
            stat_dict['regressor'][regressor_type].pop('mean_facts_vector',
                                                       None)
        return stat_dict
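A hedged usage sketch of the controller's main entry point. The fact names below appear in Example #10's docstring; any fact omitted from the request simply stays 0 in fact_dict_to_vector:

request_json = {
    'facts': {
        'asker_is_landlord': 1,
        'tenant_owes_rent': 1
    }
}
prediction = MlController.predict_outcome(request_json)
print(prediction['outcomes_vector'])
print(prediction['probabilities_vector'])
print(prediction['similar_precedents'])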