Exemplo n.º 1
0
    def evaluate_statistical(self):
        """Run 10-fold stratified cross validation and count correct hits.

        For every fold, each classifier returned by
        ``initialize_classifiers()`` is fitted on the fold's training data
        and used to query every company found in the fold's test set.  For
        each query, the handles the system marked as official (affiliate)
        are compared, case-insensitively, with the handles of that
        company's profiles labelled official (affiliate).

        Returns:
            A dict with the keys ``'official_correct'`` and
            ``'affiliate_correct'``.  Each value is a flat list holding one
            count per (fold, classifier, company) query, in evaluation
            order.
        """
        trainer = Trainer(self.profiles, self.profile_type,
                          self.converter, self.network)
        training_set = trainer.generate_training_set()

        profiles = numpy.array(list(self.profiles))
        data = numpy.array(training_set.data)
        labels = numpy.array(training_set.labels)

        fold_iterator = cross_validation.StratifiedKFold(labels,
                                                         n_folds=10,
                                                         shuffle=True,
                                                         random_state=42)

        # Group the labelled profiles by company name: label 2 marks an
        # official profile, label 1 an affiliate one; others are ignored.
        official_profiles = defaultdict(list)
        affiliate_profiles = defaultdict(list)
        for entry in self.profiles:
            if entry['label'] == 2:
                target = official_profiles
            elif entry['label'] == 1:
                target = affiliate_profiles
            else:
                continue
            target[entry['name']].append(
                self.profile_type(entry['profile'], entry['posts']))

        # Use roughly 75% of the cores, but never fewer than one worker:
        # int(1 * 0.75) == 0 on a single-core machine.
        number_of_workers = max(1, int(multiprocessing.cpu_count() * 0.75))

        # This assumes we're just using Random Forest (i.e. one classifier):
        # counts from every classifier end up in the same two lists.
        # Ugly hack for now.
        classification_results = {
            'official_correct': [],
            'affiliate_correct': []
        }
        for fold, (train, test) in enumerate(fold_iterator, 1):
            classifiers = initialize_classifiers()

            training_data = data[train]
            training_labels = labels[train]

            # compress() keeps only the test profiles whose label is
            # non-zero.  The names are frozen into a list so that the
            # queries and their results are paired by position.
            test_set = itertools.compress(profiles[test], labels[test])
            company_names = list(set(x['name'] for x in test_set))
            print('Test set %s - %s companies.' % (fold, len(company_names)))

            for classifier in classifiers:
                trained = classifier['classifier'].fit(training_data,
                                                       training_labels)

                system = SingleNetworkSearcher(
                    classifier=trained,
                    searchengine=self.search_engine,
                    profile_converter=self.converter,
                    network=self.network)

                worker_pool = ProcessingPool(number_of_workers)
                all_results = worker_pool.map(system.query, company_names)

                for name, results in zip(company_names, all_results):
                    # Sets give constant-time membership tests; every
                    # marked handle is still counted individually.
                    official_handles = set(
                        x.handle.lower() for x in official_profiles[name])
                    affiliate_handles = set(
                        x.handle.lower() for x in affiliate_profiles[name])

                    official_correct = sum(
                        1 for x in results.official
                        if x['profile'].handle.lower() in official_handles)
                    affiliate_correct = sum(
                        1 for x in results.affiliate
                        if x['profile'].handle.lower() in affiliate_handles)

                    classification_results['official_correct'].append(
                        official_correct)
                    classification_results['affiliate_correct'].append(
                        affiliate_correct)

        return classification_results
Exemplo n.º 2
0
    def evaluate(self):
        """Run 10-fold stratified cross validation and collect fold metrics.

        For every fold, each classifier returned by
        ``initialize_classifiers()`` is fitted on the fold's training data
        and used to query every company found in the fold's test set.  The
        per-company positive counts produced by
        ``MetricCalculator.count_positives`` are combined into fold-level
        metrics with ``MetricCalculator.fold_metrics``, separately for the
        official and the affiliate class.

        Returns:
            A ``defaultdict(list)`` keyed by classifier type.  Each value
            holds one dict per fold with the keys ``'official'`` and
            ``'affiliate'``, mapping to that fold's metrics.
        """
        trainer = Trainer(self.profiles, self.profile_type,
                          self.converter, self.network)
        training_set = trainer.generate_training_set()

        profiles = numpy.array(list(self.profiles))
        data = numpy.array(training_set.data)
        labels = numpy.array(training_set.labels)

        fold_iterator = cross_validation.StratifiedKFold(labels,
                                                         n_folds=10,
                                                         shuffle=True,
                                                         random_state=42)

        # Group the labelled profiles by company name: label 2 marks an
        # official profile, label 1 an affiliate one; others are ignored.
        official_profiles = defaultdict(list)
        affiliate_profiles = defaultdict(list)
        for entry in self.profiles:
            if entry['label'] == 2:
                target = official_profiles
            elif entry['label'] == 1:
                target = affiliate_profiles
            else:
                continue
            target[entry['name']].append(
                self.profile_type(entry['profile'], entry['posts']))

        # Use roughly 75% of the cores, but never fewer than one worker:
        # int(1 * 0.75) == 0 on a single-core machine.
        number_of_workers = max(1, int(multiprocessing.cpu_count() * 0.75))

        classification_results = defaultdict(list)
        for fold, (train, test) in enumerate(fold_iterator, 1):
            classifiers = initialize_classifiers()

            training_data = data[train]
            training_labels = labels[train]

            # compress() keeps only the test profiles whose label is
            # non-zero.  The names are frozen into a list so that the
            # queries and their results are paired by position.
            test_set = itertools.compress(profiles[test], labels[test])
            company_names = list(set(x['name'] for x in test_set))
            print('Test set %s - %s companies.' % (fold, len(company_names)))

            for classifier in classifiers:
                classifier_name = classifier['type']
                trained = classifier['classifier'].fit(training_data,
                                                       training_labels)

                system = SingleNetworkSearcher(
                    classifier=trained,
                    searchengine=self.search_engine,
                    profile_converter=self.converter,
                    network=self.network)

                worker_pool = ProcessingPool(number_of_workers)
                all_results = worker_pool.map(system.query, company_names)

                combined_official_results = []
                combined_affiliate_results = []
                for name, results in zip(company_names, all_results):
                    # Handles the system assigned to each class.
                    marked_official_handles = [
                        x['profile'].handle.lower()
                        for x in results.official]
                    marked_affiliate_handles = [
                        x['profile'].handle.lower()
                        for x in results.affiliate]
                    marked_unrelated_handles = [
                        x['profile'].handle.lower()
                        for x in results.unrelated]

                    # Handles of this company's labelled profiles.
                    official_handles = [
                        x.handle.lower() for x in official_profiles[name]]
                    affiliate_handles = [
                        x.handle.lower() for x in affiliate_profiles[name]]

                    # For each class, everything marked as one of the
                    # other two classes is passed as the negatives.
                    official_counts = MetricCalculator.count_positives(
                        actual_handles=official_handles,
                        marked_positive_handles=marked_official_handles,
                        marked_negative_handles=(marked_affiliate_handles
                                                 + marked_unrelated_handles))
                    combined_official_results.append(official_counts)

                    affiliate_counts = MetricCalculator.count_positives(
                        actual_handles=affiliate_handles,
                        marked_positive_handles=marked_affiliate_handles,
                        marked_negative_handles=(marked_unrelated_handles
                                                 + marked_official_handles))
                    combined_affiliate_results.append(affiliate_counts)

                classification_results[classifier_name].append({
                    'official': MetricCalculator.fold_metrics(
                        combined_official_results),
                    'affiliate': MetricCalculator.fold_metrics(
                        combined_affiliate_results)
                })

        return classification_results