    def train(self, sampler=None):
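        """Train the author classifier.

        If a sampler is given, collect feature vectors for the author
        candidates on each sampled URL, label them with the sampler's
        ground truth and persist the dataset via self.save(); otherwise
        restore a previously saved dataset. The classifier is then fitted
        on the resulting data.
        """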
        if sampler:
            X = []
            y = []
            urls = sampler.get_urls()

            for url in urls:
                # calculate features, prepare for training/classification
                candidates = self.get_candidates(url)

                # transform dataset
                features = []
                authors = []
                is_author_found = False

                for candidate in candidates:
                    features.append(candidate.get_features())
                    # determine whether this is a true author or not
                    is_target = sampler.is_author(url, candidate.el, candidate.text)
                    is_author_found = is_author_found or is_target
                    authors.append(1 if is_target else 0)

                    if is_target:
                        # candidate.text may contain non-ASCII characters,
                        # so guard the debug output against UnicodeEncodeError
                        try:
                            print 'Candidate found: %s' % candidate.text
                            print candidate
                        except UnicodeEncodeError:
                            pass
                        print

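                # keep the page only if the labelled author was actually
                # matched among its candidates; otherwise it would add
                # nothing but negative examples to the training set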
                if is_author_found:
                    X.extend(features)
                    y.extend(authors)

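            # pack features and labels into arrays for the classifier
            # (assumes `array` is numpy.array imported at module level)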
            data = (array(X), array(y))
            self.save(data)
        else:
            data = self.restore()

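        # etime() is assumed to be a wall-clock timer such as time.time;
        # it is used here to report how long fitting the classifier takes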
        start = etime()
        self.classifier.fit(*data)
        end = etime()
        print 'Train classifier time: %s' % (end - start)

    def get_candidates(self, url):
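        """Fetch the page at `url`, extract author candidates with the
        feature extractor, calculate their features, and report the time
        spent on each step."""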
        print
        print 'Working with: %s' % url

        start = etime()

        html = read_web(url)
        candidates = self.feature_extractor.get_candidates(html)

        end = etime()
        print 'Gather candidates time: %s' % str(end - start)

        start = etime()

        for candidate in candidates:
            candidate.calculate_features()

        end = etime()
        print 'Calculate features time: %s' % str(end - start)

        return candidates