def train(self, sampler=None):
    if sampler:
        X = []
        y = []
        urls = sampler.get_urls()
        for url in urls:
            # calculate features, prepare for training/classification
            candidates = self.get_candidates(url)

            # transform dataset
            features = []
            authors = []
            is_author_found = False
            for candidate in candidates:
                features.append(candidate.get_features())

                # determine whether this is a true author or not
                is_target = sampler.is_author(url, candidate.el, candidate.text)
                is_author_found = is_author_found or is_target
                authors.append(1 if is_target else 0)
                if is_target:
                    print 'Candidate found: %s' % candidate.text
                    try:
                        print candidate
                    except UnicodeEncodeError:
                        pass
                    print

            # keep the page only if at least one true author was labelled on it
            if is_author_found:
                X.extend(features)
                y.extend(authors)

        data = (array(X), array(y))
        self.save(data)
    else:
        # no sampler given: reuse the previously saved training dataset
        data = self.restore()

    start = etime()
    self.classifier.fit(*data)
    end = etime()
    print 'Train classifier %s' % str(end - start)
def get_candidates(self, url):
    print
    print 'Working with: %s' % url

    start = etime()
    html = read_web(url)
    candidates = self.feature_extractor.get_candidates(html)
    end = etime()
    print 'Gather candidates time: %s' % str(end - start)

    start = etime()
    for candidate in candidates:
        candidate.calculate_features()
    end = etime()
    print 'Calculate features time: %s' % str(end - start)

    return candidates
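For illustration, here is a minimal sketch of the sampler object that train() expects: it only needs get_urls() and is_author(url, el, text), as used above. The class name LabelledSampler, the dictionary-based labelling, and the example URL and constructor call are assumptions made for this sketch, not part of the original code.

# Hypothetical sampler sketch; only get_urls() and is_author() are taken
# from the train() code above, everything else is illustrative.
class LabelledSampler(object):
    def __init__(self, labelled_pages):
        # labelled_pages maps a page URL to the known author name on that page
        self.labelled_pages = labelled_pages

    def get_urls(self):
        return list(self.labelled_pages.keys())

    def is_author(self, url, el, text):
        # treat a candidate as a true author when its text matches the label
        return text.strip() == self.labelled_pages[url]

# usage (extractor construction is hypothetical):
# extractor.train(LabelledSampler({'http://example.com/post': 'Jane Doe'}))
# extractor.train()   # without a sampler, falls back to self.restore()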