def classify_url(self, domain, page, depth=0):
    """
    Classify documents after crawling them.

    args:
        domain - the domain part of the url
        page - the path part of the url
        depth - how many levels of links to follow from the starting page

    returns:
        an array of predicted probabilities, one row per crawled page and
        one column per class
    """
    # crawl the starting page (and linked pages, up to the requested depth)
    documents, _ = crawl_page(domain, page, depth=depth)

    # reduce each document to a space-joined string of known vocabulary words
    string_data = []
    for url, doc in documents.items():
        words = parse_html_simple(doc)
        parsed = [word for word in words
                  if (word in self.english_words
                      and word not in self.stop_words
                      and word in self.vocabulary)]
        string_data.append(' '.join(parsed))

    # vectorize the parsed text and classify each document
    count_data = self.vectorizer.transform(string_data)
    probs = self.classifier.predict_proba(count_data)
    return probs
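# A sketch of calling classify_url, assuming the surrounding class has been
# instantiated and fit elsewhere (the instance name `clf`, the domain, and
# the page below are illustrative assumptions, not part of the original code):
#
#     probs = clf.classify_url('https://en.wikipedia.org',
#                              '/wiki/Machine_learning', depth=1)
#     best = probs.argmax(axis=1)  # most probable class index per crawled page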
def call_scraper(args):
    """Unpack a (domain, link) pair and crawl that link one level deep."""
    domain, link = args
    return crawl_page(domain, link, href_match='/wiki/', depth=1)
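# The single-tuple signature of call_scraper is the shape expected by
# multiprocessing's Pool.map, so a sketch of fanning the scraper out over
# several links might look like the block below. The domain, the links, and
# the pool size are illustrative assumptions, not part of the original code.

if __name__ == '__main__':
    from multiprocessing import Pool

    domain = 'https://en.wikipedia.org'  # assumed example domain
    links = ['/wiki/Python_(programming_language)',
             '/wiki/Web_scraping']       # assumed example links

    # each worker receives one (domain, link) tuple and returns its crawl result
    with Pool(processes=2) as pool:
        results = pool.map(call_scraper, [(domain, link) for link in links])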