Example #1
    def classify_url(self, domain, page, depth=0):
        """
        Classify the documents after crawling them.

        args:
            domain - the domain part of the url
            page - the other part of the url
            depth - how deep to crawl

        returns:
            a list of predicted probabilities for each instance belonging to
            each class
        """
        # get the documents
        documents, _ = crawl_page(domain, page, depth=depth)

        # parse the documents, keeping only recognized English words that
        # are in the model's vocabulary and are not stop words
        string_data = []
        for doc in documents.values():
            words = parse_html_simple(doc)
            parsed = []
            for word in words:
                if (word in self.english_words
                        and word not in self.stop_words
                        and word in self.vocabulary):
                    parsed.append(word)
            string_data.append(' '.join(parsed))

        count_data = self.vectorizer.transform(string_data)

        # classify the documents
        probs = self.classifier.predict_proba(count_data)
        return probs
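
A short usage sketch, not from the source: assuming the surrounding class has already been built with a fitted vectorizer and classifier plus its vocabulary, English word list, and stop words, classify_url can be called directly. The class name DocumentClassifier, its constructor call, and the URL parts below are hypothetical.

# DocumentClassifier is a hypothetical name for the class that defines
# classify_url; its vectorizer and classifier are assumed to be fitted.
clf = DocumentClassifier()
probs = clf.classify_url('https://en.wikipedia.org', '/wiki/Web_scraping', depth=1)
# predict_proba yields one row of class probabilities per crawled document
print(probs)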
Example #2
def call_scraper(args):
    # unpack the single (domain, link) argument
    domain, link = args

    return crawl_page(domain, link, href_match='/wiki/', depth=1)
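
A plausible usage sketch, not confirmed by the source: a wrapper that accepts a single args tuple is the usual shape for multiprocessing.Pool.map, which hands each worker exactly one object per task. The URLs below are hypothetical.

import multiprocessing

if __name__ == '__main__':
    tasks = [
        ('https://en.wikipedia.org', '/wiki/Web_scraping'),
        ('https://en.wikipedia.org', '/wiki/Text_classification'),
    ]
    # each worker invokes crawl_page via call_scraper on one (domain, link) pair
    with multiprocessing.Pool(processes=2) as pool:
        results = pool.map(call_scraper, tasks)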