示例#1
0
    def crawl(self, crawlCandidate):
        """Fetch, parse and extract an ``Article`` for ``crawlCandidate``.

        The method cleans the candidate URL, obtains the raw HTML, parses it
        into a document, fills in title/meta/tag fields through the
        extractor, cleans the document, selects the top content node and,
        when one is found, optionally picks a top image and formats the
        article text.

        :param crawlCandidate: object whose ``url`` attribute is the page to
            crawl; it is also passed on to ``self.getHTML``.
        :returns: the populated ``Article``, or a blank ``Article`` when
            ``self.getHTML`` returns ``None``.
        """
        article = Article()

        parseCandidate = URLHelper.getCleanedUrl(crawlCandidate.url)
        rawHtml = self.getHTML(crawlCandidate, parseCandidate)

        if rawHtml is None:
            # nothing was fetched, so there is nothing to extract or release
            return article

        doc = self.getDocument(parseCandidate.url, rawHtml)

        extractor = self.getExtractor()
        docCleaner = self.getDocCleaner()
        outputFormatter = self.getOutputFormatter()

        # article: identity fields are set before the guarded section so the
        # cleanup in ``finally`` always sees them
        article.finalUrl = parseCandidate.url
        article.linkhash = parseCandidate.linkhash
        article.rawHtml = rawHtml
        article.doc = doc
        # keep an untouched copy; the image extractor below works on it
        article.rawDoc = deepcopy(doc)

        try:
            article.title = extractor.getTitle(article)
            # TODO: publish date and additional-data extraction are not
            # wired in yet
            article.metaLang = extractor.getMetaLang(article)
            article.metaFavicon = extractor.getMetaFavicon(article)
            article.metaDescription = extractor.getMetaDescription(article)
            article.metaKeywords = extractor.getMetaKeywords(article)
            article.canonicalLink = extractor.getCanonicalLink(article)
            article.domain = extractor.getDomain(article.finalUrl)
            article.tags = extractor.extractTags(article)
            # before we do any calcs on the body itself let's clean up the
            # document
            article.doc = docCleaner.clean(article)

            # big stuff
            article.topNode = extractor.calculateBestNodeBasedOnClustering(
                article)
            if article.topNode is not None:
                # TODO: movies (video extraction) are not wired in yet
                if self.config.enableImageFetching:
                    imageExtractor = self.getImageExtractor(article)
                    article.topImage = imageExtractor.getBestImage(
                        article.rawDoc, article.topNode)

                article.topNode = extractor.postExtractionCleanup(
                    article.topNode)
                article.cleanedArticleText = outputFormatter.getFormattedText(
                    article)
                # mark html element
                article.topNode.attrib['rel'] = 'topnode'
            article.h1 = extractor.getH1(article)
        finally:
            # cleanup tmp file -- done in ``finally`` so a failing extraction
            # step cannot skip it; the exception itself still propagates
            self.releaseResources(article)

        return article
示例#2
0
    def crawl(self, crawl_candidate):
        """Fetch, parse and extract an ``Article`` for ``crawl_candidate``.

        The method cleans the candidate URL, obtains the raw HTML, parses it
        into a document, fills in title/meta/tag fields through the
        extractor, cleans the document, selects the top content node and,
        when one is found, optionally picks a top image and formats the
        article text.

        :param crawl_candidate: object whose ``url`` attribute is the page to
            crawl; it is also passed on to ``self.get_html``.
        :returns: the populated ``Article``, or a blank ``Article`` when
            ``self.get_html`` returns ``None``.
        """
        article = Article()

        parse_candidate = URLHelper.getCleanedUrl(crawl_candidate.url)
        raw_html = self.get_html(crawl_candidate, parse_candidate)

        if raw_html is None:
            # nothing was fetched, so there is nothing to extract or release
            return article

        doc = self.get_document(parse_candidate.url, raw_html)

        extractor = self.get_extractor()
        document_cleaner = self.get_document_cleaner()
        output_formatter = self.get_output_formatter()

        # article: identity fields are set before the guarded section so the
        # cleanup in ``finally`` always sees them
        article.final_url = parse_candidate.url
        article.link_hash = parse_candidate.link_hash
        article.raw_html = raw_html
        article.doc = doc
        # keep an untouched copy; the image extractor below works on it
        article.raw_doc = deepcopy(doc)

        try:
            article.title = extractor.get_title(article)
            # TODO: publish date and additional-data extraction are not
            # wired in yet
            article.meta_lang = extractor.get_meta_lang(article)
            article.meta_favicon = extractor.get_favicon(article)
            article.meta_description = extractor.get_meta_description(article)
            article.meta_keywords = extractor.get_meta_keywords(article)
            article.canonical_link = extractor.get_canonical_link(article)
            article.domain = extractor.get_domain(article.final_url)
            article.tags = extractor.extract_tags(article)
            # before we do any calcs on the body itself let's clean up the
            # document
            article.doc = document_cleaner.clean(article)

            # big stuff
            article.top_node = extractor.calculate_best_node(article)
            if article.top_node is not None:
                # TODO: movies (video extraction) are not wired in yet
                if self.config.enable_image_fetching:
                    image_extractor = self.get_image_extractor(article)
                    article.top_image = image_extractor.get_best_image(
                        article.raw_doc, article.top_node)

                article.top_node = extractor.post_cleanup(article.top_node)
                article.cleaned_text = output_formatter.get_formatted_text(
                    article)
        finally:
            # cleanup tmp file -- done in ``finally`` so a failing extraction
            # step cannot skip it; the exception itself still propagates.
            # NOTE(review): the "relase" spelling is kept on purpose; the
            # method is defined outside this block under that name.
            self.relase_resources(article)

        return article
示例#3
0
    def crawl(self, crawlCandidate):
        """Turn ``crawlCandidate`` into an ``Article``.

        Fetches the HTML for the candidate's cleaned URL, parses it, copies
        the extractor's title/meta/tag results onto the article, cleans the
        document and, if a top node is found, optionally selects a top image
        and produces the formatted article text.

        :param crawlCandidate: object exposing ``url``; also forwarded to
            ``self.getHTML``.
        :returns: the ``Article``; left blank when no HTML could be obtained.
        """
        article = Article()

        candidate = URLHelper.getCleanedUrl(crawlCandidate.url)
        html = self.getHTML(crawlCandidate, candidate)
        if html is None:
            # no markup to work with: hand back the untouched article
            return article

        parsed = self.getDocument(candidate.url, html)

        extractor = self.getExtractor()
        cleaner = self.getDocCleaner()
        formatter = self.getOutputFormatter()

        # --- basic article fields -------------------------------------
        article.finalUrl = candidate.url
        article.linkhash = candidate.linkhash
        article.rawHtml = html
        article.doc = parsed
        # pristine copy, later handed to the image extractor
        article.rawDoc = deepcopy(parsed)

        # --- metadata pulled by the extractor -------------------------
        # TODO: publish date and additional-data extraction are not wired
        # in yet
        article.title = extractor.getTitle(article)
        article.metaLang = extractor.getMetaLang(article)
        article.metaFavicon = extractor.getMetaFavicon(article)
        article.metaDescription = extractor.getMetaDescription(article)
        article.metaKeywords = extractor.getMetaKeywords(article)
        article.canonicalLink = extractor.getCanonicalLink(article)
        article.domain = extractor.getDomain(article.finalUrl)
        article.tags = extractor.extractTags(article)

        # clean the document before any body-level calculation
        article.doc = cleaner.clean(article)

        # --- main content ---------------------------------------------
        article.topNode = extractor.calculateBestNodeBasedOnClustering(article)
        if article.topNode is None:
            return article

        # TODO: video extraction is not wired in yet
        if self.config.enableImageFetching:
            images = self.getImageExtractor(article)
            article.topImage = images.getBestImage(article.rawDoc,
                                                   article.topNode)

        article.topNode = extractor.postExtractionCleanup(article.topNode)
        article.cleanedArticleText = formatter.getFormattedText(article)
        return article
示例#4
0
    def crawl(self, crawlCandidate):
        """Turn ``crawlCandidate`` into an ``Article``.

        Fetches and parses the HTML for the candidate's cleaned URL, copies
        the extractor's title/meta/tag results onto the article and cleans
        the document. When a top node is found, image selection runs if
        ``config.enableImageFetching`` is set and body clean-up plus text
        formatting run if ``config.enableBodyAnalysis`` is set.

        :param crawlCandidate: object exposing ``url``; also forwarded to
            ``self.getHTML``.
        :returns: the ``Article``; left blank when no HTML could be obtained.
        """
        article = Article()

        candidate = URLHelper.getCleanedUrl(crawlCandidate.url)
        html = self.getHTML(crawlCandidate, candidate)
        if html is None:
            # no markup to work with: hand back the untouched article
            return article

        parsed = self.getDocument(candidate.url, html)

        extractor = self.getExtractor()
        cleaner = self.getDocCleaner()
        formatter = self.getOutputFormatter()

        # basic article fields
        article.finalUrl = candidate.url
        article.linkhash = candidate.linkhash
        article.rawHtml = html
        article.doc = parsed
        # pristine copy, later handed to the image extractor
        article.rawDoc = deepcopy(parsed)

        # metadata pulled by the extractor
        article.title = extractor.getTitle(article)
        article.metaLang = extractor.getMetaLang(article)
        article.metaFavicon = extractor.getMetaFavicon(article)
        article.metaDescription = extractor.getMetaDescription(article)
        article.metaKeywords = extractor.getMetaKeywords(article)
        article.canonicalLink = extractor.getCanonicalLink(article)
        article.domain = extractor.getDomain(article.finalUrl)
        article.tags = extractor.extractTags(article)

        # the document is cleaned unconditionally, whatever the config says
        article.doc = cleaner.clean(article)

        # main content
        article.topNode = extractor.calculateBestNodeBasedOnClustering(article)
        if article.topNode is None:
            return article

        wanted = (self.config.enableImageFetching,
                  self.config.enableBodyAnalysis)
        if not any(wanted):
            # neither optional stage is switched on
            return article

        if self.config.enableImageFetching:
            images = self.getImageExtractor(article)
            article.topImage = images.getBestImage(article.rawDoc,
                                                   article.topNode)

        if self.config.enableBodyAnalysis:
            article.topNode = extractor.postExtractionCleanup(article.topNode)
            # note: the formatter receives the top node here, not the article
            article.cleanedArticleText = formatter.getFormattedText(
                article.topNode)

        return article
示例#5
0
    def crawl(self, crawlCandidate):
        """Build and return an ``Article`` for ``crawlCandidate``.

        Stages, in order: clean the URL, fetch the HTML, parse it, record
        the basic fields, run the extractor's title/meta/tag getters, clean
        the document, pick the top node. With a top node present, the image
        stage is gated by ``config.enableImageFetching`` and the body stage
        (post-extraction clean-up and text formatting) by
        ``config.enableBodyAnalysis``.

        :param crawlCandidate: object exposing ``url``; also forwarded to
            ``self.getHTML``.
        :returns: the ``Article``; left blank when ``self.getHTML`` yields
            ``None``.
        """
        article = Article()

        cleanedUrl = URLHelper.getCleanedUrl(crawlCandidate.url)
        markup = self.getHTML(crawlCandidate, cleanedUrl)
        if markup is None:
            return article

        tree = self.getDocument(cleanedUrl.url, markup)

        extractor = self.getExtractor()
        cleaner = self.getDocCleaner()
        formatter = self.getOutputFormatter()

        # basic article fields
        article.finalUrl = cleanedUrl.url
        article.linkhash = cleanedUrl.linkhash
        article.rawHtml = markup
        article.doc = tree
        article.rawDoc = deepcopy(tree)  # untouched copy for image lookup

        # extractor-provided metadata
        article.title = extractor.getTitle(article)
        article.metaLang = extractor.getMetaLang(article)
        article.metaFavicon = extractor.getMetaFavicon(article)
        article.metaDescription = extractor.getMetaDescription(article)
        article.metaKeywords = extractor.getMetaKeywords(article)
        article.canonicalLink = extractor.getCanonicalLink(article)
        article.domain = extractor.getDomain(article.finalUrl)
        article.tags = extractor.extractTags(article)

        # cleaning always runs; it is not tied to any config switch
        article.doc = cleaner.clean(article)

        # main content
        article.topNode = extractor.calculateBestNodeBasedOnClustering(article)
        hasTopNode = article.topNode is not None
        if hasTopNode and any([self.config.enableImageFetching,
                               self.config.enableBodyAnalysis]):
            if self.config.enableImageFetching:
                picker = self.getImageExtractor(article)
                article.topImage = picker.getBestImage(article.rawDoc,
                                                       article.topNode)

            if self.config.enableBodyAnalysis:
                article.topNode = extractor.postExtractionCleanup(
                    article.topNode)
                # the formatter is given the top node, not the article
                article.cleanedArticleText = formatter.getFormattedText(
                    article.topNode)

        return article
示例#6
0
 def get_parse_candidate(self, crawl_candidate):
     """Return the parsing candidate for ``crawl_candidate``.

     ``RawHelper`` is used when the candidate carries truthy ``raw_html``;
     otherwise ``URLHelper`` builds the candidate from the URL alone.
     """
     html = crawl_candidate.raw_html
     if not html:
         return URLHelper.get_parsing_candidate(crawl_candidate.url)
     return RawHelper.get_parsing_candidate(crawl_candidate.url, html)
 def get_parse_candidate(self, crawl_candidate):
     """Pick the helper that builds the parsing candidate.

     A candidate with truthy ``raw_html`` goes through ``RawHelper`` (URL
     plus HTML); any other candidate goes through ``URLHelper`` (URL only).
     """
     if not crawl_candidate.raw_html:
         # no pre-supplied markup: build the candidate from the URL
         return URLHelper.get_parsing_candidate(crawl_candidate.url)
     return RawHelper.get_parsing_candidate(
         crawl_candidate.url, crawl_candidate.raw_html)