Example #1
def test_content_and_content_comments_extractor_blocks(html):
    """
    The content and content/comments extractor should return proper blocks
    """
    content = extract_content(html, as_blocks=True)
    content_comments = extract_comments(html, as_blocks=True)

    passed_content = False
    passed_content_comments = False
    for i in range(5):
        # actual_content, actual_content_comments = \
        #     content_and_content_comments_extractor.analyze(
        #         html, blocks=True)
        actual_content = extract_content(html, as_blocks=True)
        actual_content_comments = extract_comments(html, as_blocks=True)
        passed_content = ([blk.text for blk in actual_content]
                          == [blk.text for blk in content])
        passed_content_comments = ([blk.text for blk in actual_content_comments]
                                   == [blk.text for blk in content_comments])
        if passed_content and passed_content_comments:
            break

    assert passed_content
    assert passed_content_comments
Example #2
 def test_content_and_content_comments_extractor_blocks(self):
     '''
     The content and content/comments extractor should return proper blocks
     '''
     content = extract_content(self._html, as_blocks=True)
     content_comments = extract_comments(self._html, as_blocks=True)
 
     passed_content = False
     passed_content_comments = False
     for i in range(5):
         # actual_content, actual_content_comments = \
         #     content_and_content_comments_extractor.analyze(
         #         self._html, blocks=True)
         actual_content = extract_content(self._html, as_blocks=True)
         actual_content_comments = extract_comments(self._html, as_blocks=True)
         passed_content = (
             [blk.text for blk in actual_content] ==
             [blk.text for blk in content]
             )
         passed_content_comments = (
             [blk.text for blk in actual_content_comments] ==
             [blk.text for blk in content_comments]
             )
         if passed_content and passed_content_comments:
             break
 
     self.assertTrue(passed_content)
     self.assertTrue(passed_content_comments)
Example #3
def worker(payload):
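    # payload is a 6-tuple; only the line number, file path, encoding and
    # optional pre-read content are used here, the other two fields are ignored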
    line, _, path, encoding, content, _ = payload

    if not is_supported_encoding(encoding):
        return UnknownEncodingError('Unknown encoding: "%s"' %
                                    encoding), line, None

    # Reading file
    if content is None:
        try:
            if path.endswith('.gz'):
                with open(path, 'rb') as f:
                    raw_html_bytes = gzip.decompress(f.read())

                raw_html = raw_html_bytes.decode(encoding, errors='replace')
            else:
                with codecs.open(path,
                                 'r',
                                 encoding=encoding,
                                 errors='replace') as f:
                    raw_html = f.read()
        except UnicodeDecodeError as e:
            return e, line, None
    else:
        raw_html = content

    # Attempting extraction
    try:
        with warnings.catch_warnings():
            warnings.simplefilter('ignore')
            content = extract_content(raw_html)
    except BaseException as e:
        return e, line, None

    return None, line, content
Example #4
def test_content_and_content_comments_extractor(html):
    content = extract_content(html)
    content_comments = extract_comments(html)
    passed_content = False
    passed_content_comments = False
    for i in range(10):
        # actual_content, actual_content_comments = \
        #     extract_content_and_comments(html)
        actual_content = extract_content(html)
        actual_content_comments = extract_comments(html)
        passed_content = actual_content == content
        passed_content_comments = (actual_content_comments == content_comments)
        if passed_content and passed_content_comments:
            break

    assert passed_content
    assert passed_content_comments
Example #5
    def process_item(self, item, _):
        text = extract_content(item['html'])

        if text:
            item['text'] = text
        else:
            raise DropItem

        return item
Example #6
 def process_one(self, content):
     try:
         soup = BeautifulSoup(content, 'html.parser')
         title = soup.title.string if soup.title else ''
         f_content = extract_content(content)
         return ContentResult(title, f_content)
     except Exception as e:
         logger.error(f"Dragnet failed on {content.title} with error {e}")
         return ContentResult('', '')
Example #7
 def test_content_and_content_comments_extractor(self):
     content = extract_content(self._html)
     content_comments = extract_comments(self._html)
 
     passed_content = False
     passed_content_comments = False
     for i in range(10):
         # actual_content, actual_content_comments = \
         #     extract_content_and_comments(self._html)
         actual_content = extract_content(self._html)
         actual_content_comments = extract_comments(self._html)
         passed_content = actual_content == content
         passed_content_comments = (
             actual_content_comments == content_comments)
         if passed_content and passed_content_comments:
             break
 
     self.assertTrue(passed_content)
     self.assertTrue(passed_content_comments)
Example #8
 def process_item(self, item, spider):
     fullHTML = item['content']
     content = extract_content(fullHTML)
     item['content'] = (unicodedata.normalize("NFKD", content)
                        .replace("\n", " ").replace("\t", "").replace("  ", " "))
     item['link_text'] = (unicodedata.normalize("NFKD", item['link_text'])
                          .replace("\n", " ").replace("\t", "").replace("  ", " "))
     return item
Example #9
def do_request(url):
    try:
        requests.head(url, verify=False, timeout=10, headers=headers)
    except Exception:
        return "", 404

    try:
        res = requests.get(url, verify=False, timeout=10, headers=headers)
        content = extract_content(res.content)
        return content, res.status_code
    except Exception:
        return "", 404
Example #10
def main():
    output = {}
    for path in Path('html').glob('*.html.gz'):
        with gzip.open(path, 'rt', encoding='utf8') as f:
            html = f.read()
        item_id = path.stem.split('.')[0]
        content = extract_content(html, encoding='utf8')
        output[item_id] = {'articleBody': content}
    (Path('output') / 'dragnet.json').write_text(
        json.dumps(output, sort_keys=True, ensure_ascii=False, indent=4),
        encoding='utf8')
Example #11
def HTML_to_content(filename):
    """creates a txt file in /data/contentfiles/ with the main content of the HTML file"""
    if not os.path.isfile(
            os.path.join(CONTENT_FILES_PATH, filename[:-5] + '.txt')):
        with open(os.path.join(HTML_FILES_PATH, filename)) as f:
            html_string = f.read()
            try:
                dragnet_result = dragnet.extract_content(html_string)
            except Exception as e:
                print('Dragnet extraction error:', e)
                dragnet_result = 'Dragnet extraction error'

        with open(os.path.join(CONTENT_FILES_PATH, filename[:-5] + '.txt'),
                  'w', encoding='utf-8') as result:
            result.write(dragnet_result)
Example #12
    def process_item(self, response):

        item = MyScraperItem()

        item['url'] = response.url
        item['link_text'] = response.meta['link_text']
        item['company'] = response.meta['company']
        item['content'] = response.body
        item['keywords'] = response.meta['keywords']

        fullHTML = item['content']
        content = extract_content(fullHTML)
        item['content'] = (unicodedata.normalize("NFKD", content)
                           .replace("\n", " ").replace("\t", "").replace("  ", " "))
        item['link_text'] = (unicodedata.normalize("NFKD", item['link_text'])
                             .replace("\n", " ").replace("\t", "").replace("  ", " "))

        valid = True
        if not item['url']:
            valid = False
            raise DropItem("Missing url!")

        if not item['company']:
            valid = False
            raise DropItem("Missing company!")

        if item['content'] == '':
            valid = False
            raise DropItem("empty content")

        # save to local mongodb database
        if valid:
            res_dict = {key: item[key] for key in item}

            connection = pymongo.MongoClient(port=27017,
                                             username='******',
                                             password='******',
                                             authSource="admin")
            db = connection['admin']
            collection = db[item['company']]
            collection.insert_one(res_dict)
            connection.close()

        return item
Example #13
 def parse_news_text(self, page_html: str, url: str) -> dict:
     news_text = re.sub(r'\s+', r' ',
                        extract_content(page_html, encoding='utf-8'))
     return {'url': url, 'text': news_text}
Example #14
def benchmark(extract_size=800):
    """Picks a random html file and prints an extract of the result of each method"""
    random_file = random_html_file()
    with open(join(DATA_PATH, random_file), 'r') as f:
        html_string = f.read()

        # GOOSE
        try:
            g = Goose({
                'browser_user_agent':
                'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0',
                'enable_image_fetching': False
            })
            goose_article = g.extract(raw_html=html_string)
            goose_result = goose_article.cleaned_text
        except:
            goose_result = '    Goose error.'

        # EATIHT
        try:
            eatiht_result = eatiht.extract(html_string)
        except:
            eatiht_result = '   Eatiht error.'

        # DRAGNET

        try:
            dragnet_result = dragnet.extract_content(html_string)
        except Exception as e:
            dragnet_result = '  Dragnet error: ' + str(e)

        # LIBEXTRACT

        try:
            textnodes = list(libextract.api.extract(html_string))
            libextract_result = textnodes[0].text_content()
        except:
            libextract_result = '   Libextract error.'

        # BOILERPIPE (CanolaExtractor)

        try:
            extractor = Extractor(extractor='CanolaExtractor',
                                  html=html_string)
            boilerpipe_result = extractor.getText()
        except:
            boilerpipe_result = '   Boilerpipe error.'

        # NEWSPAPER

        try:
            article = Article('url')
            article.download(input_html=html_string)
            article.parse()
            print('Auteurs:', article.authors)
            print('Date de publication:', article.publish_date)
            newspaper_result = article.text
        except:
            newspaper_result = '   Newspaper error.'

        # JUSTEXT

        try:
            paragraphs = justext.justext(html_string,
                                         justext.get_stoplist("French"))
            print('PARAGRAPHS')
            for p in paragraphs:
                if not p.is_boilerplate:
                    print(p.text)
            justext_result = '\n'.join(paragraph.text
                                       for paragraph in paragraphs
                                       if not paragraph.is_boilerplate)
            print('JUSTEXT_RESULT', justext_result)

        except Exception as e:
            justext_result = '   Justext error: ' + str(e)
            print(justext_result)

        # Results

        try:
            # finds the url associated with the file in a "filename-url" csv
            with open('./data/urls.csv', 'r') as csvfile:

                urls = dict((line['id'], line['url'])
                            for line in csv.DictReader(csvfile))
                url = urls[random_file[:-5]]

            print('\n\n >>> URL n.' + random_file[:-5] + ' : ' + url)
        except:
            url = ''
            print(
                '\n\n (URL of the html file not found. To print the associated URL, please provide a urls.csv file featuring filename & url in /data)'
            )
        # webbrowser.open(url, autoraise=False)
        path = abspath('temp.html')
        local_url = 'file://' + path
        with open(path, 'w') as f:
            f.write(html_string)
        webbrowser.open(local_url)

        # print('\n\n     /// GOOSE /// \n')
        # print(goose_result[:extract_size])
        # print('\n\n     /// EATIHT /// \n')
        # print(eatiht_result[:extract_size])
        print('\n ------ [[DRAGNET]] ------', len(dragnet_result),
              'caractères\n')
        print(dragnet_result[:extract_size] + '\n...\n' +
              dragnet_result[-extract_size:])
        print('\n ------ [[NEWSPAPER]] ------', len(newspaper_result),
              'caractères\n')
        print(newspaper_result[:extract_size] + '\n...\n' +
              newspaper_result[-extract_size:])
        print('\n ------ [[JUSTEXT]] ------', len(justext_result),
              'caractères\n')
        print(justext_result[:extract_size] + '\n...\n' +
              justext_result[-extract_size:])
        # print('\n\n     /// LIBEXTRACT /// \n')
        # print(libextract_result[:extract_size])
        # print('\n\n     /// BOILERPIPE (CanolaExtractor) /// \n\n')
        # print(boilerpipe_result[:extract_size])
        # print('\n\n')
        return url
Example #15
 def scraper(self, html, link):
     text = extract_content(html)
     if "cnbc" in link:  # in this case content is extracted also in the comments
         text += " " + extract_comments(html)
     text = text.split("disclaimer")[0]
     return text
Example #16
url = 'https://tech.sina.com.cn/i/2019-04-29/doc-ihvhiqax5802337.shtml'

response = requests.get(url)
htmlcode = response.content

readability = Readability(htmlcode, url)

print(readability.title)
print(readability.content)

q.d()

s_html = readability.content

from dragnet import extract_content, extract_content_and_comments

print(s_html)
q.d()

content = extract_content(s_html)

print(content)
q.d()

# get article and comments
content_comments = extract_content_and_comments(s_html)

print(content_comments)

q.d()
Example #17
def get_webpage(url):
    r = requests.get(url)
    content = extract_content(r.content)
    return content
Example #18
        try:
            if result.language == 'ja':
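                # Japanese pages: run the dedicated extractor, then strip poster
                # names, IDs, URLs, timestamps, quote markers and leftover tags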
                extractor.analyse(content)
                text, title = extractor.as_text()
                text = re.sub(r'名前:[^\s]+', '', text)
                text = re.sub(r'ID:[^\s]+', '', text)
                text = re.sub(r'https?:[^\s]+', '', text)
                text = re.sub(r'[0-9]+ +[0-9]+:[0-9]+:[0-9]+\.[0-9]+', '', text)
                text = re.sub(r'[<<>>]+\s?[0-9\s]+', '', text)
                text = re.sub(r'引用元:', '', text)
                text = re.sub(
                    r'[0-9]+\s+:[0-9]+\/[0-9]+\/[0-9]+.+?[0-9]+:[0-9]+:[0-9]+\.[0-9]+',
                    '', text)
                text = re.sub(r'<[^>]+>', '', text)
            else:
                text = extract_content(content)
        except Exception as e:
            print(e)

        if not text:
            text = result.summary
        extracted_content = result.title.strip() + " " + text.strip()

        # extract main image
        bs = BeautifulSoup(content, "lxml")
        max = 0
        primary_image_url = None

        for image_url in bs.find_all('img'):
            src = image_url.get('src')
Example #19
    def parse_article(self, response):
        news_id = 19684  #response.meta.get('news_id')

        # save to file
        with open(str(news_id) + '.html', 'wb') as fh:
            fh.write(response.body)
        article = Article(response.url)
        # set html manually
        with open(str(news_id) + '.html', 'rb') as fh:
            article.html = fh.read()
        os.remove(str(news_id) + '.html')
        # need to set download_state to 2 for this to work
        article.download_state = 2
        article.parse()
        article.nlp()
        date = article.publish_date
        keywords = str([x.replace("'", "''")
                        for x in article.keywords]).replace('"', '\'')
        content = article.text.replace("'", "''")
        summary = article.summary.replace("'", "''")
        title = article.title.replace("'", "''")
        if date is None:
            date = 'null'
        else:
            date = "'" + str(date) + "'"
        authors = str([x.replace("'", "''")
                       for x in article.authors]).replace('"', '\'')
        tags = str([x.replace("'", "''")
                    for x in article.meta_keywords]).replace('"', '\'')

        dbconnector.execute(
            self.conn,
            'INSERT INTO "ParsedNews-newspaper"("IDNews", "Date", "Content", "Keywords", '
            + '"Summary", "Authors", "Tags", "Title") ' + 'VALUES (' +
            str(news_id) + ', ' + str(date) + ', \'' + content + '\', ARRAY ' +
            str(keywords) + '::text[], \'' + summary + '\', ARRAY ' +
            str(authors) + '::text[], ARRAY ' + str(tags) + '::text[], \'' +
            title + '\')')

        # get main article without comments
        content = extract_content(response.text).replace("'", "''")

        # get article and comments
        content_comments = '[\'' + extract_content_and_comments(
            response.text).replace("'", "''") + '\']'

        dbconnector.execute(
            self.conn,
            'INSERT INTO "ParsedNews-dragnet"("IDNews", "Content", "Comments") '
            + 'VALUES (' + str(news_id) + ', \'' + content + '\', ARRAY ' +
            str(content_comments) + '::text[])')

        date = articleDateExtractor.extractArticlePublishedDate(
            articleLink=response.url, html=response.text)
        if date is not None:
            dbconnector.execute(
                self.conn, 'INSERT INTO "ParsedNews-ade"("IDNews", "Date") ' +
                'VALUES (' + str(news_id) + ', \'' + str(date) + '\')')

        g = Goose()
        article = g.extract(raw_html=response.text)
        date = article.publish_datetime_utc
        keywords = str([x.replace("'", "''")
                        for x in article.tags]).replace('"', '\'')
        content = article.cleaned_text.replace("'", "''")
        summary = article.meta_description.replace("'", "''")
        title = article.title.replace("'", "''")
        if date is None:
            date = 'null'
        else:
            date = "'" + str(date) + "'"
        authors = str([x.replace("'", "''")
                       for x in article.authors]).replace('"', '\'')
        tags = str([
            x.replace("'", "''") for x in article.meta_keywords.split(",")
        ]).replace('"', '\'')
        tweets = str([x.replace("'", "''")
                      for x in article.tweets]).replace('"', '\'')

        dbconnector.execute(
            self.conn, 'INSERT INTO "ParsedNews-goose"(' +
            '"IDNews", "Date", "Content", "Keywords", "Summary", ' +
            '"Authors", "Tags", "Tweets",' + '"Title") VALUES (' +
            str(news_id) + ', ' + date + ', \'' + content + '\', ARRAY ' +
            str(keywords) + '::text[], \'' + str(summary) + '\', ARRAY ' +
            str(authors) + '::text[], ARRAY ' + str(tags) +
            '::text[], ARRAY ' + str(tweets) + '::text[], \'' + str(title) +
            '\')')

        pass
Example #20
    def save_snapshot(self):
        try:
            r = requests.get(self.url)
        except (
            requests.exceptions.SSLError,
            requests.exceptions.ConnectionError,
            requests.exceptions.ReadTimeout,
        ) as e:
            print(e)
            return None

        snapshot = {
            "bookmark": self,
            "content": r.text,
            "headers_json": json.dumps(
                {item[0]: item[1] for item in r.headers.items()}
            ),
            "status_code": r.status_code,
        }

        try:
            ogp = LaterOpenGraph(html=r.text)
            snapshot["opengraph_json"] = ogp.to_json()
        except AttributeError:
            print("OpenGraph Error")
            pass

        try:
            snapshot["parsed_content"] = extract_content(r.text)
        except BlockifyError:
            print("dragnet extract_content: BlockifyError")
            snapshot["parsed_content"] = ""
            pass

        try:
            tags = favicon_tags(self.url, r.text)
            tags = sorted(tags, key=lambda i: i.width + i.height, reverse=True)
            snapshot["favicon"] = tags[0].url
            print(snapshot["favicon"])
        except IndexError:
            print("No Favicon Found")
            pass

        try:
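            # Rank keywords in the extracted text with TextRank and keep the top nine as tags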
            tr4w = TextRank4Keyword()
            tr4w.analyze(snapshot["parsed_content"])
            keywords_weighted = tr4w.node_weight.items()
            keywords_sorted = sorted(
                keywords_weighted, key=lambda item: item[1], reverse=True
            )
            tags = [k.lower() for (k, v) in keywords_sorted if len(k) < 100][:9]
            self.tags.add(*tags)
        except MemoryError:
            print("MemoryError while parsing keywords")
            pass

        # If the bookmark does not yet have a title, grab it from the document title
        if not self.title:
            try:
                parser = etree.XMLParser(recover=True)
                document = etree.fromstring(r.text, parser)
                self.title = document.find(".//title").text
                self.save()
            except ValueError:
                print("Error parsing document...")
                pass
            except AttributeError:
                print("No title tag found...")
                pass

        # If we still don't have a title, grab it from the opengraph tags
        if not self.title and ogp.get("title"):
            self.title = ogp.get("title")
            self.save()

        return Snapshot.objects.create(**snapshot)
Example #21
def run_dragnet(htmlstring):
    '''try with the dragnet module'''
    content = extract_content(htmlstring)
    return content  # sanitize(content)
Example #22
 def parse_keywords(self, response):
     item = Content()
     item["object_id"] = self.object_id
     item["content"] = extract_content(response.body)
     return item
Example #23
import requests
from dragnet import extract_content, extract_content_and_comments
import q
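# q.d() pauses execution in an interactive console (from the q debugging library)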

# fetch HTML
# url = 'https://moz.com/devblog/dragnet-content-extraction-from-diverse-feature-sets/'
url = 'https://tech.sina.com.cn/i/2019-04-29/doc-ihvhiqax5802337.shtml'
r = requests.get(url)

# get main article without comments
content = extract_content(r.content)

print(content)
q.d()

# get article and comments
content_comments = extract_content_and_comments(r.content)

print(content_comments)

q.d()
Example #24
    def process_item(self, item, spider):
        fullHTML = item['content']
        content = extract_content(fullHTML)
        item['content'] = content

        return item
Example #25
def dragnet_extract_content():
    return extract_content(request.data.decode('utf-8'))
Example #26
 def extract(self, html):
     if self.warm is False:
         dragnet.extract_content("test")
         print("Warmed Dragnet model")
         self.warm = True
     return dragnet.extract_content(html)
Example #27
def results():
    # form = request.form
    if request.method == 'POST':
        # write your function that loads the model
        # model = get_model() #you can use pickle to load the trained model
        # model = pickle.load(open('model.pkl', 'rb'))

        # Extract the content
        url = request.form['url']
        r = requests.get(url)
        content = extract_content(r.content)
        # text = content.split('\n')[0] + content.split('\n')[1]  ## get the first and second sentence

        # Extract the headline
        headline = headline_func(url)

        # merge the headline and the first sentence
        text = headline + " " + content.split('\n')[0]

        # pass the text into preprocessing function
        preprocessed_text = text_prep(text)

        # predict category
        predicted = cat_prediction(preprocessed_text)[0]
        predicted = category.get(predicted)

        # #Predicting the topics for a document
        doc = preprocessed_text.split()
        doc_vector = lda_model.id2word.doc2bow(doc)
        doc_topics = lda_model[doc_vector]
        sorted_by_prob = sorted(doc_topics,
                                key=lambda tup: tup[1],
                                reverse=True)

        # return render_template('resultsform.html', text=text, predicted_category=predicted)
        # return Response()

        # #Sentiment of the News by Vader
        text_series = pd.Series(preprocessed_text)
        score = text_series.apply(
            lambda t: analyser.polarity_scores(t)['compound'])
        sentiment_vader = ('positive' if score[0] > 0
                           else 'negative' if score[0] < 0 else 'neutral')

        # Sentiment by textblob: PatternAnalyzer
        blob = TextBlob(text)
        pol = blob.sentences[0].sentiment.polarity
        print(text)

        return jsonify({
            'data': {
                "category": predicted,
                "url": url,
                "body": content,
                "topics": topics.get(sorted_by_prob[0][0]),
                "Sentiment_vader": sentiment_vader,
                "Sentiment_score_vader": score[0],
                "Sentiment_textblob": pol,
                "headline": headline
            }
        })