Пример #1
0
 def getArticle(self, url, rawHTML, language=None):
     """Extract an article from ``rawHTML`` using a freshly configured Goose.

     Image fetching is always switched off. When ``language`` is truthy it
     is stored as the configuration's target language and the
     ``useMetaLanguge`` flag is set to False.

     Returns whatever ``Goose.extractContent`` returns for the given
     ``url`` and ``rawHTML``.
     """
     settings = Configuration()
     if language:
         settings.targetLanguage = language
         # Attribute name kept exactly as written (including its spelling);
         # NOTE(review): presumably matches the library's own field name.
         settings.useMetaLanguge = False
     settings.enableImageFetching = False
     extractor = Goose(config=settings)
     return extractor.extractContent(url=url, rawHTML=rawHTML)
Пример #2
0
 def getArticle(self, url, rawHTML, language=None):
     """Run Goose content extraction on ``rawHTML`` for the given ``url``.

     A new ``Configuration`` is built on every call with image fetching
     disabled. A truthy ``language`` is assigned to ``targetLanguage`` and
     ``useMetaLanguge`` is turned off alongside it.

     The result of ``Goose.extractContent`` is returned unchanged.
     """
     goose_config = Configuration()
     if language:
         goose_config.targetLanguage = language
         # Spelling of the flag is reproduced exactly from the original;
         # NOTE(review): confirm it is the name the library expects.
         goose_config.useMetaLanguge = False
     goose_config.enableImageFetching = False
     return Goose(config=goose_config).extractContent(url=url, rawHTML=rawHTML)
Пример #3
0
 def getArticle(self, url, rawHTML):
     """Extract an article from ``rawHTML`` with image fetching disabled.

     Builds a new ``Configuration`` and ``Goose`` instance on each call and
     returns the result of ``Goose.extractContent`` for ``url``/``rawHTML``.
     """
     settings = Configuration()
     settings.enableImageFetching = False
     extractor = Goose(config=settings)
     return extractor.extractContent(url=url, rawHTML=rawHTML)
Пример #4
0
import re
import time
import pymongo
from goose.Goose import Goose
from bs4 import BeautifulSoup
from pymongo import MongoClient

# Crawl script: walks paginated section listings under url_base, one section
# ("tag") at a time, fetching each listing page over HTTP.
# Written for Python 2 (uses the `print` statement).

# MongoDB connection on localhost:27017; `collection` refers to
# `it_news` under `news`. Neither is used in the visible portion.
client = MongoClient('localhost',27017)
db = client.news
collection = db.it_news

# Section path segments appended to url_base, crawled in this order.
# NOTE(review): 'business-issues' is listed twice, so that section is
# crawled twice. 'cousumer-electronics' looks like a misspelling of
# 'consumer-electronics' (already present) -- confirm against the site.
crawl_list = ['business-issues','consumer-electronics','telecommunication','internet','networking','peripherals','security','services','software','storage','virtualization','hardware-systems-0','government','business-issues','cousumer-electronics','legal','it-management']
# Number of listing pages requested per section (pages 0 .. page_max - 1).
page_max = 400
# Not referenced in the visible portion of the script.
item_per_page = 10

# Goose instance; not referenced in the visible portion of the script.
go = Goose()
url_base = 'http://www.itnews.com/'
for tag1 in crawl_list:
    print "TAG:::::::" + tag1
    for page in range(0,page_max):
        # Reset for every page; not filled in the visible portion.
        new_articles = []

        # Page 0 is requested without a query string; later pages use
        # '?page=<n>'.
        if page == 0:
            url_page = ''
        else:
            url_page = '?page=' + str(page)

        url_full = url_base + tag1 + url_page
        print "URL::::::" + url_full
        # Pause one second before every request.
        time.sleep(1)
        # NOTE(review): `requests` is not among the imports visible above;
        # unless it is imported elsewhere this line raises NameError.
        # The response is not used in the visible portion of the script.
        res = requests.get(url_full)
Пример #5
0
def ext_goosepy(html):
    """Return the cleaned article text Goose extracts from ``html``.

    Image fetching is disabled on the extractor's config, and a fixed
    placeholder URL is passed together with the raw HTML.
    """
    extractor = Goose()
    extractor.config.enableImageFetching = False
    return extractor.extractContent(
        url="http://www.example.com/test.html",
        rawHTML=html,
    ).cleanedArticleText