Exemplo n.º 1
0
    def _gen_item(self, element):
        """Build and return an ``Article`` populated from ``element``.

        The title is the text of the ``h4`` node found in ``element`` and the
        summary is derived from the ``.x-wiki-content`` node. Author, category
        and tags are fixed values; the URL is taken from ``self.url``.
        """
        article = Article()
        article.url = self.url

        heading = element.find('h4')
        article.title = heading.text()

        # Author, category and tags are constants here rather than values
        # extracted from the page.
        article.author = u'廖雪峰'
        article.category = 'Dev'
        article.tags = 'Python'

        # Summary: outer HTML of the content node, UTF-8 encoded, then run
        # through the project's ``Tool.replace2`` helper.
        body_node = element.find('.x-wiki-content')
        cleaner = tool.Tool()
        raw_html = body_node.outerHtml().encode('utf-8')
        article.summary = cleaner.replace2(raw_html)

        article.crawl_time = datetime.datetime.now()
        return article
Exemplo n.º 2
0

# Fetch the listing page, collect every <a rel="abstract"> link on it, fetch
# each linked page and record its URL as an Article row.
# ``fetch`` and ``url`` are defined outside this section; the value returned
# by ``fetch`` is used below as a file name under ``cache/``.
cachedfile = fetch(url)

# connect to database
# NOTE(review): the wildcard import is kept because names it provides may be
# used by code outside this section; only ``create_engine`` is used here.
from sqlalchemy import *
from sqlalchemy.orm import sessionmaker

from model import Article

db = create_engine("sqlite:///nar.db", echo=True)

Session = sessionmaker(bind=db)
session = Session()

# parse and extract abstract URLs
# NOTE(review): rebinding ``db`` does not affect ``session``, which stays
# bound to the nar.db engine created above -- confirm whether this second
# engine is still needed.
db = create_engine("sqlite:///tutorial.db")

# Read the cached page inside a context manager so the file handle is closed
# deterministically instead of being left open until garbage collection.
with open("cache/%s" % cachedfile) as cached_page:
    html = cached_page.read()

soup = BeautifulSoup(html, "html5lib")
elements = soup.find_all("a", rel="abstract")
for element in elements:
    uri = element.get("href")
    if not uri:
        # An anchor without an href would make urljoin() return the listing
        # URL itself, which would re-fetch the index page and store a bogus
        # Article for it.
        continue
    abstract_url = urljoin(url, uri)
    print(abstract_url)
    fetch(abstract_url)

    article = Article()
    article.url = abstract_url
    session.add(article)

# Single commit after the loop: all collected articles are written together.
session.commit()