def _gen_item(self, element):
    """Build an ``Article`` populated from one parsed page element.

    Args:
        element: Parsed node exposing ``find(selector)``; the nodes it
            returns are expected to provide ``text()`` and ``outerHtml()``
            (presumably a pyquery-style wrapper -- confirm against caller).

    Returns:
        A new ``Article`` with url, title, author, category, tags, summary
        and crawl_time filled in.
    """
    article = Article()
    article.url = self.url

    heading = element.find('h4')
    article.title = heading.text()

    # Author, category and tags are fixed values; nothing is scraped for them.
    article.author = u'廖雪峰'
    article.category = 'Dev'
    article.tags = 'Python'

    # The summary is the outer HTML of the content container, encoded to
    # UTF-8 and then passed through the project's Tool.replace2 helper.
    content_node = element.find('.x-wiki-content')
    encoded_html = content_node.outerHtml().encode('utf-8')
    cleaner = tool.Tool()
    article.summary = cleaner.replace2(encoded_html)

    article.crawl_time = datetime.datetime.now()
    return article
# Fetch the listing page; the value returned by fetch() is used below as the
# file name of the cached copy under cache/.
cachedfile = fetch(url)

# connect to database
# NOTE(review): the wildcard import is kept because code outside this section
# may rely on names it exports; switch to explicit imports once confirmed.
from sqlalchemy import *
from model import Article
from sqlalchemy.orm import sessionmaker

db = create_engine("sqlite:///nar.db", echo=True)
Session = sessionmaker(bind=db)
session = Session()

# parse and extract abstract URLs
# Read the cached page inside a context manager so the file handle is closed
# as soon as its contents are in memory.
with open("cache/%s" % cachedfile) as cached_page:
    html = cached_page.read()
soup = BeautifulSoup(html, "html5lib")
elements = soup.find_all("a", rel="abstract")

for element in elements:
    uri = element.get("href")
    if not uri:
        # urljoin() returns the base URL unchanged for an empty reference,
        # which would re-fetch the listing page and store it as an article.
        continue
    abstract_url = urljoin(url, uri)
    print(abstract_url)
    fetch(abstract_url)  # return value unused here
    article = Article()
    article.url = abstract_url
    session.add(article)

# Persist every queued article in a single transaction.
session.commit()