def parse(self, response):
    url = response.url
    print "parse article: [%s]" % url
    title = response.xpath('//title/text()').extract()[0]
    print title
    if title != " 404 NOT FOUND ":
        # parse the article body
        # self.write2file("articles", context[0].decode('utf-8'))
        item = self.parse_page_contents(response)
        self.used_urls.append(url)
        yield self.getUrls(item, response)
    else:
        self.log("article: [%s] not exist!!!" % response.url)
        # the article is gone, so pull the next uncrawled URL from the pool
        while self.url_pools.qsize() > 0:
            url = self.url_pools.get()[1]
            uri = url.split('/')[-1]
            if url not in self.used_urls:
                if not MongoDBUtil.isUrlScrawled(uri):
                    print "crawl next url: " + url
                    yield scrapy.Request(url, cookies=self.cookies, headers=self.headers)
                    break
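# Hedged sketch, not part of the original spider: the "pop the next uncrawled
# URL from the priority pool and yield a Request" loop is repeated in parse()
# and parse_similar_articles(); a helper like the hypothetical
# next_request_from_pool() below could factor it out. It assumes the same
# self.url_pools PriorityQueue, self.used_urls list, self.cookies/self.headers
# attributes and MongoDBUtil.isUrlScrawled() used elsewhere in this spider.
def next_request_from_pool(self):
    while self.url_pools.qsize() > 0:
        url = self.url_pools.get()[1]
        uri = url.split('/')[-1]
        if url in self.used_urls or MongoDBUtil.isUrlScrawled(uri):
            continue  # already fetched in this run or already persisted in MongoDB
        print "crawl next url: " + url
        return scrapy.Request(url, cookies=self.cookies, headers=self.headers)
    return None  # pool exhausted

# Possible call site in the 404 branch of parse():
#     request = self.next_request_from_pool()
#     if request is not None:
#         yield request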
def __init__(self):
    self.source = "www.atatech.org"
    self.user_names = []
    self.headers = HEADER
    self.cookies = COOKIES
    # assumed here (not shown in the excerpt): the url pool and dedup list used by parse()
    self.url_pools = Queue.PriorityQueue()
    self.used_urls = []
    self.url_pools.put((1, "https://www.atatech.org/articles/72373"))
    MongoDBUtil.__init__()
    startUrl = MongoDBUtil.get_start_article_url()
    if startUrl is not None:
        # resume from the article recorded in MongoDB instead of the default start URL
        startUrl = "https://www.atatech.org/articles/" + startUrl
        self.url_pools.get()
        self.url_pools.put((1, startUrl))
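# Minimal illustration (assumption, not taken from the project) of why the
# (priority, url) tuples work: Queue.PriorityQueue pops the smallest tuple
# first, so the priority-1 start article is always crawled before the
# priority-3 "similar article" links queued in parse_similar_articles().
import Queue

pool = Queue.PriorityQueue()
pool.put((3, "https://www.atatech.org/articles/some-similar-article"))
pool.put((1, "https://www.atatech.org/articles/72373"))
print pool.get()[1]  # prints the priority-1 URL: https://www.atatech.org/articles/72373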
def save2():
    authorItem = AuthorItem()
    authorItem['authorName'] = "thomugo"
    authorItem['authorHomePage'] = "www.baidu.com"
    blogItem = AtablogItem()
    blogItem['id'] = "123"
    blogItem['blog'] = "test"
    blogItem['dir'] = './test'
    blogItem['title'] = "title"
    blogItem['tags'] = ['a', 'b']
    blogItem['mark'] = 2
    blogItem['vote'] = 3
    MongoDBUtil.save_blog(authorItem, blogItem)
def persistenceBlog(self, authorItem, blogItem):
    uri = blogItem['id']
    if not os.path.exists(blogItem['dir']):
        os.makedirs(blogItem['dir'])
    fileName = blogItem['dir'] + blogItem['title'] + ".md"
    alreadyWritten = os.path.exists(fileName)
    alreadyScrawled = MongoDBUtil.isUrlScrawled(uri)
    if not alreadyWritten:
        # save the crawled article to the local file system
        with codecs.open(fileName, 'wb', encoding="utf-8") as md:
            md.write(blogItem['blog'])
    if not alreadyScrawled:
        # save the parsed article to the database
        MongoDBUtil.save_blog(authorItem, blogItem)
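# Hedged note, not in the original: the fileName concatenation above only
# yields "<dir>/<title>.md" if blogItem['dir'] already ends with a path
# separator. A helper such as this hypothetical blog_file_name() sidesteps
# that assumption by using os.path.join.
import os

def blog_file_name(blogItem):
    # './test' + 'title' -> './test/title.md'
    return os.path.join(blogItem['dir'], blogItem['title'] + ".md")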
def parse_similar_articles(self, response):
    print "in parse_similar_articles"
    context = response.xpath('//table').extract()[0].decode('utf-8')
    urls = response.xpath('//a[contains(@href, "/articles")]/@href').extract()
    for url in urls:
        self.log("similar url: " + response.urljoin(url))
        self.url_pools.put((3, response.urljoin(url)))
    while self.url_pools.qsize() > 0:
        url = self.url_pools.get()[1]
        uri = url.split('/')[-1]
        if url not in self.used_urls:
            if not MongoDBUtil.isUrlScrawled(uri):
                print "crawl next url: " + url
                yield scrapy.Request(url, cookies=self.cookies, headers=self.headers)
                break
    yield response.meta['item']
def testGetBlog():
    blog = MongoDBUtil.get_blog("0、Python与设计模式--前言")
    with codecs.open(u"0、Python与设计模式--前言.md", 'wb', encoding="utf-8") as md:
        md.write(blog['blog'])
def getStartUrl():
    print MongoDBUtil.get_last_article_url()
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import Queue
import codecs

from AtaBlog.MongoDBUtil import MongoDBUtil
from AtaBlog.items import *

MongoDBUtil.__init__()


def save1():
    # authorItem = AuthorItem()
    # authorItem['authorName'] = "thomugo"
    # authorItem['authorHomePage'] = "www.baidu.com"
    # author = dict(authorItem)
    # MongoDBUtil.save_author_item(authorItem)
    # MongoDBUtil.authorItemCollection.insert(author)
    authorItem = MongoDBUtil.authorItemCollection.find_one({'authorName': "thomugo"})
    blogItem = AtablogItem()
    blogItem['id'] = "123"
    blogItem['blog'] = "test"
    blogItem['dir'] = './test'
    blogItem['title'] = "title"
    blogItem['tags'] = ['a', 'b']
    blogItem['mark'] = 2
    blogItem['vote'] = 3
    blogItem['author'] = authorItem
    blog = dict(blogItem)
    MongoDBUtil.blogItemCollection.insert(blog)
    # MongoDBUtil.save_blog_item(blogItem)
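# Hedged usage sketch: assuming save1(), save2(), testGetBlog() and
# getStartUrl() live in this same smoke-test module and MongoDB is reachable,
# the helpers could be exercised manually like this.
if __name__ == '__main__':
    save1()
    save2()
    testGetBlog()
    getStartUrl()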