Example #1
def parse(self, response):
    url = response.url
    print "parse article: [%s]" % url
    title = response.xpath('//title/text()').extract()[0]
    print title
    if title != " 404 NOT FOUND ":
        # parse the article body
        # self.write2file("articles", context[0].decode('utf-8'))
        item = self.parse_page_contents(response)
        self.used_urls.append(url)
        yield self.getUrls(item, response)
    else:
        self.log("article: [%s] does not exist!" % response.url)
    # in either case, schedule the next URL that has not been visited
    # in this run and is not recorded as crawled in MongoDB
    while self.url_pools.qsize() > 0:
        url = self.url_pools.get()[1]
        uri = url.split('/')[-1]
        if url not in self.used_urls and not MongoDBUtil.isUrlScrawled(uri):
            print "crawl next url: " + url
            yield scrapy.Request(url,
                                 cookies=self.cookies,
                                 headers=self.headers)
            break
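
The dequeue-and-request block at the end of `parse` reappears verbatim in other callbacks (see Example #5). A minimal helper sketch that could factor it out, assuming the same `url_pools`, `used_urls`, `cookies`, and `headers` attributes on the spider (the name `next_request` is hypothetical):

def next_request(self):
    # pull URLs off the priority pool until one is neither visited
    # in this run nor already recorded as crawled in MongoDB
    while self.url_pools.qsize() > 0:
        url = self.url_pools.get()[1]
        uri = url.split('/')[-1]
        if url not in self.used_urls and not MongoDBUtil.isUrlScrawled(uri):
            return scrapy.Request(url,
                                  cookies=self.cookies,
                                  headers=self.headers)
    return None

Each callback would then yield the returned request whenever it is not None.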
Example #2
def __init__(self):
    self.source = "www.atatech.org"
    self.user_names = []
    self.used_urls = []  # URLs already handled in this run (needed by parse)
    self.headers = HEADER
    self.cookies = COOKIES
    # priority pool of (priority, url) tuples consumed by the callbacks
    self.url_pools = Queue.PriorityQueue()
    self.url_pools.put((1, "https://www.atatech.org/articles/72373"))
    MongoDBUtil.__init__()
    startUrl = MongoDBUtil.get_start_article_url()
    if startUrl is not None:
        # replace the default seed with the start URL recorded in MongoDB
        startUrl = "https://www.atatech.org/articles/" + startUrl
        self.url_pools.get()
        self.url_pools.put((1, startUrl))
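
The `(priority, url)` tuples stored in `url_pools` imply a priority queue: seed URLs go in with priority 1 and similar-article links with priority 3 (see Example #5), so lower numbers are fetched first. A standalone sketch of that ordering, with hypothetical article URLs:

import Queue

pool = Queue.PriorityQueue()
pool.put((3, "https://www.atatech.org/articles/b"))  # similar-article link
pool.put((1, "https://www.atatech.org/articles/a"))  # seed URL
print pool.get()[1]  # prints .../articles/a: lowest priority number wins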
Example #3
def save2():
    # persist a dummy author/blog pair through the MongoDBUtil helper
    authorItem = AuthorItem()
    authorItem['authorName'] = "thomugo"
    authorItem['authorHomePage'] = "www.baidu.com"
    blogItem = AtablogItem()
    blogItem['id'] = "123"
    blogItem['blog'] = "test"
    blogItem['dir'] = './test'
    blogItem['title'] = "title"
    blogItem['tags'] = ['a', 'b']
    blogItem['mark'] = 2
    blogItem['vote'] = 3
    MongoDBUtil.save_blog(authorItem, blogItem)
Example #4
def persistenceBlog(self, authorItem, blogItem):
    uri = blogItem['id']
    if not os.path.exists(blogItem['dir']):
        os.makedirs(blogItem['dir'])
    fileName = os.path.join(blogItem['dir'], blogItem['title'] + ".md")
    alreadyWritten = os.path.exists(fileName)
    alreadyScrawled = MongoDBUtil.isUrlScrawled(uri)
    if not alreadyWritten:
        # save the crawled article to the local filesystem
        with codecs.open(fileName, 'wb', encoding="utf-8") as md:
            md.write(blogItem['blog'])
    if not alreadyScrawled:
        # save the parsed article to the database
        MongoDBUtil.save_blog(authorItem, blogItem)
Example #5
def parse_similar_articles(self, response):
    print "in parse_similar_articles"
    # extracted table markup (currently unused)
    context = response.xpath('//table').extract()[0].decode('utf-8')
    urls = response.xpath(
        '//a[contains(@href, "/articles")]/@href').extract()
    for url in urls:
        self.log("similar url: " + response.urljoin(url))
        self.url_pools.put((3, response.urljoin(url)))
    # schedule the next URL that has not been visited in this run
    # and is not recorded as crawled in MongoDB
    while self.url_pools.qsize() > 0:
        url = self.url_pools.get()[1]
        uri = url.split('/')[-1]
        if url not in self.used_urls and not MongoDBUtil.isUrlScrawled(uri):
            print "crawl next url: " + url
            yield scrapy.Request(url,
                                 cookies=self.cookies,
                                 headers=self.headers)
            break
    yield response.meta['item']
Example #6
def testGetBlog():
    # fetch a stored blog by its title and write it back out as a markdown file
    blog = MongoDBUtil.get_blog("0、Python与设计模式--前言")
    with codecs.open(u"0、Python与设计模式--前言.md", 'wb', encoding="utf-8") as md:
        md.write(blog['blog'])
Example #7
def getStartUrl():
    # print the last article URL recorded in MongoDB
    print MongoDBUtil.get_last_article_url()
Example #8
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import Queue
import codecs

from AtaBlog.MongoDBUtil import MongoDBUtil
from AtaBlog.items import *

# initialise the MongoDB connection once for the whole script
MongoDBUtil.__init__()

def save1():
    # authorItem = AuthorItem()
    # authorItem['authorName'] = "thomugo"
    # authorItem['authorHomePage'] = "www.baidu.com"
    # author = dict(authorItem)
    # MongoDBUtil.save_author_item(authorItem)
    # MongoDBUtil.authorItemCollection.insert(author)
    # look up an existing author document and attach it to a new blog item
    authorItem = MongoDBUtil.authorItemCollection.find_one({'authorName': "thomugo"})
    blogItem = AtablogItem()
    blogItem['id'] = "123"
    blogItem['blog'] = "test"
    blogItem['dir'] = './test'
    blogItem['title'] = "title"
    blogItem['tags'] = ['a', 'b']
    blogItem['mark'] = 2
    blogItem['vote'] = 3
    blogItem['author'] = authorItem
    # scrapy items behave like dicts, so insert the plain dict into Mongo
    blog = dict(blogItem)
    MongoDBUtil.blogItemCollection.insert(blog)
    # MongoDBUtil.save_blog_item(blogItem)
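
Since `save1` writes through the raw pymongo collection, a read-back check can use the same handle; a minimal sketch assuming the collection from above (`check_save1` is a hypothetical helper):

def check_save1():
    # find_one returns the stored document, or None if nothing matched
    blog = MongoDBUtil.blogItemCollection.find_one({'id': "123"})
    if blog is not None:
        print blog['title']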