def __init__(self):
    self.urls = url_manager.Url_Manager()
    self.downloader = html_downLoader.HtmlDownLoader()
    self.parser = html_parser.HtmlParser()
    self.outper = html_outputer.OutPuter()
    # self.cache = disk_cache.DiskCache()
    self.cache = mongo_cache.MongoCache()  # cache fetched pages in MongoDB
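Every snippet on this page leans on mongo_cache.MongoCache without showing it. For reference, here is a minimal sketch of the dict-like interface the snippets assume (URL in, cached page out); the database and collection names, and the pickle/TTL details, are assumptions rather than the original implementation:

import pickle
from datetime import datetime, timedelta

from pymongo import MongoClient


class MongoCache:
    """Dict-like page cache backed by MongoDB (illustrative sketch)."""

    def __init__(self, client=None, expires=timedelta(days=30)):
        self.client = client or MongoClient('localhost', 27017)
        self.db = self.client.cache  # assumed database name
        # Let MongoDB expire stale pages automatically via a TTL index.
        self.db.webpage.create_index(
            'timestamp', expireAfterSeconds=int(expires.total_seconds()))

    def __getitem__(self, url):
        record = self.db.webpage.find_one({'_id': url})
        if record is None:
            raise KeyError(url + ' does not exist')
        return pickle.loads(record['result'])

    def __setitem__(self, url, result):
        record = {'result': pickle.dumps(result), 'timestamp': datetime.utcnow()}
        self.db.webpage.update_one({'_id': url}, {'$set': record}, upsert=True)

    def clear(self):
        self.db.webpage.drop()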
Example #2
def main():
    template_url = 'http://127.0.0.1:8000/places/ajax/search.json?page={}&page_size=10&search_term={}'
    countries = set()
    # download = downloader.Downloader(mongo_cache.MongoCache())
    cache = mongo_cache.MongoCache()
    cache.clear()
    download = downloader.Downloader(cache=cache)

    for letter in string.ascii_lowercase:
        page = 0
        while True:
            html = download(template_url.format(page, letter))
            try:
                ajax = json.loads(html)
            except ValueError as e:
                print(e)
                ajax = None
            else:
                for record in ajax['records']:
                    countries.add(record['country'])
            page += 1
            if ajax is None or page >= ajax['num_pages']:
                break

    with open('countries.txt', 'w') as f:
        f.write('\n'.join(sorted(countries)))
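Throughout these examples the downloader object is simply called with a URL. A minimal sketch of that callable contract, assuming a requests-based fetch and the dict-like cache sketched above (note that Examples #3 and #5 instead expect a (data, status_code) tuple, so treat this single-return variant as illustrative only):

import requests


class Downloader:
    """Callable that returns page HTML, consulting the cache first (sketch)."""

    def __init__(self, cache=None, num_retries=1, timeout=60):
        self.cache = cache
        self.num_retries = num_retries
        self.timeout = timeout

    def __call__(self, url):
        if self.cache is not None:
            try:
                return self.cache[url]  # cache hit: skip the network entirely
            except KeyError:
                pass  # cache miss: fall through to a real request
        html = None
        for _ in range(self.num_retries + 1):
            try:
                resp = requests.get(url, timeout=self.timeout)
            except requests.RequestException:
                continue  # retry transient connection errors
            if resp.status_code == 200:
                html = resp.text
                break
        if self.cache is not None and html is not None:
            self.cache[url] = html
        return html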
Example #3
def download_comments(self, url):

    # Comments are saved page by page here; they still need further
    # cleaning before being stored in the target format.
    downloader = Downloader(cache=mongo_cache.MongoCache())
    try:
        data, code = downloader(url)
        if not data:
            raise Exception('empty download result')

        # Strip the JSONP wrapper, then rewrite the JS literals so that
        # ast.literal_eval can parse the payload.
        dictStr = data[data.find('(') + 1:-2]
        astDictStr = dictStr.replace("true", "True").replace("false", "False").replace("null", "None")
        dictResult = ast.literal_eval(astDictStr)
        oriComments = dictResult["comments"]
        # Keep only the fields we need: content and score.
        comments = []
        for com in oriComments:
            comments.append({
                'content': com["content"].decode("GBK").replace(u'\n', u'。'),
                'score': com["score"],
            })
        return comments
    except Exception as exc:
        raise self.retry(exc=exc)  # follow the task's default retry policy
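The slice data[data.find('(') + 1:-2] strips a JSONP wrapper such as callback({...}); the true/false/null replacements then make the payload digestible by ast.literal_eval. Assuming the unwrapped payload is plain JSON, a regex plus json.loads does the same job without the fragile string substitutions:

import json
import re


def unwrap_jsonp(data):
    """Strip a JSONP wrapper like callback({...}); and parse the payload."""
    match = re.search(r'^[^(]*\((.*)\)\s*;?\s*$', data, re.DOTALL)
    if match is None:
        raise ValueError('not a JSONP response')
    # json.loads understands true/false/null natively.
    return json.loads(match.group(1))

# e.g. unwrap_jsonp('cb({"comments": []});') -> {'comments': []}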
Example #4
def main():
    downloader = Downloader(cache=mongo_cache.MongoCache())
    html = downloader(URL)
    ajax = json.loads(html)
    with open(CSV_FILE, 'w', newline='') as f:
        writer = csv.writer(f)
        for record in ajax['records']:
            writer.writerow([record['country']])
    print('Records written in {}'.format(CSV_FILE))
Example #5
def task(base_url, start, stop, db_helper, col):

    # Join all cookie pairs into a single Cookie header string.
    cookie = {'Cookie': '; '.join('%s=%s' % (k, v) for k, v in COOKIE.items())}
    downloader = Downloader(cache=mongo_cache.MongoCache(), cookie=cookie)
    try:
        for i in range(start, stop, 20):
            new_url = set_query_parameter(base_url, 'start', i)

            # ---------------- input: fetch the page ---------------------------
            html, code = downloader(new_url)
            if code != 200:
                print('request failed...')
                break
            # ---------------- parse into structured records -------------------
            comm_elements = lxml.html.fromstring(html).cssselect(
                'div#comments .comment')
            comments = []
            for e in comm_elements:
                # Build one MongoDB comment document per element.
                comment = {}
                comment['comment-vote'] = int(
                    e.cssselect('h3 span.comment-vote span.votes')
                    [0].text_content())
                if not e.cssselect('h3 span.comment-info span.rating'):
                    comment['comment-rating'] = 0
                else:
                    # The star count is encoded in a class such as 'allstar40';
                    # the second-to-last character is the rating digit.
                    comment['comment-rating'] = int(
                        e.cssselect('h3 span.comment-info span.rating')[0].get(
                            'class').split(' ')[0][-2])
                comment['comment-text'] = e.cssselect(
                    'p')[0].text_content().encode('utf-8')
                comments.append(comment)
            # ---------------- output: persist the batch -----------------------
            print(len(comments))
            db_helper.insert_documents(collection=col, documents=comments)
            print('finished download %d' % (i // 20))
    finally:
        # Close the connection once, after all pages have been processed.
        db_helper.close()
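set_query_parameter is not defined in any of these snippets; the name and call signature below come from the call site above, and the body is a standard-library sketch of what it must do:

from urllib.parse import parse_qs, urlencode, urlparse, urlunparse


def set_query_parameter(url, name, value):
    """Return url with the query parameter name set (or replaced) to value."""
    parts = urlparse(url)
    query = parse_qs(parts.query)
    query[name] = [str(value)]  # overwrite any existing value
    return urlunparse(parts._replace(query=urlencode(query, doseq=True)))

# e.g. set_query_parameter('http://example.com/c?start=0', 'start', 20)
#      -> 'http://example.com/c?start=20'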
Example #6
def main(letters):
    countries = set()
    downloader = Downloader(cache=mongo_cache.MongoCache())

    for letter in letters.lower():
        page = 0
        while True:
            html = downloader(TEMPLATE_URL.format(page, letter))
            try:
                ajax = json.loads(html)
            except ValueError as e:
                print(e)
                ajax = None
            else:
                for record in ajax['records']:
                    countries.add(record['country'])
            page += 1
            if ajax is None or page >= ajax['num_pages']:
                break

    with open(TXT_FILE, 'w') as f:
        f.write('\n'.join(sorted(countries)))
    print('Records written in {}'.format(TXT_FILE))
Example #7
def main():
    template_url = 'http://example.webscraping.com/ajax/search.json?page={}&page_size=10&search_term={}'
    countries = set()
    html_cache = mongo_cache.MongoCache()
    download = downloader.Downloader(delay=3, num_retries=1, timeout=60, cache=html_cache) 

    for search_term in string.ascii_lowercase:
        page = 0  # page numbers are zero-based, as in the other examples
        while True:
            html = download(template_url.format(page, search_term))
            try:
                ajax = json.loads(html)
            except ValueError as e:
                print(e)
                ajax = None
            else:
                for record in ajax['records']:
                    countries.add(record['country'])
            page += 1
            if ajax is None or page >= ajax['num_pages']:
                break
    
    with open('countries.txt', 'w') as f:
        f.write('\n'.join(sorted(countries)))
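The delay=3 argument here implies the downloader rate-limits requests per domain. This project's implementation is not shown; a common sketch is a small throttle keyed on the domain, consulted before every uncached request:

import time
from urllib.parse import urlparse


class Throttle:
    """Keep requests to the same domain at least `delay` seconds apart."""

    def __init__(self, delay):
        self.delay = delay
        self.last_accessed = {}  # domain -> time of the last request

    def wait(self, url):
        domain = urlparse(url).netloc
        last = self.last_accessed.get(domain)
        if self.delay > 0 and last is not None:
            sleep_secs = self.delay - (time.time() - last)
            if sleep_secs > 0:
                time.sleep(sleep_secs)  # too soon: back off
        self.last_accessed[domain] = time.time()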
Example #8
def main():
    template_url = 'http://example.webscraping.com/places/ajax/search.json?&search_term={}&page_size=10&page={}'
    countries = set()  # collect results in a set first, since a set stores no duplicates
    download = downloader.Downloader(mongo_cache.MongoCache())

    for letter in string.ascii_lowercase:  # search every letter from a to z
        page = 0
        while True:
            html = download(template_url.format(letter, page))
            try:
                ajax = json.loads(html)  # parse the JSON response into a dict
            except ValueError as e:
                print(e)
                ajax = None
            else:
                for record in ajax['records']:
                    # print(record['country'])
                    countries.add(record['country'])
            page += 1
            if ajax is None or page >= ajax['num_pages']:
                break

    with open(r'D:\countries.txt', 'w') as f:
        f.write('\n'.join(sorted(countries)))
Example #9
def process_crawler(args, **kwargs):
    num_cpus = multiprocessing.cpu_count()
    #pool = multiprocessing.Pool(processes=num_cpus)
    print('Starting {} processes'.format(num_cpus))
    html_cache1 = mongo_cache.MongoCache()
    html_cache1.clear()
    mongo_result1 = mongo_result.MongoResult()
    mongo_result1.clear()

    processes = []
    for i in range(num_cpus):
        # Build one crawler per process; each Process runs its bound run().
        crawler = threaded_crawler(
            seed_url=target_url,
            link_regex='/(places|view)',
            scrape_callback=mongo_result1,
            html_cache=html_cache1)
        p = multiprocessing.Process(target=crawler.run,
                                    args=[args],
                                    kwargs=kwargs)
        #parsed = pool.apply_async(threaded_link_crawler, args, kwargs)
        p.start()
        processes.append(p)
    # wait for processes to complete
    for p in processes:
        p.join()
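One caveat: the crawler objects above are constructed in the parent, so their state (including any MongoDB connections) must survive the fork or pickle into each child. If each process should create its own connections, the usual pattern (a sketch, assuming threaded_crawler can be constructed in the child) builds the crawler inside a worker function:

def worker(*args, **kwargs):
    # Construct the crawler in the child so each process owns its own
    # MongoDB connections and crawl state.
    crawler = threaded_crawler(
        seed_url=target_url,
        link_regex='/(places|view)',
        scrape_callback=mongo_result.MongoResult(),
        html_cache=mongo_cache.MongoCache())
    crawler.run(*args, **kwargs)

# then: p = multiprocessing.Process(target=worker, args=[args], kwargs=kwargs)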