Example #1
        self.thread_num = thread_num

    def new_worker(self):
        def start_thread_manager():
            manager = MultiThreadWorkerManager(self.exit_flag, self.job_queue,
                                               self.done_queue,
                                               self.thread_num, self.worker_constructor,
                                               self.constructor_args, self.constructor_kwargs)
            manager.monitor_worker()
        return multiprocessing.Process(target=start_thread_manager)

if __name__ == '__main__':
    begin_time = time.clock()
    name_red = 'testA'
    name_eu = 'testB'
    redis_que = RedisQueues(name_red)
    eu_que = EuQueue(name_eu, redis_que)
    # eu_que = EuQueue()

    hosts = [
        "http://www.yahoo.com", "http://google.com", "http://www.amazon.com",
        "http://www.taobao.com/", "http://www.apple.com",
        "http://www.baidu.com"
    ]
    # Extract the URLs from the already-downloaded pages and save them to the queue
    for item in hosts:
        print "item1:", item
        redis_que.put(item)

    exit_flag = multiprocessing.Value('i', 0, lock=True)
    print exit_flag
    worker_num = 1
    max_job_num = 1
    # crawler_mas = MultiThreadWorkerManager(exit_flag, redis_que, eu_que, worker_num, CuWorker, max_job_num)
    crawler_mas = MultiProcessMultiThreadWorkerManager(exit_flag, redis_que,
                                                        eu_que, worker_num,
                                                        CuWorker, max_job_num)
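
The manager shown here wraps a thread-based worker pool inside a separate process: new_worker() returns a multiprocessing.Process whose target builds a MultiThreadWorkerManager and lets it monitor its threads. Below is a minimal, self-contained sketch of that same pattern using only the standard library; all names, the thread count, and the placeholder "work" are assumptions, not the project's actual MultiThreadWorkerManager API.

import multiprocessing
import threading
import time
import Queue  # Python 2 standard-library queue module (exceptions only)

NUM_THREADS = 4

def worker_loop(exit_flag, job_queue, done_queue):
    # Each thread keeps pulling jobs until the shared exit flag is set.
    while not exit_flag.value:
        try:
            job = job_queue.get(timeout=1)
        except Queue.Empty:
            continue
        done_queue.put(job.upper())  # placeholder "work" on the job

def thread_manager(exit_flag, job_queue, done_queue):
    # Runs inside the child process: start the worker threads and wait for them.
    threads = [threading.Thread(target=worker_loop,
                                args=(exit_flag, job_queue, done_queue))
               for _ in range(NUM_THREADS)]
    for t in threads:
        t.start()
    for t in threads:
        t.join()

if __name__ == '__main__':
    exit_flag = multiprocessing.Value('i', 0, lock=True)
    job_queue = multiprocessing.Queue()
    done_queue = multiprocessing.Queue()
    for url in ["http://www.yahoo.com", "http://google.com"]:
        job_queue.put(url)

    proc = multiprocessing.Process(target=thread_manager,
                                   args=(exit_flag, job_queue, done_queue))
    proc.start()
    time.sleep(3)        # give the workers time to drain the queue
    exit_flag.value = 1  # signal every thread to stop
    proc.join()

    results = []
    try:
        while True:
            results.append(done_queue.get_nowait())
    except Queue.Empty:
        pass
    print results
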
Example #2
                pagetoken = '&pagetoken=%s' % josn_response['next_page_token']
                url = re.sub(r'&pagetoken=.*', pagetoken, url)
                self.save_url_ToQueue(url)
                # self.parse_html(url)
            else:
                pass
        elif status == 'OVER_QUERY_LIMIT':
            self.key = self.app_keys_pop()
            url = re.sub(r'&key=.*&pagetoken', '&key=%s&pagetoken' % self.key,
                         url)
            self.save_url_ToQueue(url)
            # self.parse_html(url)
        else:
            return


# from datetime import datetime

if __name__ == "__main__":

    # border_location_right = {
    #     "lat": 24.970,
    #     "lng": 121.666
    # }
    # print gpp.get_url(app_keys[0], border_location_right, 'food')
    name_red = 'testA'
    redis_que = RedisQueues(name_red)
    print redis_que.length()
    gpp = GooglePlacesParser(redis_que)
    gpp.save_url_ToQueue()
    print redis_que.length()
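
The parse_html logic above performs two URL rewrites with re.sub: it splices the response's next_page_token into the request URL to fetch the following results page, and when the API answers OVER_QUERY_LIMIT it swaps a fresh key into the URL before re-queuing it. A small self-contained sketch of just those two rewrites follows; the key pool, the token value, and the example request URL are made up for illustration.

import re

app_keys = ['KEY_A', 'KEY_B', 'KEY_C']  # hypothetical key pool

def next_page_url(url, next_page_token):
    # Replace (or append) the pagetoken query parameter.
    pagetoken = '&pagetoken=%s' % next_page_token
    if '&pagetoken=' in url:
        return re.sub(r'&pagetoken=.*', pagetoken, url)
    return url + pagetoken

def rotate_key_url(url, keys):
    # Drop the exhausted key and splice the next one into the URL.
    keys.pop(0)
    if not keys:
        raise RuntimeError('all app keys have been used')
    return re.sub(r'&key=.*?&pagetoken', '&key=%s&pagetoken' % keys[0], url)

url = ('https://maps.googleapis.com/maps/api/place/nearbysearch/json'
       '?location=24.97,121.66&radius=500&type=food'
       '&key=%s&pagetoken=' % app_keys[0])
url = next_page_url(url, 'TOKEN123')
print url
print rotate_key_url(url, app_keys)
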
Example #3
            manager = MultiThreadWorkerManager(self.exit_flag, self.job_queue,
                                               self.done_queue,
                                               self.thread_num,
                                               self.worker_constructor,
                                               self.constructor_args,
                                               self.constructor_kwargs)
            manager.monitor_worker()

        return multiprocessing.Process(target=start_thread_manager)


if __name__ == '__main__':
    begin_time = time.clock()
    name_red = 'testA'
    name_eu = 'testB'
    redis_que = RedisQueues(name_red)
    eu_que = EuQueue(name_eu, redis_que)
    # eu_que = EuQueue()

    hosts = [
        "http://www.yahoo.com", "http://google.com", "http://www.amazon.com",
        "http://www.taobao.com/", "http://www.apple.com",
        "http://www.baidu.com"
    ]
    # Extract the URLs from the already-downloaded pages and save them to the queue
    for item in hosts:
        print "item1:", item
        redis_que.put(item)

    exit_flag = multiprocessing.Value('i', 0, lock=True)
    print exit_flag
Example #4
        # time.sleep(60)

    print 'the information about all people has been saved!'
    print 'done, exiting'


# url = 'https://s3.amazonaws.com/codecave/tut1.html'
# url= 'http://www.douban.com/group/search?cat=1019&q=%E6%9C%88%E5%AB%82'
# data_words = ['妈妈','宝宝','怀孕','待产包','月子餐','月嫂']
if __name__ == '__main__':
    # pool = multiprocessing.Pool(processes=4)
    start = time.clock()
    dbs = MongoCRUD()
    data_words = ['妈妈', '宝宝', '怀孕', '待产包', '月子餐', '月嫂', '妈咪']
    data_words = ['妈妈']  # overrides the full keyword list with a single keyword
    group_queue = RedisQueues('GROUP_QUEUE1')
    group_queue.delete()
    # all = get_url_all(data_words,group_queue)
    # get_url_all(data_words,group_queue,dbs)
    # print len(all)
    # print 'Douban user UIDs saved to the database'
    # dbs.people_url_insert(all)
    save_data()
    elapsed = time.clock() - start
    print "Time used:", elapsed
    '''
    people_queue = RedisQueues('PEOPLE_QUEUE1')

    dbs = MongoCRUD()
    # get_data(data_words,group_queue,people_queue)
    people_all =get_data(data_words,group_queue,people_queue)
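
Every one of these snippets relies on a RedisQueues class whose definition is not shown. Below is a minimal sketch of a Redis-backed FIFO queue exposing the put/length/delete calls used above, written against redis-py; everything beyond those three method names (constructor arguments, the get helper, the key layout) is an assumption, not the project's real implementation.

import redis

class RedisQueues(object):
    def __init__(self, name, host='localhost', port=6379, db=0):
        self.name = name
        self.conn = redis.StrictRedis(host=host, port=port, db=db)

    def put(self, item):
        # Push onto the left end of the Redis list.
        self.conn.lpush(self.name, item)

    def get(self, timeout=0):
        # Blocking pop from the right end (FIFO order); returns None on timeout.
        result = self.conn.brpop(self.name, timeout=timeout)
        return result[1] if result else None

    def length(self):
        return self.conn.llen(self.name)

    def delete(self):
        self.conn.delete(self.name)

if __name__ == '__main__':
    q = RedisQueues('GROUP_QUEUE1')
    q.put('https://www.douban.com/group/search?cat=1019&q=test')
    print q.length()
    q.delete()
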
Example #5
            self.crud.save_map_data_insert(results)
            if 'next_page_token' in josn_response:
                pagetoken = '&pagetoken=%s' % josn_response['next_page_token']
                url = re.sub(r'&pagetoken=.*', pagetoken, url)
                self.save_url_ToQueue(url)
                # self.parse_html(url)
            else:
                pass
        elif status == 'OVER_QUERY_LIMIT':
            self.key = self.app_keys_pop()
            url = re.sub(r'&key=.*&pagetoken', '&key=%s&pagetoken' % self.key, url)
            self.save_url_ToQueue(url)
            # self.parse_html(url)
        else:
            return

# from datetime import datetime

if __name__ == "__main__":

    # border_location_right = {
    #     "lat": 24.970,
    #     "lng": 121.666
    # }
    # print gpp.get_url(app_keys[0], border_location_right, 'food')
    name_red = 'testA'
    redis_que = RedisQueues(name_red)
    print redis_que.length()
    gpp = GooglePlacesParser(redis_que)
    gpp.save_url_ToQueue()
    print redis_que.length()
Example #6
            else:
                pass
        elif status == 'OVER_QUERY_LIMIT':
            self.key = self.app_keys_pop()
            url = re.sub(r'&key=.*&pagetoken', '&key=%s&pagetoken' % self.key,
                         url)
            self.parse_html(url)
        else:
            return
        '''
    def run(self):
        all_locations = self.crud.read_all_locations()
        if len(self.keys) > 0:
            for location in all_locations:
                for type in types:
                    url = self.get_url(location, type)
                    print url
                    self.parse_html(url)
                self.crud.update_location_status(location['_id'])
        else:
            print "*------*-*all app keys have been used*-*-----*"
            sys.exit()

      '''


if __name__ == '__main__':
    name_red = 'testA'
    redis_que = RedisQueues(name_red)
    # eu_que = EuQueue(redis_que)
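
The run() method quoted in Example #6 is the crawl driver: it reads every stored location, builds one request URL per place type, hands each URL to parse_html, marks the location as processed, and bails out when no API keys remain. A rough sketch of that loop with stand-in storage and URL helpers follows; none of these names are the project's real MongoCRUD or GooglePlacesParser API.

import sys

types = ['food', 'cafe', 'lodging']  # hypothetical place types

def read_all_locations():
    # Stand-in for something like crud.read_all_locations(): lat/lng documents.
    return [{'_id': 1, 'lat': 24.970, 'lng': 121.666},
            {'_id': 2, 'lat': 25.033, 'lng': 121.565}]

def get_url(key, location, place_type):
    return ('https://maps.googleapis.com/maps/api/place/nearbysearch/json'
            '?location=%s,%s&radius=500&type=%s&key=%s&pagetoken='
            % (location['lat'], location['lng'], place_type, key))

def run(keys):
    # Stop when there are no usable API keys left, as the original does.
    if not keys:
        print "*------*-*all app keys have been used*-*-----*"
        sys.exit()
    for location in read_all_locations():
        for place_type in types:
            url = get_url(keys[0], location, place_type)
            print url  # the real code passes this to self.parse_html(url)
        # the real code then calls crud.update_location_status(location['_id'])
        print 'finished location', location['_id']

if __name__ == '__main__':
    run(['KEY_A'])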