def qyer_country_spider(self, country_id, country_link, debug=False, **kwargs):
    """
    抓取穷游上的城市数据
    country_id:
        int, index country info
    country_en:
        str. country_en
    country_link:
        str.
    """
    http_tools = init_qyer_session(debug=debug)
    x = time.time()
    spider_proxy = "socks5://" + get_proxy(source="Platform")
    qyer_db = QyerModel(**save_db_config)

    try:
        spider_ret = http_tools(country_link, proxy=spider_proxy)
        status_code = spider_ret[1]
        if status_code not in (200, 404):
            raise Exception('unexpected status code: {0}'.format(status_code))

        page_html = etree.HTML(spider_ret[0])
        country_max_page = find_max_page(page_html)
        save_data = [country_max_page, country_id]
        qyer_db.update_country_page(save_data)
        update_task(kwargs['task_id'])
    except Exception as exc:
        update_proxy('Platform', spider_proxy, x, '23')
        self.retry(exc=exc)
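
# Editor's sketch (assumption, not in the original source): every task in this
# listing takes `self` and calls `self.retry`, which implies each one is
# registered as a bound Celery task on the `app` seen in `app.send_task(...)`
# further below.  A minimal registration sketch with hypothetical retry settings:
from celery import Celery

app = Celery('proj')  # 'proj' mirrors the 'proj.*' task paths used below

@app.task(bind=True, max_retries=3, default_retry_delay=60)  # hypothetical values
def qyer_country_spider_sketch(self, country_id, country_link, debug=False,
                               **kwargs):
    # stand-in body; the real implementation is the example directly above
    pass
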
def daodao_img_rename_task(self, file_name, src_path, dst_path, bucket_name,
                           img_url, mid, table_name, **kwargs):
    self.task_source = 'TripAdvisor'
    self.task_type = 'ImgRename'

    try:
        src_file = os.path.join(src_path, file_name)
        flag, h, w = is_complete_scale_ok(src_file)
        f_md5 = file_md5(src_file)
        size = unicode((h, w))
        if flag == 0 or flag == 4:
            __used = u'1' if flag == 0 else u'0'
            data = (file_name, unicode(mid), unicode(img_url),
                    unicode(bucket_name),
                    size, unicode(file_name).replace(u'.jpg', u''),
                    unicode(f_md5), u'machine', __used, u'online')
            # The transactional relationship between these three functions has
            # not been worked out yet, so the least important one is executed
            # first.
            shutil.copy(src_file, os.path.join(dst_path, file_name))
            print insert_db(data, table_name)
            update_task(kwargs['task_id'])
        else:
            raise Exception('Error Flag')
    except Exception as exc:
        self.retry(exc=exc)
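
# Editor's sketch (assumption): `file_md5` is not shown in this listing.  A
# plausible implementation that hashes the file contents chunk by chunk:
import hashlib

def file_md5_sketch(path, chunk_size=1024 * 1024):
    """Return the hex MD5 digest of the file at `path` (hypothetical helper)."""
    md5 = hashlib.md5()
    with open(path, 'rb') as f:
        for chunk in iter(lambda: f.read(chunk_size), b''):
            md5.update(chunk)
    return md5.hexdigest()
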
def tripadvisor_city_query_task(self, city_name, **kwargs):
    x = time.time()
    PROXY = get_proxy(source="Platform")
    proxies = {'http': 'socks5://' + PROXY, 'https': 'socks5://' + PROXY}
    headers = {'User-agent': GetUserAgent()}

    try:
        conn = pymysql.connect(host='10.10.180.145',
                               user='******',
                               passwd='hourong',
                               db='SuggestName',
                               charset="utf8")
        # NOTE: relies on the legacy pymysql behaviour where `with conn`
        # yields a cursor and commits on success (rolls back on error)
        with conn as cursor:
            print(city_name)
            quote_string = quote(city_name.encode('utf8'))
            page = requests.get(
                'http://www.tripadvisor.cn/TypeAheadJson?interleaved=true&types=geo%2Ctheme_park%2Cair&neighborhood_geos=true&link_type=geo&details=true&max=6&hglt=true&query={0}&action=API&uiOrigin=GEOSCOPE&source=GEOSCOPE'
                .format(quote_string),
                proxies=proxies,
                headers=headers)
            page.encoding = 'utf8'
            content = page.text.replace('while(1);', '')
            for line in get_query_data(content=content,
                                       query_string=city_name):
                cursor.execute(
                    'insert into TripAdvisorSuggestCity (`QueryName`,`Name`,`coords`,`Url`) VALUES (%s,%s,%s,%s)',
                    line)
        conn.close()
        update_task(kwargs['task_id'])
        print "Success with " + PROXY + ' CODE 0'
        update_proxy('Platform', PROXY, x, '0')
    except Exception as exc:
        update_proxy('Platform', PROXY, x, '23')
        self.retry(exc=exc)
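
# Editor's note (inferred from the examples, not stated in the source): the
# tasks time each request with `x = time.time()` and then report proxy health
# via update_proxy(source, proxy, start_time, code), using code '0' on success
# and '23' on failure.  A context-manager sketch of that convention
# (hypothetical helper; get_proxy/update_proxy are the names used above):
import contextlib
import time

@contextlib.contextmanager
def proxy_feedback_sketch(source="Platform"):
    proxy = get_proxy(source=source)
    start = time.time()
    try:
        yield proxy
        update_proxy(source, proxy, start, '0')
    except Exception:
        update_proxy(source, proxy, start, '23')
        raise
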
def hotel_base_data(self, source, url, other_info, **kwargs):
    x = time.time()
    PROXY = get_proxy(source="Platform")
    proxies = {'http': 'socks5://' + PROXY, 'https': 'socks5://' + PROXY}
    headers = {'User-agent': GetUserAgent()}

    try:
        page = requests.get(url, proxies=proxies, headers=headers, timeout=240)
        page.encoding = 'utf8'
        content = page.text
        # agoda special case: also fetch the AboutHotel fragment (start)
        url_about = 'https://www.agoda.com/NewSite/zh-cn/Hotel/AboutHotel?hotelId={0}&languageId=8&hasBcomChildPolicy=False'.format(
            other_info['source_id'])
        page_about = requests.get(url=url_about, headers=headers)
        page_about.encoding = 'utf8'
        about_content = page_about.text
        other_info['about_content'] = about_content

        # agoda special case (end)
        result = parse_hotel(content=content,
                             url=url,
                             other_info=other_info,
                             source=source)
        if not result:
            update_proxy('Platform', PROXY, x, '23')
            self.retry()
        else:
            update_task(kwargs['task_id'])
            print "Success with " + PROXY + ' CODE 0'
            update_proxy('Platform', PROXY, x, '0')
        return result
    except Exception as exc:
        update_proxy('Platform', PROXY, x, '23')
        self.retry(exc=exc)
def hotel_tax_detail(self, task_content, city_id, **kwargs):
    try:
        task = Task()
        task.content = task_content
        result = hotel_tax(task, city_id)
        data = result.values()[-1][-1]
        data['task_content'] = task_content
        data['city_id'] = city_id
        table.insert(data)
        if kwargs.get('task_id'):
            update_task(kwargs['task_id'])
    except Exception as exc:
        self.retry(exc=exc)
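
# Editor's note (assumption): `result.values()[-1][-1]` above relies on
# Python 2, where dict.values() returns a list; under Python 3 the equivalent
# would be `list(result.values())[-1][-1]`.
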
def list_page_task(self, ctx, city_id, **kwargs):
    self.task_source = 'TripAdvisor'
    self.task_type = 'HotelList'
    with MySession() as session:
        try:
            session.headers.update(ctx['headers'])
            resp = session.get(ctx['url'])
            jq = PyQuery(resp.text)

            # collect the links to each hotel's detail page
            doc_a_href = jq(".property_title")

            for each in doc_a_href.items():
                # detail-page id
                detail_id = each.attr("id").split('_')[-1]
                # detail-page URL
                detail_url = urlparse.urljoin(resp.url, each.attr("href"))
                collections.save({
                    'city_id': city_id,
                    'source_id': detail_id,
                    'source_url': detail_url,
                    'task_id': kwargs['task_id'],
                    'page_index': ctx['page_index']
                })

            # fan out the remaining list pages; only done from the first page
            if ctx['page_index'] == 0:
                total_page = jq(".pageNum.last").attr("data-page-number")
                for i in range(1, int(total_page) + 1):
                    # build the crawl context from the source site's city_id
                    ctx = init_header(ctx['source_city_id'], i)

                    # dispatch the asynchronous crawl task
                    app.send_task('proj.tripadvisor_list_tasks.list_page_task',
                                  args=(
                                      ctx,
                                      city_id,
                                  ),
                                  kwargs=kwargs,
                                  queue='tripadvisor_list_tasks',
                                  routing_key='tripadvisor_list_tasks')

            update_task(kwargs['task_id'])
        except Exception as exc:
            session.update_proxy('23')
            self.retry(exc=exc)
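
# Editor's sketch (inferred from how `ctx` is used above; the real init_header
# is not shown): the crawl context appears to carry at least these keys.
def init_header_sketch(source_city_id, page_index):
    # hypothetical stand-in; the real helper presumably also builds the
    # page-specific list URL from source_city_id and page_index
    return {
        'url': '...',                               # list-page URL to fetch
        'headers': {'User-agent': GetUserAgent()},  # GetUserAgent as in the tasks above
        'page_index': page_index,                   # 0 for the first page
        'source_city_id': source_city_id,           # TripAdvisor's own city id
    }
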
def shutter_spider(self, vid, search_kw, debug=False, **kwargs):
    """
    Crawl Shutterstock image-search results for the given keyword.
    """
    if search_kw is None or search_kw == "null":
        # TODO: log the missing/null keyword case
        return None
    x = time.time()
    spider_proxy = 'socks5://' + get_proxy(source="Platform")
    try:
        spider = ShutterShockPicSpider(search_kw, spider_proxy, debug)
        pic_ret = spider.pic_search()
        pic_save_data = shutter_pic_data_assembly(vid, search_kw, pic_ret)
        spider_db = PicModel(**save_db_config)
        for _, save_data_map in pic_save_data.items():
            spider_db.insert_pic_many(save_data_map["table"],
                                      save_data_map["fields"],
                                      save_data_map["values"])
        update_task(kwargs['task_id'])
    except Exception as exc:
        update_proxy('Platform', spider_proxy, x, '23')
        self.retry(exc=exc)
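
# Editor's note (inferred from the loop above, not stated in the source):
# shutter_pic_data_assembly appears to return a dict keyed per target table,
# where each value provides 'table', 'fields' and 'values' so that
# insert_pic_many can perform one bulk insert per table.
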
def qyer_city_query_task(self, city_name, **kwargs):
    x = time.time()
    PROXY = get_proxy(source="Platform")
    proxies = {'http': 'socks5://' + PROXY, 'https': 'socks5://' + PROXY}
    headers = {
        'User-agent': GetUserAgent(),
        'Referer': "http://www.qyer.com/",
    }

    try:
        conn = pymysql.connect(host='10.10.180.145',
                               user='******',
                               passwd='hourong',
                               db='SuggestName',
                               charset="utf8")
        with conn as cursor:
            print(city_name)
            quote_string = quote(city_name.encode('utf8'))
            page = requests.get(
                'http://www.qyer.com/qcross/home/ajax?action=search&keyword={0}'
                .format(quote_string),
                proxies=proxies,
                headers=headers)
            page.encoding = 'utf8'
            content = page.text.replace('while(1);', '')
            for line in get_query_data(content=content,
                                       query_string=city_name):
                cursor.execute(
                    'insert into QyerSuggestCity (`QueryName`,`Name`,`BelongName`,`Url`) VALUES (%s,%s,%s,%s)',
                    line)
        conn.close()
        update_task(kwargs['task_id'])
        print "Success with " + PROXY + ' CODE 0'
        update_proxy('Platform', PROXY, x, '0')
    except Exception as exc:
        update_proxy('Platform', PROXY, x, '23')
        self.retry(exc=exc)
def hotel_tax_list_task(self, source, city_id, part, **kwargs):
    try:
        result = hotel_list_database(source=source, city_id=city_id)
        data = []
        part = part.replace('list', 'detail')
        hotel_count = 0
        for sid, hotel_url in result['hotel']:
            hotel_count += 1
            if hotel_count >= 20:
                # NOTE: this caps the batch at 19 hotels, because the counter
                # is checked immediately after being incremented
                break
            worker = u'hotel_tax_detail'
            task_content = hotel_url.split('?')[0] + "?&1&20171210"
            args = json.dumps({
                u'task_content': unicode(task_content),
                u'city_id': unicode(city_id)
            })

            task_id = get_task_id(worker, args)
            data.append((task_id, worker, args, unicode(part)))

        update_task(kwargs['task_id'])
        print insert_task(data=data)
    except Exception as exc:
        self.retry(exc=exc)
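
# Editor's note (inferred, not stated in the source): each generated row
# (task_id, worker, args, part) describes a follow-up `hotel_tax_detail` task;
# the JSON in `args` matches that task's (task_content, city_id) parameters.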