Example #1
def GetData(tripType, orig, dest, deptDate, retDate):
    searchURL = "https://www.bookryanair.com/SkySales/Search.aspx"
    refererURL = "https://www.bookryanair.com/SkySales/booking.aspx?culture=en-gb&lc=en-gb&cmpid2=Google"

    data = {"fromaction": "Search.aspx", "SearchInput$TripType": tripType,
                "SearchInput$Orig": orig,
                "SearchInput$Dest": dest,
                "SearchInput$DeptDate": deptDate,
                "SearchInput$RetDate": retDate,
                "SearchInput$IsFlexible": "on",
                "SearchInput$PaxTypeADT": 1,
                "SearchInput$PaxTypeCHD": 0,
                "SearchInput$PaxTypeINFANT": 0,
                "SearchInput$AcceptTerms": "on",
                "__EVENTTARGET": "SearchInput$ButtonSubmit",
                }

    #p = get_proxy()
    p = get_proxy(type = 'f')
    resp = request_post_data(searchURL, data, referer = refererURL, proxy = p)
    if resp == None or len(resp) == 0:
        #invalid_proxy(p)
        pass
    else:
        return resp
    return resp
Example #2
def smartfares_task_parser(taskcontent):
    result = {}
    flights = {}
    tickets = []
    result['para'] = {'flight': flights, 'ticket': tickets}
    result['error'] = 0

    taskcontent = taskcontent.encode('utf-8')
    try:
        dept_id, dest_id, dept_day, dest_day = \
            taskcontent.strip().split('&')[:4]
    except:
        logger.error('smartfaresFlight::Wrong Content Format with %s' %
                     taskcontent)
        result['error'] = TASK_ERROR
        return result

    p = get_proxy(source='smartfaresFlight')
    #p= None
    if p == None:
        result['error'] = PROXY_NONE
        return result

    cj = cookielib.CookieJar()
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
    urllib2.install_opener(opener)

    try:
        search_url = get_search_url(dept_day, dest_day, dept_id, dest_id)
        content = crawl_single_page(search_url, proxy=p, referer=HOST)
        search_id = get_search_id(content)
        if search_id == '' or search_id == None:
            logger.error('smartfares::Parse search id failed')
            result['error'] = PROXY_INVALID
            return result
    except:
        logger.error('smartfares::Parse search id failed')
        result['error'] = PROXY_INVALID
        return result

    url_real = URL % search_id
    i = 0
    content_len = 0
    while i < 3 and content_len < CONTENT_LEN:
        content_real = crawl_single_page(url=url_real,
                                         proxy=p,
                                         referer=search_url)
        content_len = len(content_real)
        i += 1

    if len(content_real) > 100:
        parser_result = parsePage(content_real)
        tickets = parser_result['ticket']
        flights = parser_result['flight']
        result['para'] = {'flight': flights, 'ticket': tickets}
        return result
    else:
        result['error'] = DATA_NONE
        return result
Example #3
def yelp_price_level(self, target_url, mid):
    PROXY = get_proxy(source="Platform")
    x = time.time()
    proxies = {
        'http': 'socks5://' + PROXY,
        'https': 'socks5://' + PROXY
    }
    headers = {
        'User-agent': GetUserAgent()
    }

    try:
        page = requests.get(target_url, proxies=proxies, headers=headers, timeout=120)
        price_level = get_yelp_price_level(page)
        if not page.text:
            update_proxy('Platform', PROXY, x, '23')
            self.retry()
        else:
            print "Success with " + PROXY + ' CODE 0'
            update_proxy('Platform', PROXY, x, '0')
            print yelp_price_level_update_db((price_level, mid))
        return price_level
    except Exception as exc:
        update_proxy('Platform', PROXY, x, '23')
        self.retry(exc=traceback.format_exc(exc))
Example #4
def booking_list_crawl(task):
    # Split the task into the Chinese city name and the city id on this source
    # e.g. 黄石国家公园西门&6406&region , 大雾山国家公园&255516&landmark
    # e.g. 福森&-1773182
    # Task types: city, region, landmark
    city_name_zh, source_city_id, search_type = task.content.encode(
        'utf8').split('&')

    # URL-encode the Chinese city name
    city_name_zh = urllib.quote(city_name_zh)

    check_in_year = task.check_in[0:7]
    check_in_day = task.check_in[8:]
    check_out_year = task.check_out[0:7]
    check_out_day = task.check_out[8:]

    # Build the first-page url (see the sketch after this function)
    # url = get_search_url(check_in, check_out, source_city_id, city_name_zh, 1)
    # NOTE: most crawled dest_types are city; 黄石国家公园西门 is region,
    # 大雾山国家公园 and 大峡谷国家公园 are landmark

    Id = source_city_id
    dest_type = search_type
    destination = city_name_zh

    if is_alp(Id[0]):
        url = 'http://www.booking.com/searchresults.zh-cn.html?aid=397647;label=bai408jc-index-XX-XX-XX-unspec-cn-com-L%3Azh-O%3Aabn-B%3Achrome-N%3Ayes-S%3Abo-U%3Asalo;sid=4cb8e58619e9a15fe212e5b9fbec271b;dcid=12;checkin_monthday=' + check_in_day + ';checkin_year_month=' + check_in_year + ';checkout_monthday=' + check_out_day + ';checkout_year_month=' + check_out_year + ';class_interval=1;dest_id=' + Id + ';dest_type=' + dest_type + ';dtdisc=0;group_adults=2;group_children=0;hlrd=0;hyb_red=0;inac=0;label_click=undef;nha_red=0;no_rooms=1;offset=0;postcard=0;qrhpp=9f9582988e3752a8d34a7f85874afc39-city-0;redirected_from_city=0;redirected_from_landmark=0;redirected_from_region=0;review_score_group=empty;room1=A%2CA;sb_price_type=total;score_min=0;src=index;src_elem=sb;ss=' + destination + ';ss_all=0;ss_raw=' + destination + ';ssb=empty;sshis=0;origin=search;srpos=1&place_id=' + Id
    else:
        url = 'http://www.booking.com/searchresults.zh-cn.html?aid=397647;label=bai408jc-index-XX-XX-XX-unspec-cn-com-L%3Azh-O%3Aabn-B%3Achrome-N%3Ayes-S%3Abo-U%3Asalo;sid=4cb8e58619e9a15fe212e5b9fbec271b;dcid=12;checkin_monthday=' + check_in_day + ';checkin_year_month=' + check_in_year + ';checkout_monthday=' + check_out_day + ';checkout_year_month=' + check_out_year + ';class_interval=1;dest_id=' + Id + ';dest_type=' + dest_type + ';dtdisc=0;group_adults=2;group_children=0;hlrd=0;hyb_red=0;inac=0;label_click=undef;nha_red=0;no_rooms=1;offset=0;postcard=0;qrhpp=9f9582988e3752a8d34a7f85874afc39-city-0;redirected_from_city=0;redirected_from_landmark=0;redirected_from_region=0;review_score_group=empty;room1=A%2CA;sb_price_type=total;score_min=0;src=index;src_elem=sb;ss=' + destination + ';ss_all=0;ss_raw=' + destination + ';ssb=empty;sshis=0;origin=search;srpos=1'

    print url, '================='
    PROXY = get_proxy(source="Platform")
    headers = {'User-agent': GetUserAgent()}
    proxies = {'http': 'socks5://' + PROXY, 'https': 'socks5://' + PROXY}
    page = requests.get(url, proxies=proxies, headers=headers)
    page.encoding = 'utf8'
    content = page.text
    root = HTML.fromstring(content)
    hotel = root.xpath('//*[@class="sr_header "]/h1/text()')[0].encode(
        'utf-8').replace(',', '').strip()
    # print hotel
    # Get the hotel count: the number of hotels with vacancies for the current dates
    # When two numbers are present, take the latter
    temp_count = hotelcount_pat.findall(hotel)
    hotel_count = temp_count[-1]
    crawl_page = int(hotel_count) / 15 + 1
    # todo data crawl
    # Crawl data from the first page
    # parse_each_page(page, city_id, continent)

    result = list()
    result.append(url)
    # Start paginating
    for page_index in range(1, crawl_page):
        offset = 14 + (page_index - 1) * 15
        each_page_url = get_search_url(task.check_in, task.check_out,
                                       source_city_id, city_name_zh, offset,
                                       search_type)
        result.append(each_page_url)

    return result
Example #5
def tp_rest_detail_page_url(self, page_num_url, city_id, part):
    PROXY = get_proxy(source="Platform")
    x = time.time()
    proxies = {
        'http': 'socks5://' + PROXY,
        'https': 'socks5://' + PROXY
    }
    print "Now Proxy is " + PROXY
    headers = {
        'User-agent': GetUserAgent()
    }
    page = requests.get(page_num_url, proxies=proxies, headers=headers)
    page.encoding = 'utf8'
    if len(page.text) < 100:
        update_proxy('Platform', PROXY, x, '23')
        self.retry()
    doc = PyQuery(page.text)
    doc.make_links_absolute(page_num_url)

    data = []
    worker = u'daodao_poi_base_data'

    for item in doc('.property_title').items():
        href = item.attr.href
        if 'Restaurant_Review' in href:
            args = json.dumps(
                {u'target_url': unicode(href), u'city_id': unicode(city_id), u'type': u'rest'})
            task_id = get_task_id(worker, args=args)
            data.append((task_id, worker, args, unicode(part).replace(u'list', u'detail')))
    print insert_task(data=data)
Example #6
def tp_rest_list_page_num(self, index_url, city_id, part):
    PROXY = get_proxy(source="Platform")
    x = time.time()
    proxies = {
        'http': 'socks5://' + PROXY,
        'https': 'socks5://' + PROXY
    }
    print "Now Proxy is " + PROXY
    headers = {
        'User-agent': GetUserAgent()
    }
    page = requests.get(index_url, proxies=proxies, headers=headers)
    page.encoding = 'utf8'
    if len(page.text) < 100:
        update_proxy('Platform', PROXY, x, '23')
        self.retry()
    doc = PyQuery(page.text)
    doc.make_links_absolute(index_url)
    num_list = []
    for item in doc('.pageNumbers a').items():
        num = int(rest_oa_pattern.findall(item.attr.href)[0])
        num_list.append(num)

    tp_rest_detail_page_url.delay(index_url, city_id, part)
    try:
        for page_num in range(30, max(num_list) + 30, 30):
            g_num = rest_g_pattern.findall(index_url)[0]
            tp_rest_detail_page_url.delay(index_url.replace('-g' + g_num, '-g{0}-oa{1}'.format(g_num, page_num)),
                                          city_id, part)
    except:
        pass
Example #7
def qyer_country_spider(self, country_id, country_link, debug=False, **kwargs):
    """
    Crawl country/city data from Qyer (穷游).
    country_id:
        int, index country info
    country_link:
        str.
    """
    http_tools = init_qyer_session(debug=True)
    x = time.time()
    spider_proxy = "socks5://" + get_proxy(source="Platform")
    qyer_db = QyerModel(**save_db_config)

    try:
        spider_ret = http_tools(country_link, proxy=spider_proxy)
        status_code = spider_ret[1]
        if status_code != 200 and status_code != 404:
            raise Exception(str(status_code))

        page_html = etree.HTML(spider_ret[0])
        country_max_page = find_max_page(page_html)
        save_data = [country_max_page, country_id]
        qyer_db.update_country_page(save_data)
        update_task(kwargs['task_id'])
    except Exception as exc:
        update_proxy('Platform', spider_proxy, x, '23')
        self.retry(exc=traceback.format_exc(exc))
Example #8
def hotel_base_data(self, source, url, other_info, **kwargs):
    x = time.time()
    PROXY = get_proxy(source="Platform")
    proxies = {'http': 'socks5://' + PROXY, 'https': 'socks5://' + PROXY}
    headers = {'User-agent': GetUserAgent()}

    try:
        page = requests.get(url, proxies=proxies, headers=headers, timeout=240)
        page.encoding = 'utf8'
        content = page.text
        # agoda 特殊情况 start
        url_about = 'https://www.agoda.com/NewSite/zh-cn/Hotel/AboutHotel?hotelId={0}&languageId=8&hasBcomChildPolicy=False'.format(
            other_info['source_id'])
        page_about = requests.get(url=url_about, headers=headers)
        page_about.encoding = 'utf8'
        about_content = page_about.text
        other_info['about_content'] = about_content

        # agoda end
        result = parse_hotel(content=content,
                             url=url,
                             other_info=other_info,
                             source=source)
        if not result:
            update_proxy('Platform', PROXY, x, '23')
            self.retry()
        else:
            update_task(kwargs['task_id'])
            print "Success with " + PROXY + ' CODE 0'
            update_proxy('Platform', PROXY, x, '0')
        return result
    except Exception as exc:
        update_proxy('Platform', PROXY, x, '23')
        self.retry(exc=traceback.format_exc(exc))
Example #9
def GetData(tripType, orig, dest, deptDate, retDate):
    searchURL = "https://www.bookryanair.com/SkySales/Search.aspx"
    refererURL = "https://www.bookryanair.com/SkySales/booking.aspx?culture=en-gb&lc=en-gb&cmpid2=Google"

    data = {"fromaction": "Search.aspx", "SearchInput$TripType": tripType,
                "SearchInput$Orig": orig,
                "SearchInput$Dest": dest,
                "SearchInput$DeptDate": deptDate,
                "SearchInput$RetDate": retDate,
                "SearchInput$IsFlexible": "on",
                "SearchInput$PaxTypeADT": 1,
                "SearchInput$PaxTypeCHD": 0,
                "SearchInput$PaxTypeINFANT": 0,
                "SearchInput$AcceptTerms": "on",
                "__EVENTTARGET": "SearchInput$ButtonSubmit",
                }

    # If the fetch fails, switch to another proxy IP and retry
    for i in range(3):
        p = get_proxy()
        resp = request_post_data(searchURL, data, referer = refererURL, proxy = p)
        if resp == None or len(resp) == 0:
            invalid_proxy(p)
        else:
            return resp
    return resp
Example #10
def airfrance_parser(postData, dept_city, dest_city, year, month, day):

    tickets = []
    flights = {}
    result = {}
    result['para'] = {'ticket': tickets, 'flight': flights}
    result['error'] = 0

    mc = MC()
    mc.set_debug(True)

    p = get_proxy(source='airfranceFlight')
    result['proxy'] = p
    if p == None or p == '':
        result['error'] = PROXY_NONE
        return result

    #mc.set_proxy(p)
    try:
        url0 = 'http://www.airfrance.com.cn/'
        page0 = mc.req('get', url0, html_flag=True)

        mc.add_referer(RefererURL_0)
        url1 = SearchURL_0
        page1 = mc.req('post', url1, postData, paras_type=2, html_flag=True)

        mc.add_referer(RefererURL)
        url2 = SearchURL
        page2 = mc.req('post', url2, postData, paras_type=2, html_flag=True)

    except Exception, e:
        result['error'] = PROXY_INVALID
        return result
Example #11
def venere_comment(self, target_url, **kwargs):
    PROXY = get_proxy(source="Platform")
    x = time.time()
    proxies = {
        'http': 'socks5://' + PROXY,
        'https': 'socks5://' + PROXY
    }
    headers = {
        'User-agent': GetUserAgent()
    }

    try:
        page = requests.get(target_url, proxies=proxies, headers=headers, timeout=120)
        page.encoding = 'utf8'
        result = venere_comment_parser(page.text, target_url)
        if not result:
            update_proxy('Platform', PROXY, x, '23')
            self.retry()
        else:
            update_task(kwargs['mongo_task_id'])
            print "Success with " + PROXY + ' CODE 0'
            update_proxy('Platform', PROXY, x, '0')

        return result
    except Exception as exc:
        update_proxy('Platform', PROXY, x, '23')
        self.retry(exc=traceback.format_exc(exc))
Example #12
def get_cities(self, gid, country_id, offset):
    PROXY = get_proxy(source="Platform")
    x = time.time()
    proxies = {'http': 'socks5://' + PROXY, 'https': 'socks5://' + PROXY}
    headers = {'User-agent': GetUserAgent()}
    try:
        target_url = 'http://www.tripadvisor.cn/TourismChildrenAjax?geo={0}&offset={1}&desktop=true'.format(
            gid, offset)
        page = requests.get(target_url, headers=headers, proxies=proxies)
        page.encoding = 'utf8'
        content = page.text

        res = re.findall(
            'ta.store\(\'tourism.popularCitiesMaxPage\', \'(\d+)\'\);',
            content)

        has_next = False
        if res is not None and res != []:
            if offset < int(res[0]):
                has_next = True

        result = []
        for line in _parse_city(content=content, target_url=target_url):
            per_city = list(line)
            per_city.append(country_id)
            result.append(per_city)

        print insert_db(result)

        if has_next:
            get_cities.delay(gid, country_id, offset + 1)
    except Exception as exc:
        update_proxy('Platform', PROXY, x, '23')
        self.retry(exc=traceback.format_exc(exc))
Example #13
def GetData(tripType, orig, dest, deptDate, retDate):
    searchURL = "https://www.bookryanair.com/SkySales/Search.aspx"
    refererURL = "https://www.bookryanair.com/SkySales/booking.aspx?culture=en-gb&lc=en-gb&cmpid2=Google"

    data = {"fromaction": "Search.aspx", "SearchInput$TripType": tripType,
                "SearchInput$Orig": orig,
                "SearchInput$Dest": dest,
                "SearchInput$DeptDate": deptDate,
                "SearchInput$RetDate": retDate,
                "SearchInput$IsFlexible": "on",
                "SearchInput$PaxTypeADT": 1,
                "SearchInput$PaxTypeCHD": 0,
                "SearchInput$PaxTypeINFANT": 0,
                "SearchInput$AcceptTerms": "on",
                "__EVENTTARGET": "SearchInput$ButtonSubmit",
                }

    p = get_proxy()
    p = '221.181.104.11:8080'
    resp = request_post_data(searchURL, data, referer = refererURL, proxy = p,Accept="text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8")
    if resp == None or len(resp) == 0:
        #invalid_proxy(p)
        pass
    else:
        return resp
    return resp
Example #14
def tripadvisor_city_query_task(self, city_name, **kwargs):
    x = time.time()
    PROXY = get_proxy(source="Platform")
    proxies = {'http': 'socks5://' + PROXY, 'https': 'socks5://' + PROXY}
    headers = {'User-agent': GetUserAgent()}

    try:
        conn = pymysql.connect(host='10.10.180.145',
                               user='******',
                               passwd='hourong',
                               db='SuggestName',
                               charset="utf8")
        with conn as cursor:
            print(city_name)
            quote_string = quote(city_name.encode('utf8'))
            page = requests.get(
                'http://www.tripadvisor.cn/TypeAheadJson?interleaved=true&types=geo%2Ctheme_park%2Cair&neighborhood_geos=true&link_type=geo&details=true&max=6&hglt=true&query={0}&action=API&uiOrigin=GEOSCOPE&source=GEOSCOPE'
                .format(quote_string),
                proxies=proxies,
                headers=headers)
            page.encoding = 'utf8'
            content = page.text.replace('while(1);', '')
            for line in get_query_data(content=content,
                                       query_string=city_name):
                cursor.execute(
                    'insert into TripAdvisorSuggestCity (`QueryName`,`Name`,`coords`,`Url`) VALUES (%s,%s,%s,%s)',
                    line)
        conn.close()
        update_task(kwargs['task_id'])
        print "Success with " + PROXY + ' CODE 0'
        update_proxy('Platform', PROXY, x, '0')
    except Exception as exc:
        update_proxy('Platform', PROXY, x, '23')
        self.retry(exc=traceback.format_exc(exc))
Example #15
def elong_task_parser(content):

    contents = content.split('&')
    if len(contents) != 2:
        logger.error('elongFlight: wrong content format with %s' % content)
        return None
    location, origdate = contents[0].strip(), contents[1].strip()

    origday = datetime.datetime(string.atoi(origdate[0:4]),
                                string.atoi(origdate[4:6]),
                                string.atoi(origdate[6:]))
    urlday = (origday - datetime.datetime.today()).days
    dept_date = str(origday).split(' ')[0].strip()

    url = URL % (location, urlday)

    p = get_proxy()

    htmlcontent = crawl_single_page(url, proxy=p)
    if htmlcontent == '':
        invalid_proxy(p)
        logger.error(
            'elongFlight: Proxy Error: htmlcontent is null with proxy: %s' % p)
        return []

    # Check whether a navigation page was returned; a navigation page means the content has no flight info

    # Check whether flight info was found; return [] if not
    temp_flight_list = flightPattern.findall(htmlcontent)
    if len(temp_flight_list) == 1:
        logger.error('elongFilght: Parser Error: cannot find flights with %s' %
                     location)
        return []

    flights = []

    flight_list = temp_flight_list[:-1]

    typ = 0
    for item in flight_list:
        typ = len(typePattern.findall(item))
        if typ == 0:
            pass
        elif typ != 1:
            transfer_info = transferFlight_parser(item, dept_date,
                                                  airports_dict)
            if transfer_info != []:
                flights.append(transfer_info)
        else:
            direct_info = directFlight_parser(item, dept_date, airports_dict)
            if direct_info != []:
                flights.append(direct_info)

    flights = list(set(flights))
    #logger.info('Find %d airlines with %s'%(len(flights),location))

    return flights
Example #16
def smartfares_task_parser(taskcontent):
    result = {}
    flights = {}
    tickets = []
    result['para'] = {'flight':flights, 'ticket':tickets}
    result['error'] = 0

    taskcontent = taskcontent.encode('utf-8')
    try:
        dept_id, dest_id, dept_day, dest_day = \
            taskcontent.strip().split('&')[:4]
    except:
        logger.error('smartfaresFlight::Wrong Content Format with %s'%taskcontent)
        result['error'] = TASK_ERROR
        return result

    p = get_proxy(source='smartfaresFlight')
    #p= None
    if p == None:
        result['error'] = PROXY_NONE
        return result

    cj = cookielib.CookieJar()
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
    urllib2.install_opener(opener)

    try:
        search_url = get_search_url(dept_day,dest_day,dept_id,dest_id)
        content = crawl_single_page(search_url, proxy=p, referer=HOST)
        search_id = get_search_id(content)
        if search_id == '' or search_id == None:
            logger.error('smartfares::Parse search id failed')
            result['error'] = PROXY_INVALID
            return result
    except:
        logger.error('smartfares::Parse search id failed')
        result['error'] = PROXY_INVALID
        return result

    url_real = URL%search_id
    i = 0
    content_len = 0
    while i < 3 and content_len < CONTENT_LEN:
        content_real = crawl_single_page(url=url_real, proxy=p, referer=search_url)
        content_len = len(content_real)
        i += 1

    if len(content_real) > 100:
        parser_result = parsePage(content_real)
        tickets = parser_result['ticket']
        flights = parser_result['flight']
        result['para'] = {'flight':flights, 'ticket':tickets}
        return result
    else:
        result['error'] = DATA_NONE
        return result
Example #17
def elong_task_parser(content):

    contents = content.split('&')
    if len(contents) != 2:
        logger.error('elongFlight: wrong content format with %s'%content)
        return None
    location, origdate = contents[0].strip(),contents[1].strip()
    
    origday = datetime.datetime(string.atoi(origdate[0:4]),string.atoi(origdate[4:6]),string.atoi(origdate[6:]))
    urlday = (origday - datetime.datetime.today()).days
    dept_date = str(origday).split(' ')[0].strip()
    
    url = URL%(location,urlday)

    p = get_proxy()

    htmlcontent = crawl_single_page(url,proxy = p)
    if htmlcontent == '':
        invalid_proxy(p)
        logger.error('elongFlight: Proxy Error: htmlcontent is null with proxy: %s'%p)
        return []
    
    # Check whether a navigation page was returned; a navigation page means the content has no flight info

    # Check whether flight info was found; return [] if not
    temp_flight_list = flightPattern.findall(htmlcontent)
    if len(temp_flight_list) == 1:
        logger.error('elongFilght: Parser Error: cannot find flights with %s'%location)
        return []

    flights = []

    flight_list = temp_flight_list[:-1]

    typ = 0
    for item in flight_list:
        typ = len(typePattern.findall(item))
        if typ == 0:
            pass
        elif typ != 1:
            transfer_info = transferFlight_parser(item,dept_date,airports_dict)
            if transfer_info != []:
                flights.append(transfer_info)
        else:
            direct_info = directFlight_parser(item,dept_date,airports_dict)
            if direct_info != []:
                flights.append(direct_info)
    
    flights = list(set(flights))
    #logger.info('Find %d airlines with %s'%(len(flights),location))


    return flights
Example #18
def easyjet_task_parser(taskcontent):
    result = {}
    flights = {}
    tickets = []
    result['para'] = {'flight':flights, 'ticket':tickets}
    result['error'] = 0
    try:
        dept_id, dest_id, dept_day_temp = taskcontent.strip().split('&')[0], \
                taskcontent.strip().split('&')[1], \
                taskcontent.strip().split('&')[2]
    except:
        logger.error('easyjet::Wrong Content Format with %s'%taskcontent)
        result['error'] = TASK_ERROR
        return result

    search_url = get_search_url(dept_id, dest_id, dept_day_temp)

    p = get_proxy(source='easyjet')
    
    time_zone_A = airport[dept_id]
    time_zone_B = airport[dest_id]
    #print p
    #print search_url
    if p == None:
        result['error'] = PROXY_NONE
        return result

    cj = cookielib.CookieJar()
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
    urllib2.install_opener(opener)
    
    i = 0 
    content_len = 0
    while i < 3 and content_len < CONTENT_LEN:
        content = crawl_single_page(url=search_url, proxy=p, n=1, referer=HOST)
        content_len = len(content)
        i += 1
    
    if content == '' or content == None or len(content) < CONTENT_LEN:
        result['error'] = PROXY_INVALID
        return result

    para =  parsePage(content, p, time_zone_A, time_zone_B)
    
    if para == {'flight':{}, 'ticket':[]}:
        result['error'] = DATA_NONE
        return result
    else:
        flights = para['flight']
        tickets = para['ticket']
        result['para'] = {'ticket':tickets, 'flight':flights}
        return result
Example #19
def easyjet_task_parser(taskcontent):
    result = {}
    flights = {}
    tickets = []
    result['para'] = {'flight': flights, 'ticket': tickets}
    result['error'] = 0
    try:
        dept_id, dest_id, dept_day_temp = taskcontent.strip().split('&')[0], \
                taskcontent.strip().split('&')[1], \
                taskcontent.strip().split('&')[2]
    except:
        logger.error('easyjet::Wrong Content Format with %s' % taskcontent)
        result['error'] = TASK_ERROR
        return result

    search_url = get_search_url(dept_id, dest_id, dept_day_temp)

    p = get_proxy(source='easyjet')

    time_zone_A = airport[dept_id]
    time_zone_B = airport[dest_id]
    #print p
    #print search_url
    if p == None:
        result['error'] = PROXY_NONE
        return result

    cj = cookielib.CookieJar()
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
    urllib2.install_opener(opener)

    i = 0
    content_len = 0
    while i < 3 and content_len < CONTENT_LEN:
        content = crawl_single_page(url=search_url, proxy=p, n=1, referer=HOST)
        content_len = len(content)
        i += 1

    if content == '' or content == None or len(content) < CONTENT_LEN:
        result['error'] = PROXY_INVALID
        return result

    para = parsePage(content, p, time_zone_A, time_zone_B)

    if para == {'flight': {}, 'ticket': []}:
        result['error'] = DATA_NONE
        return result
    else:
        flights = para['flight']
        tickets = para['ticket']
        result['para'] = {'ticket': tickets, 'flight': flights}
        return result
Example #20
def wrapper(self, *args, **kw):
    if not self.flag:
        return func(self, *args, **kw)
    else:
        while self.times < 3:
            print "Retry count: %d" % self.times
            self.html, self.error = func(self, *args, **kw)
            if self.error == '':
                break
            else:
                p = get_proxy(source=self.source)
                self.set_proxy(p)
                self.times += 1
        return self.html, self.error
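The excerpt above is only the inner function of a retry decorator; the enclosing definition is not shown. A sketch of the assumed surrounding structure (the decorator name retry_with_proxy is an assumption; self.flag, self.times, self.html, self.error, self.source and set_proxy are attributes of the crawler class being decorated, exactly as used in the body above):

# Hypothetical enclosing decorator, reconstructed from the wrapper body.
def retry_with_proxy(func):
    def wrapper(self, *args, **kw):
        if not self.flag:
            return func(self, *args, **kw)
        while self.times < 3:
            print "Retry count: %d" % self.times
            self.html, self.error = func(self, *args, **kw)
            if self.error == '':
                break
            # request failed: switch to a fresh proxy and try again
            p = get_proxy(source=self.source)
            self.set_proxy(p)
            self.times += 1
        return self.html, self.error
    return wrapper

A fetch method on the crawler class could then be wrapped as, e.g., req = retry_with_proxy(req).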
Example #21
def get_site_url(self, target_url, source_id, table_name):
    PROXY = get_proxy(source="Platform")
    x = time.time()
    try:
        res = _get_site_url(target_url)
        if res == 'Error':
            update_proxy('Platform', PROXY, x, '23')
            self.retry()
        else:
            print "Success with " + PROXY + ' CODE 0'
            update_proxy('Platform', PROXY, x, '0')
            update_site_url(res, source_id, table_name=table_name)
    except Exception as exc:
        update_proxy('Platform', PROXY, x, '23')
        self.retry(exc=traceback.format_exc(exc))
Example #22
def GetInterPricePage(queryLogTransNo, cookie, referer, use_proxy = True):
    priceURL = "http://flights.ctrip.com/international/GetSubstepSearchResults.aspx?IsJSON=T&queryLogTransNo=%s&QueryType=1&cityPairAirline=first&withDirectAirline=T&RdNo=2103213618&ind=347,359,356,370" % queryLogTransNo
    
    #if use_proxy:
    # If the fetch fails, switch to another proxy IP and retry; the retry count is currently 0 (a single attempt)
    for i in range(1):
        p = get_proxy()
        resp = crawl_single_page(priceURL, referer=referer, proxy = p, cookie = cookie)
        if resp == None or len(resp) == 0:
            invalid_proxy(p)
        else:
            return resp
    #else:
        #resp = crawl_single_page(searchURL, cookie = cookie)

    return 
Example #23
def crawl(url):
    global PROXY
    mc = MC()
    #mc.set_debug(True)
    mc.set_proxy(PROXY)
    content = mc.req('get', url, html_flag = True,time_out=15)
    count = 0
    while len(content)<1000:
        invalid_proxy(PROXY,'Platform')
        PROXY = get_proxy(source = 'Platform')
        mc.set_proxy(PROXY)
        print 'proxy: %s' % PROXY
        content = mc.req('get', url, html_flag = True,time_out=15)
        count += 1
        if count > 10:
            break
    return content
Example #24
def _get_site_url(target_url):
    PROXY = get_proxy(source="Platform")
    proxies = {
        'http': 'socks5://' + PROXY,
        'https': 'socks5://' + PROXY
    }
    headers = {
        'User-agent': GetUserAgent()
    }
    page = requests.get(target_url, proxies=proxies, headers=headers, allow_redirects=False)
    source_site_url = page.headers['location']
    print source_site_url
    # source_site_url = page.url
    if source_site_url != '' and source_site_url is not None:
        return source_site_url.replace('#_=_', '')
    else:
        return "Error"
Example #25
def crawl(url):
    global p
    mc = MC()
    #mc.set_debug(True)
    mc.set_proxy(p)
    print 'proxy:',p
    content = mc.req('get', url, html_flag = True,time_out=20)
    count = 0
    while len(content) < 2000:
        invalid_proxy(p,'Platform')
        p = get_proxy(source = 'Platform')
        mc.set_proxy(p)
        print p
        content = mc.req('get', url, html_flag = True,time_out=20)
        count += 1
        if count>5:
            break
    return content
Example #26
def crawl(url):
    global PROXY
    mc = MC()
    mc.set_proxy(PROXY)
    content = mc.req('get', url, html_flag = True)
    count = 0
    while len(content)<1000:
        invalid_proxy(PROXY,'Platform')
        PROXY = get_proxy(source = 'Platform')
        mc.set_proxy(PROXY)
        print 'proxy: %s' % PROXY
        content = mc.req('get', url, html_flag = True)
        count += 1
        if count > 10:
            break
    #open('test.html','w').write(content)
    #content = open('test.html','r').read()
    return content
Example #27
def GetInterPricePage(queryLogTransNo, cookie, referer, use_proxy=True):
    priceURL = "http://flights.ctrip.com/international/GetSubstepSearchResults.aspx?IsJSON=T&queryLogTransNo=%s&QueryType=1&cityPairAirline=first&withDirectAirline=T&RdNo=2103213618&ind=347,359,356,370" % queryLogTransNo

    #if use_proxy:
    # If the fetch fails, switch to another proxy IP and retry; the retry count is currently 0 (a single attempt)
    for i in range(1):
        p = get_proxy()
        resp = crawl_single_page(priceURL,
                                 referer=referer,
                                 proxy=p,
                                 cookie=cookie)
        if resp == None or len(resp) == 0:
            invalid_proxy(p)
        else:
            return resp
    #else:
    #resp = crawl_single_page(searchURL, cookie = cookie)

    return
Example #28
def detail_page(self, pid, page_num, city_id, part):
    x = time.time()
    PROXY = get_proxy(source="Platform")
    proxies = {
        'http': 'socks5://' + PROXY,
        'https': 'socks5://' + PROXY
    }
    headers = {
        'User-agent': GetUserAgent(),
    }

    try:
        data = {
            u'page': unicode(page_num),
            u'type': u'city',
            u'pid': unicode(pid),
            u'sort': u'32',
            u'subsort': u'all',
            u'isnominate': u'-1',
            u'haslastm': u'false',
            u'rank': u'6'
        }
        json_page = requests.post(u'http://place.qyer.com/poi.php?action=list_json', data=data, proxies=proxies,
                                  headers=headers)
        json_page.encoding = u'utf8'
        content = json_page.text
        j_data = json.loads(content)
        task_data = []
        url_result = []
        for attr in j_data[u'data'][u'list']:
            worker = u'qyer_poi_task'
            args = json.dumps(
                {u'target_url': unicode(u'http:' + attr[u'url']), u'city_id': unicode(city_id)})
            task_id = get_task_id(worker=worker, args=args)
            task_data.append((task_id, worker, args, unicode(part.replace('list', 'detail'))))
            url_result.append(u'http:' + attr[u'url'])
        result = insert_task(data=task_data)
        print result
        print url_result
        return result
    except Exception as exc:
        update_proxy('Platform', PROXY, x, '23')
        self.retry(exc=traceback.format_exc(exc))
Example #29
def vote(self):
    import httplib
    httplib.HTTPConnection.debuglevel = 1
    httplib.HTTPSConnection.debuglevel = 1
    PROXY = get_proxy(source="Platform")
    proxies = {
        'http': 'socks5://' + PROXY,
        'https': 'socks5://' + PROXY
    }
    print "Now Proxy is " + PROXY
    headers = {
        'User-agent': GetUserAgent(),
        'Referer': 'http://www.travelmeetingsawards-china.com/Events/Awards2015Business/Readers-Voting/?cat=5',
        'Host': 'www.travelmeetingsawards-china.com',
        'Origin': 'http://www.travelmeetingsawards-china.com',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate',
        # 'Cookie': 'EktGUID=91ea164d-e2c6-4748-8e31-33c05e6e5439; EkAnalytics=0; ASP.NET_SessionId=piy2livrdw4nb4vulygiet4y; awardvotes=[{"AwardEventID":7,"AwardCategoryID":5,"AwardSubCategoryID":98,"Datetime":"\/Date(1492764048212)\/"}]; s_cc=true; s_nr=1492766246608-New; _ga=GA1.2.1289463038.1492764050; _gat=1; ecm=user_id=0&isMembershipUser=0&site_id=&username=&new_site=/&unique_id=0&site_preview=0&langvalue=0&DefaultLanguage=2052&NavLanguage=2052&LastValidLanguageID=2052&DefaultCurrency=840&SiteCurrency=840&ContType=&UserCulture=1033&dm=www.travelmeetingsawards-china.com&SiteLanguage=2052; s_sq=ntmntmmcchina%3D%2526pid%253D(5105)%252520%2525E8%2525AF%2525BB%2525E8%252580%252585%2525E6%25258A%252595%2525E7%2525A5%2525A8%252520-%2525202017%2525E4%2525B8%2525AD%2525E5%25259B%2525BD%2525E6%252597%252585%2525E6%2525B8%2525B8%2525E4%2525B8%25259A%2525E7%252595%25258C%2525E5%2525A5%252596%2525EF%2525BC%252588%2525E5%252595%252586%2525E5%25258A%2525A1%2525E7%2525B1%2525BB%2525EF%2525BC%252589%2525E8%2525AF%252584%2525E9%252580%252589%252520%25257C%2526pidt%253D1%2526oid%253DVote%252520%2525E6%25258A%252595%2525E7%2525A5%2525A8%2526oidt%253D3%2526ot%253DSUBMIT'
    }

    # data = {
    #   '__VIEWSTATE': '/wEPDwUKLTQ0MDg4MzI3MWRkhc6az5DCGMMce+MYab5BPdm3oOCc0QhMXjgPO+KlHJc=',
    #    '__VIEWSTATEGENERATOR': 'C57773B4',
    #    '__EVENTVALIDATION': '/wEdAApdhN7azgIf7udjNG5rBO36uJWyBmoVrn+KGuzxsc+IdAhrj7iGCUNTOfLFH3a+X2zXZyb9ZhM4Agf2PTEzU0NRt9vByiAtAO532pQGgxLMkPxQ4KIC5CcITHzHErIOKsL+X/4YFsqB/WKj97Ohz20ZIOo7mLBzjoLYCKAW/gNPwcKu4LFvmYccMsvGxcqsoFFypiSNmMf2UIdcHp3gKJUE1+/bEdftTH+meRV6Ro2Ps7Lou2EFvxJCcav33eyACAc=',
    #    'ctl00$cphMain$ucVoting$rptVotingList$ctl02$rptTopThreeList$ctl02$btnVote': 'Vote 投票'
    # }
    data = {
        '__VIEWSTATE': '/wEPDwUKLTQ0MDg4MzI3MWRkhc6az5DCGMMce+MYab5BPdm3oOCc0QhMXjgPO+KlHJc=',
        '__VIEWSTATEGENERATOR': 'C57773B4',
        '__EVENTVALIDATION': '/wEdAApdhN7azgIf7udjNG5rBO36uJWyBmoVrn+KGuzxsc+IdAhrj7iGCUNTOfLFH3a+X2zXZyb9ZhM4Agf2PTEzU0NRt9vByiAtAO532pQGgxLMkPxQ4KIC5CcITHzHErIOKsL+X/4YFsqB/WKj97Ohz20ZIOo7mLBzjoLYCKAW/gNPwcKu4LFvmYccMsvGxcqsoFFypiSNmMf2UIdcHp3gKJUE1+/bEdftTH+meRV6Ro2Ps7Lou2EFvxJCcav33eyACAc=',
        'ctl00$cphMain$ucVoting$rptVotingList$ctl02$rptTopThreeList$ctl00$btnVote': 'Vote 投票'
    }
    session = requests.session()
    session.proxies = proxies
    session.headers.update(headers)
    ip_page = requests.get('https://api.ipify.org?format=json', proxies=proxies)
    out_ip = json.loads(ip_page.text)['ip']
    page = session.get('http://www.travelmeetingsawards-china.com/Events/Awards2015Business/Readers-Voting/?cat=5')
    page = session.post('http://www.travelmeetingsawards-china.com/Events/Awards2015Business/Readers-Voting/?cat=5',
                        data=data)
    save_ip(out_ip, PROXY)
    return out_ip
Example #30
def tp_rest_city_page(self, city_url, city_id, part):
    PROXY = get_proxy(source="Platform")
    x = time.time()
    proxies = {
        'http': 'socks5://' + PROXY,
        'https': 'socks5://' + PROXY
    }
    print "Now Proxy is " + PROXY
    headers = {
        'User-agent': GetUserAgent()
    }
    page = requests.get(city_url, proxies=proxies, headers=headers)
    page.encoding = 'utf8'
    if len(page.text) < 100:
        update_proxy('Platform', PROXY, x, '23')
        self.retry()
    doc = PyQuery(page.text)
    doc.make_links_absolute(city_url)
    for item in doc('.restaurants.twoLines a').items():
        tp_rest_list_page_num.delay(item.attr.href, city_id, part)
Example #31
def get_long_comment(self, target_url, language, miaoji_id, special_str):
    PROXY = get_proxy(source="Platform")
    x = time.time()
    proxies = {
        'http': 'socks5://' + PROXY,
        'https': 'socks5://' + PROXY
    }
    headers = {
        'User-agent': GetUserAgent()
    }

    try:
        page = requests.get(target_url, headers=headers, proxies=proxies, timeout=120)
        page.encoding = 'utf8'
        data = long_comment_parse(page.content, target_url, language, miaoji_id)
        update_proxy('Platform', PROXY, x, '0')
        print "Success with " + PROXY + ' CODE 0'
        return insert_db((data,), 'tp_comment_' + special_str)
    except Exception as exc:
        update_proxy('Platform', PROXY, x, '23')
        self.retry(exc=traceback.format_exc(exc))
Example #32
def get_comment(self, target_url, language, miaoji_id, special_str, **kwargs):
    if language == 'en':
        data = {
            'mode': 'filterReviews',
            'filterLang': 'en'
        }
    elif language == 'zhCN':
        data = {
            'mode': 'filterReviews',
            'filterLang': 'zh_CN'
        }
    else:
        return "Error, no such language"

    PROXY = get_proxy(source="Platform")
    x = time.time()
    proxies = {
        'http': 'socks5://' + PROXY,
        'https': 'socks5://' + PROXY
    }
    headers = {
        'User-agent': GetUserAgent()
    }

    if data != '':
        try:
            page = requests.post(target_url, data, headers=headers, proxies=proxies, timeout=120)
            page.encoding = 'utf8'
            res = parse(page.text, target_url, language, miaoji_id, special_str)
            if res == 0:
                update_proxy('Platform', PROXY, x, '23')
                self.retry(countdown=120)
            else:
                # update_task(kwargs['mongo_task_id'])
                update_proxy('Platform', PROXY, x, '0')
                print "Success with " + PROXY + ' CODE 0'
        except Exception as exc:
            update_proxy('Platform', PROXY, x, '23')
            self.retry(exc=traceback.format_exc(exc), countdown=120)
Example #33
def get_lost_rest_new(self, target_url, city_id, **kwargs):
    PROXY = get_proxy(source="Platform")
    x = time.time()
    proxies = {
        'http': 'socks5://' + PROXY,
        'https': 'socks5://' + PROXY
    }
    headers = {
        'User-agent': GetUserAgent()
    }
    try:
        page = requests.get(target_url, headers=headers, proxies=proxies, timeout=15)
        page.encoding = 'utf8'
        result = rest_parser(page.content, target_url, city_id)
        if result == 'Error':
            self.retry()
        else:
            update_task(task_id=kwargs['mongo_task_id'])
            update_proxy('Platform', PROXY, x, '0')
        return result
    except Exception as exc:
        self.retry(exc=traceback.format_exc(exc))
Example #34
def shutter_spider(self, vid, search_kw, debug=False, **kwargs):
    """
    shutterstock image-search crawler
    """
    if search_kw is None or search_kw == "null":
        # todo logging null key words
        return None
    x = time.time()
    spider_proxy = 'socks5://' + get_proxy(source="Platform")
    try:
        spider = ShutterShockPicSpider(search_kw, spider_proxy, debug)
        pic_ret = spider.pic_search()
        pic_save_data = shutter_pic_data_assembly(vid, search_kw, pic_ret)
        spider_db = PicModel(**save_db_config)
        for _, save_data_map in pic_save_data.items():
            spider_db.insert_pic_many(save_data_map["table"],
                                      save_data_map["fields"],
                                      save_data_map["values"])
        update_task(kwargs['task_id'])
    except Exception as exc:
        update_proxy('Platform', spider_proxy, x, '23')
        self.retry(exc=traceback.format_exc(exc))
Example #35
def get_pid_total_page(self, target_url, city_id, part):
    x = time.time()
    PROXY = get_proxy(source="Platform")
    proxies = {
        'http': 'socks5://' + PROXY,
        'https': 'socks5://' + PROXY
    }
    headers = {
        'User-agent': GetUserAgent()
    }
    try:
        html_page = requests.get(target_url, proxies=proxies, headers=headers)
        html_page.encoding = u'utf8'
        content = html_page.text
        pid = re.findall(u'PID :\'(\d+)\'', content)[0]
        total_attr = re.findall(u'景点\((\d+)\)', content)[0]
        # return pid, (int(total_attr) // 15) + 1
        print pid, total_attr
        for page_num in range(1, (int(total_attr) // 15) + 2):
            detail_page.delay(pid, page_num, city_id, part)
    except Exception as exc:
        update_proxy('Platform', PROXY, x, '23')
        self.retry(exc=traceback.format_exc(exc))
Example #36
def qyer_city_spider(self,
                     country_id,
                     country_en,
                     country_link,
                     debug=False,
                     **kwargs):
    """
    Crawl city data from Qyer (穷游).
    country_id:
        int, index country info
    country_en:
        str, country name in English
    country_link:
        str.
    """
    if country_en in city_state:
        country_type = "city_state"
    else:
        country_type = "city_list"
    http_tools = init_qyer_session(debug=True)
    x = time.time()
    country_args = {"country_en": country_en, "country_id": country_id}
    spider_proxy = "socks5://" + get_proxy(source="Platform")
    qyer_db = QyerModel(**save_db_config)

    try:
        spider_ret = http_tools(country_link, proxy=spider_proxy)
        status_code = spider_ret[1]
        if status_code != 200 and status_code != 404:
            raise Exception(str(status_code))

        save_data = platform_page_parse(country_type, spider_ret[0],
                                        **country_args)
        qyer_db.insert_many_data(*save_data)
    except Exception as exc:
        update_proxy('Platform', spider_proxy, x, '23')
        self.retry(exc=traceback.format_exc(exc))
Example #37
def qyer_city_query_task(self, city_name, **kwargs):
    x = time.time()
    PROXY = get_proxy(source="Platform")
    proxies = {'http': 'socks5://' + PROXY, 'https': 'socks5://' + PROXY}
    headers = {
        'User-agent': GetUserAgent(),
        'Referer': "http://www.qyer.com/",
    }

    try:
        conn = pymysql.connect(host='10.10.180.145',
                               user='******',
                               passwd='hourong',
                               db='SuggestName',
                               charset="utf8")
        with conn as cursor:
            print(city_name)
            quote_string = quote(city_name.encode('utf8'))
            page = requests.get(
                'http://www.qyer.com/qcross/home/ajax?action=search&keyword={0}'
                .format(quote_string),
                proxies=proxies,
                headers=headers)
            page.encoding = 'utf8'
            content = page.text.replace('while(1);', '')
            for line in get_query_data(content=content,
                                       query_string=city_name):
                cursor.execute(
                    'insert into QyerSuggestCity (`QueryName`,`Name`,`BelongName`,`Url`) VALUES (%s,%s,%s,%s)',
                    line)
        conn.close()
        update_task(kwargs['task_id'])
        print "Success with " + PROXY + ' CODE 0'
        update_proxy('Platform', PROXY, x, '0')
    except Exception as exc:
        update_proxy('Platform', PROXY, x, '23')
        self.retry(exc=traceback.format_exc(exc))
Example #38
def get_daodao_image_url(self, source_url, mid, **kwargs):
    PROXY = get_proxy(source="Platform")
    x = time.time()
    proxies = {
        'http': 'socks5://' + PROXY,
        'https': 'socks5://' + PROXY
    }
    print "Now Proxy is " + PROXY
    headers = {
        'User-agent': GetUserAgent()
    }

    try:
        detail_id = re.findall('-d(\d+)', source_url)[0]
        target_url = 'http://www.tripadvisor.cn/LocationPhotoAlbum?detail=' + detail_id
        page = requests.get(target_url, proxies=proxies, headers=headers, timeout=240)
        page.encoding = 'utf8'
        if not page.text:
            update_proxy('Platform', PROXY, x, '23')
            self.retry()
        else:
            print "Success with " + PROXY + ' CODE 0'
            root = PyQuery(page.text)
            images_list = []
            for div in root('.photos.inHeroList div').items():
                images_list.append(div.attr['data-bigurl'])
            img_list = '|'.join(images_list)
            if img_list == '':
                self.retry()
            data = (mid, source_url, img_list)
            print insert_daodao_image_list(data)
            update_proxy('Platform', PROXY, x, '0')
            update_task(kwargs['mongo_task_id'])
        return data
    except Exception as exc:
        update_proxy('Platform', PROXY, x, '23')
        self.retry(exc=traceback.format_exc(exc))
Example #39
def youzhan_task_parser(taskcontent):
    all_info = []
    room_list = []
    taskcontent = taskcontent.encode('utf-8').strip()
    hotel_id = taskcontent.split('&')[0]
    star = taskcontent.split('&')[2]
    ipathid = taskcontent.split('&')[1]
    city = taskcontent.split('&')[3]
    country = taskcontent.split('&')[4]
    #room_type = taskcontent.split('&')[3]
    from_date_temp = taskcontent.split('&')[5]
    from_date = from_date_temp[:4] + '-' + from_date_temp[4:6] + '-' \
                + from_date_temp[6:]
    to_date_temp = datetime.datetime(int(from_date_temp[:4]), int(from_date_temp[4:6]), \
                                     int(from_date_temp[6:]))
    to_date = str(to_date_temp + datetime.timedelta(days = 1))[:10]

    # Get a proxy
    
    p = get_proxy()

    #if p == "":
        #logger.error("get proxy failed")
        #return None
    
    hotel = Hotel()
    room = Room()

    rating_url = get_rating_url(hotel_id)
    rating_page = crawl_single_page(rating_url, proxy=p)
    
    grade_str = grade_parser(rating_page)
    
    if grade_str != '':
        hotel.grade = grade_str[:-1]
    else:
        pass
        #logger.error('Error: No grade_str found!')

    map_url = get_map_url(hotel_id)
    map_page = crawl_single_page(map_url, proxy=p)
    #print map_page
    map_info_list = staticmap_parser(map_page)
    if map_info_list != []:
        hotel.hotel_name = map_info_list[1]
        if is_alphabet(hotel.hotel_name.decode('utf-8')) == True:
            hotel.hotel_name_en = hotel.hotel_name
        else:
            hotel.hotel_name_en = 'NULL'
        hotel.map_info = map_info_list[0]
    else:
        logger.error('youzhanHotel: Map info does not contain hotel name and map_info')
        return []

    info_url = get_info_url(hotel_id,from_date,to_date)
    info_page = crawl_single_page(info_url,proxy=p)
    if info_page == '':
        #invalid_proxy(p)
        return []
    info_list = info_parser(info_page)

    if info_list != []:
        hotel.country = country
        hotel.city = city
        hotel.address = info_list[1]
        hotel_desc_temp = info_list[3].replace('&lt;br/&gt;','').replace('&#039;','')
        if hotel_desc_temp != '':
            hotel.description = hotel_desc_temp
        else:
            hotel.description = 'NULL'
        hotel.service = info_list[4]

        if '停车场' in hotel.service:
            hotel.has_parking = 'Yes'
        if '无线网络' in hotel.service or 'wifi' in hotel.service:
            hotel.has_wifi = 'Yes'
    else:
        return []

    hotel.source = 'youzhan'
    hotel.source_id = hotel_id
    hotel.star = star

    price_url = get_price_url(hotel_id,ipathid,from_date,to_date)
    price_page = crawl_single_page(price_url,proxy=p)
    price_list = price_parser(price_page,hotel_id)
    #print '********'
    #print price_list
    if price_list != []:
        for each_room in price_list:
            if len(each_room) > 3:
                room.city = city
                room.occupancy = 2
                room.hotel_name = hotel.hotel_name
                #print '******'
                #print each_room
                room.room_desc = each_room[3]
                room.real_source = each_room[2]
                

                num = each_room[3].find('-')
                if num > 0:
                    if len(each_room[3][:num]) < 20:
                        room.room_type = each_room[3][:num]
                    else:
                        room.room_type = 'NULL'
                else:
                    if len(each_room[3]) < 20:
                        room.room_type = each_room[3]
                    else:
                        room.room_type = 'NULL'
            
                if each_room[0] != u'nbsp;':
                    room.price = each_room[0]
                room.has_breakfast = each_room[1]
                room.room_desc = each_room[3]

                if '免费WiFi' in room.room_desc:
                    hotel.is_wifi_free = 'Yes'
                
                if '免费取消' in room.room_desc:
                    hotel.is_cancel_free = 'Yes'

                room.currency = 'CNY'
                room.source = 'youzhan'
                room.source_hotelid = hotel_id
                room.check_in = from_date
                room.check_out = to_date

                room_tuple = (room.hotel_name,room.city,room.source,room.source_hotelid,\
                    room.source_roomid,room.real_source,room.room_type,room.occupancy,\
                    room.bed_type,room.size,room.floor,room.check_in,room.check_out,room.price,\
                    room.tax,room.currency,room.is_extrabed,room.is_extrabed_free,room.has_breakfast,\
                    room.is_breakfast_free,room.is_cancel_free,room.room_desc)
                room_list.append(room_tuple)

    hotel_tuple = (hotel.hotel_name, hotel.hotel_name_en,hotel.source,hotel.source_id,hotel.brand_name,\
        hotel.map_info,hotel.address,hotel.city,hotel.country,hotel.postal_code, \
        hotel.star,hotel.grade,hotel.has_wifi,hotel.is_wifi_free,hotel.has_parking,\
        hotel.is_parking_free,hotel.service,hotel.img_items,hotel.description)
    hotel_list = []
    hotel_list.append(hotel_tuple)
    all_info.append(hotel_list)
    all_info.append(room_list)

    return all_info
Example #40
        dept_id = infos[0]
        dest_id = infos[1]
        day, month, year = infos[2][6:], infos[2][4:6], infos[2][0:4]
        dept_date = month + '/' + day + '/' + year
        rday, rmonth, ryear = infos[3][6:], infos[3][4:6], infos[3][0:4]
        dest_date = rmonth + '/' + rday + '/' + ryear

    except Exception, e:
        logger.error('feiquanqiuRoundFlight: Wrong Content Format with %s'%content)
        result['error'] = TASK_ERROR
        return result

    url = URL%(dept_id, dest_id, dept_date, dest_date)
    referer = REFERER%(dept_id, dest_id, dept_date, dest_date)

    p = get_proxy(source='feiquanqiuRoundFlight')
    if p == None:
        result['error'] = PROXY_NONE
        return result
    
    mc = MechanizeCrawler(p=p, referer=referer)

    page = mc.get(url, html_flag = True)
    if page == None:
        logger.info('feiquanqiuRoundFlight: htmlcontent is null with %s'%p)
        result['error'] = PROXY_INVALID
        return result

    flights = parsePage(page)
    if flights == None:
        result['error'] = DATA_NONE
Example #41
        dest_id = infos[1]  # airport 3-letter (IATA) code
        day, month, year = infos[2][6:], infos[2][4:6], infos[2][0:4]
        dept_date = year+'-'+month+'-'+day
        dept_date_url = year[-2:] + month + day  # e.g. 140627

    except Exception, e:
        logger.error('ceairFlight: Wrong Content Format with %s'%content)
        result['error'] = TASK_ERROR
        return result

    if AIRPORT_CITY_DICT.has_key(dept_id) == False or AIRPORT_CITY_DICT.has_key(dest_id) == False:
        logger.warning('ceairFlight: airport not in AIRPORT_CITY_DICT')
        result['error'] = DATA_NONE
        return result

    p = get_proxy(source = 'ceairFlight')

    if p == None:
        result['error'] = PROXY_NONE
        return result

    postdata = getPostData(dept_id,dest_id,dept_date)

    if postdata == '':
        result['error'] = UNKNOWN_TYPE
        return result
    
    rand = str(random.random())
    referer = RefererURL%(AIRPORT_CITY_DICT[dept_id].lower(), AIRPORT_CITY_DICT[dest_id].lower(), dept_date_url)
    searchurl = SearchURL%str(rand)
Example #42
    # Parse the departure and destination city 3-letter codes and the departure date from taskcontent
    try:
        dept_code, dest_code, dept_date = taskcontent.strip().split('&')[0], \
                taskcontent.strip().split('&')[1], \
                taskcontent.strip().split('&')[2]

        dept_day = dept_date[:4] + '-' + dept_date[4:6] + '-' + dept_date[6:]
        dept_year= dept_date[:4]
    except Exception,e:
        logger.error('feifanFlight: wrong content format with %s'%taskcontent)
        result['error'] = TASK_ERROR
        return result
    
    # Get a proxy
    p = get_proxy(source='feifanFlight')

    if p == None:
        result['error'] = PROXY_NONE
        return result

    # Build the URL and check that it is usable
    url = get_url(dept_code, dest_code, dept_day)
    
    if url == '' or url == None:
        logger.error('feifanFlight: Get url failed!')
        result['error'] = UNKNOWN_TYPE
        return result

    # Fetch the page and check that it is usable
    # feifan often needs a refresh before returning content, so crawl up to 3 times
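The fragment cuts off here; based on the identical pattern in Examples #18 and #45, the crawl loop this comment introduces presumably looks like the following (a sketch of the assumed continuation, not the original code; the referer value is a guess):

    # Assumed continuation: retry up to 3 times until enough content comes back
    i = 0
    content_len = 0
    while i < 3 and content_len < CONTENT_LEN:
        content = crawl_single_page(url=url, proxy=p, n=1, referer=HOST)
        content_len = len(content)
        i += 1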
Example #43
        hotel_name = taskcontent.strip().split('&')[1]
        map_info = taskcontent.strip().split('&')[2]
        city_name_zh = taskcontent.strip().split('&')[3]
        city_name_en = taskcontent.strip().split('&')[4]
        country_name_zh = taskcontent.strip().split('&')[5]
        check_in_day_temp = taskcontent.strip().split('&')[6]
        check_in_day = check_in_day_temp[:4] + '-' + check_in_day_temp[4:6] + '-' + check_in_day_temp[6:]
        check_out_day_temp = datetime.datetime(int(check_in_day_temp[:4]),int(check_in_day_temp[4:6]), int(check_in_day_temp[6:]))
        check_out_day = str(check_out_day_temp  + datetime.timedelta(days = 1))[:10]

    except Exception, e:
        logger.error('biyiHotel: Wrong Content Format with %s'%taskcontent)
        result['error'] = TASK_ERROR
        return result
   
    p = get_proxy(source='biyiHotel')
    print p
    if p == None:
        result['error'] = PROXY_NONE
        return result

    first_url = 'http://www.biyi.cn/'
    url = get_url(hotel_name, city_name_en, check_in_day, check_out_day)
    
    cj = cookielib.CookieJar()
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
    urllib2.install_opener(opener)

    resp = crawl_single_page(url=first_url, proxy=p, Accept=accept, referer=first_url, n=1)
    #for x in cj:
    #    print x
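
The check-in/check-out slicing above can be written more directly with datetime; a small sketch of the same conversion (the helper name is illustrative):

import datetime

def one_night_stay(yyyymmdd):
    # '20140627' -> ('2014-06-27', '2014-06-28'): check-out is one night later
    check_in = datetime.datetime.strptime(yyyymmdd, '%Y%m%d')
    check_out = check_in + datetime.timedelta(days=1)
    return check_in.strftime('%Y-%m-%d'), check_out.strftime('%Y-%m-%d')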
Example #44
    result['error'] = 0

    # parse the task string
    content = content.encode('UTF-8').strip()
    try:
        info = content.split('&')
        dept_id = info[0]
        arr_id = info[1]
        dept_date = info[2][:4] + '-' + info[2][4:6] + '-' + info[2][6:]
    except Exception,e:
        logger.error('wegoFlight Content Error: cannot extract information from %s'%content)
        result['error'] = TASK_ERROR
        return result

    # get a proxy
    p = get_proxy(source = 'wegoFlight')

    if p == None:
        result['error'] = PROXY_NONE
        return result
    
    # build the initial url
    url_temp = get_url(dept_id,arr_id,dept_date)
    search_id = get_search_id(url_temp,proxy = p)

    if search_id == '':
        logger.error('Search_Id Error: get Search_Id failed')
        invalid_proxy(proxy=p, source='wegoFlight')
        result['error'] = PROXY_FORBIDDEN
        return result
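
The get-proxy, crawl, invalidate-on-failure dance repeats in almost every parser here; a hedged sketch of it as one helper, assuming get_proxy, invalid_proxy and crawl_single_page behave as they do in the surrounding examples:

def crawl_with_proxy(url, source, referer=None):
    p = get_proxy(source=source)
    if p == None:
        return None, PROXY_NONE
    content = crawl_single_page(url, proxy=p, referer=referer)
    if content == None or content == '':
        # a dead proxy is reported back so it is not handed out again
        invalid_proxy(proxy=p, source=source)
        return None, PROXY_INVALID
    return content, 0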
Example #45
                taskcontent.split('&&')[2], taskcontent.split('&&')[3], \
                taskcontent.split('&&')[4]
    except Exception, e:
        logger.error('haodingHotel::Cannot parse task content with error: ' + str(e))
        return {'para':[], 'error':TASK_ERROR}

    check_in = check_in_temp[:4] + '-' + check_in_temp[4:6] + '-' + check_in_temp[6:]
    check_out_temp = datetime.datetime(int(check_in_temp[:4]), int(check_in_temp[4:6]), \
            int(check_in_temp[6:]))
    check_out = str(check_out_temp + datetime.timedelta(days=1))[:10]

    hotel_url = get_hotel_url(city_name_zh,city_id,hotel_id,check_in,check_out)
    
    #p = get_proxy()
    #print p
    p = get_proxy(source='haodingHotel')
    if p == '' or p == None:
        return {'para':[], 'error':NO_PROXY}
    
    i = 0
    content_len = 0
    while i < 3 and content_len < CONTENT_LEN:
        content = crawl_single_page(hotel_url, p)
        content_len = len(content)
        i += 1

    if content == '' or content == None:
        invalid_proxy(proxy = p, source='haodingHotel')
        return {'para':[], 'error':NO_CONTENT}

    if len(content) < CONTENT_LEN:
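
The crawl-up-to-three-times loop in this example is a recurring pattern for flaky pages; a sketch of it factored into a helper, assuming crawl_single_page as used in these examples:

def fetch_until_complete(url, proxy, min_len, attempts=3, referer=None):
    # retry until the body looks complete (at least min_len characters)
    content = ''
    for _ in range(attempts):
        content = crawl_single_page(url, proxy=proxy, referer=referer) or ''
        if len(content) >= min_len:
            break
    return content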
Example #46
    taskcontent = taskcontent.encode('utf-8')
    try:
        dept_city_zh,dept_city_en,dest_city_zh,dest_city_en,dept_day_temp = \
                taskcontent.strip().split('&')[0],  \
                taskcontent.strip().split('&')[1],  \
                taskcontent.strip().split('&')[2],  \
                taskcontent.strip().split('&')[3],  \
                taskcontent.strip().split('&')[4]
        dept_day = dept_day_temp[:4] + '-' + dept_day_temp[4:6] + '-' + dept_day_temp[6:]
    except Exception,e:
        logger.error('jijitongFlight:Wrong Content Format with %s'%taskcontent)
        result['error'] = TASK_ERROR
        return result
    
    p = get_proxy(source='jijitongFlight')
    if p == None:
        result['error'] = PROXY_NONE
        return result

    first_url = FIRST_URL % (dept_city_en,dest_city_en,dept_day_temp)
    cj = cookielib.CookieJar()
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
    urllib2.install_opener(opener)

    resp = crawl_single_page(first_url,proxy=p, \
         Accept='text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', n = 1)

    if resp and resp.find('404错误') < 0:  # proceed only if the site did not return its "404 error" page
        url = get_url(dept_city_zh,dest_city_zh,dept_day)
        page = crawl_single_page(url, proxy = p, referer = first_url)
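
Several of these parsers first install a cookie-aware urllib2 opener so that the follow-up request reuses the session cookies set by the landing page; a minimal sketch of that setup:

import cookielib
import urllib2

def install_cookie_opener():
    # every later urllib2 request will now send and receive cookies through cj
    cj = cookielib.CookieJar()
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
    urllib2.install_opener(opener)
    return cj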
Example #47
def csair_task_parser(taskcontent):
  result = {}
  multi_ticket = []
  one_flight = {}
  result['para'] = {'flight':one_flight, 'ticket':multi_ticket}
  result['error'] = 0
  try:
      param_list = taskcontent.strip().split('&')
      url = 'http://b2c.csair.com/B2C40/detail-' + param_list[0] + param_list[1] + '-' + param_list[2] \
            + '-1-0-0-0-1-0-1-0-1-0.g2c'
  except:
      logger.info('csairFlight: url param is not valid')
      result['error'] = TASK_ERROR
      return result
  # initialize all params
  dic_flightdate = {}
  multi_price = []
  select_time = 0
  Flag1 = False
  Flag2 = False
  page_flag = False
  cj = cookielib.CookieJar()
  opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
  urllib2.install_opener(opener)
  task_content_proxy = get_proxy(source='csairFlight')
  if task_content_proxy == None:
     result['error'] = PROXY_NONE
     return result
  html = crawl_single_page(url, proxy = task_content_proxy)
  if html == '' or html == None:
    result['error'] = PROXY_INVALID
    return result
  pattern = re.compile(r'\s*<FLIGHTS>\s*')
  match = pattern.search(html)
  if match and len(html) > CONTENT_LEN:
    dom = etree.fromstring(html)
    for ele in dom.iter():
         if ele.tag is not None:
           if ele.tag in  word_flightdate:
              #print ele.tag, ele.text
              dic_flightdate[ele.tag] = ele.text
           elif ele.tag in  word_parent_list:
            page_flag = True #node of DateFIGHT
            multi_flight = []
            Flight = nanhang_flight()
            select_time += 1
            flight_num = 0
            ticket_dur_list = []
            for word in ele:
              if word.tag in word_list[0]:             
                flight_num += 1
                dic_flight = {}
                EachFlight = nanhang_eachflight()
                for word_child in word:
                  if word_child.tag  in word_child_list[0]:
                    Flag1 = True
                    dic_flight[word_child.tag]= word_child.text #each flight
                if Flag1 == True:
                    try:
                       Flag1 = False
                       EachFlight.flight_no = dic_flight[word_child_list[0][0]]
                       EachFlight.dept_id = dic_flight[word_child_list[0][1]]
                       EachFlight.dest_id = dic_flight[word_child_list[0][2]]
                       EachFlight.flight_key = EachFlight.flight_no + '_' + EachFlight.dept_id + '_'+ EachFlight.dest_id
                       dept_time = dic_flight[word_child_list[0][3]]
                       EachFlight.dept_time = dept_time[0:10] +'T'+dept_time[-5:len(dept_time)]
                       dest_time = dic_flight[word_child_list[0][4]]
                       EachFlight.dest_time = dest_time[0:10] +'T'+dest_time[-5:len(dest_time)]
                       EachFlight.dur = get_duration(dest_time,EachFlight.dest_id, dept_time,EachFlight.dept_id)
                       EachFlight.dept_time = EachFlight.dept_time + ':00'
                       EachFlight.dest_time = EachFlight.dest_time+ ':00'
                       ticket_dur_list.append(EachFlight.dur)
                       EachFlight.airline = '南方航空公司'
                       EachFlight.plane_no =  dic_flight[word_child_list[0][5]]   # rebulid and compute flight
                    except KeyError,e:
                        print e
                    else:
                        one_flight[EachFlight.flight_key] = (EachFlight.flight_no, EachFlight.airline, EachFlight.plane_no,EachFlight.dept_id,EachFlight.dest_id,EachFlight.dept_time, EachFlight.dest_time,EachFlight.dur)
                        multi_flight.append((EachFlight.flight_key,EachFlight.flight_no, EachFlight.airline, EachFlight.plane_no,EachFlight.dept_id,EachFlight.dest_id,EachFlight.dept_time, EachFlight.dest_time,EachFlight.dur)) #list of multi flight
              elif word.tag in  word_list[1]:  
                multi_price = [] #node of price
                for word_child in word:
                  if word_child.tag in word_next_list:
                      dic_ticket = {}
                      for word_next_child in word_child:
                        if word_next_child.tag in word_child_list[1]:
                          Flag2 = True
                          dic_ticket[word_next_child.tag] = word_next_child.text
                      if Flag2 == True:
                          try:
                            Flag2 = False
                            Flight.price = string.atof(dic_ticket[word_child_list[1][0]])
                            Flight.tax = string.atof(dic_ticket[word_child_list[1][1]]) + string.atof(dic_ticket[word_child_list[1][2]]) + string.atof(dic_ticket[word_child_list[1][3]])
                            Flight.currency = dic_ticket[word_child_list[1][4]]
                            Flight.seat_type = dic_ticket[word_child_list[1][5]]
                            if Flight.seat_type == 'ECONOMY':
                              Flight.seat_type = '经济舱'
                            if Flight.seat_type =='BUSINESS':
                              Flight.seat_type = '商务舱'
                            if Flight.seat_type == 'FIRST':
                              Flight.seat_type = '头等舱'
                            if Flight.seat_type == 'PREMIUMECONOMY':
                              Flight.seat_type = '超经济舱'
                            Flight.return_rule = 'NULL'
                            Flight.stop = flight_num - 1
                            Flight.surcharge = -1
                            Flight.source = 'csair::csair'
                          except KeyError,e:
                              print e
                          else:
                              multi_price.append((Flight.price, Flight.tax, Flight.surcharge, Flight.currency,Flight.seat_type, Flight.source, Flight.return_rule, Flight.stop))
            if select_time != 0:
               if multi_flight != []:
                 new_flight_no = []
                 Flight.flight_no = '_'.join([item[1] for item in multi_flight])
                 Flight.plane_no = '_'.join([item[3] for item in multi_flight])
                 Flight.airline = '_'.join([item[2] for item in multi_flight])
                 Flight.dept_id = multi_flight[0][4]
                 Flight.dest_id = multi_flight[len(multi_flight)-1][5]
                 Flight.dept_day = dic_flightdate[word_flightdate[0]][0:4]+'-'+ dic_flightdate[word_flightdate[0]][4:6]+'-'+dic_flightdate[word_flightdate[0]][6:8]
                 Flight.dept_time = multi_flight[0][6]
                 Flight.dest_time = multi_flight[len(multi_flight)-1][7]
                 Flight.dur = get_duration(Flight.dest_time,Flight.dest_id,Flight.dept_time,Flight.dept_id)
                 for i in range(len(multi_price)):
                   multi_ticket.append((Flight.flight_no, Flight.plane_no, Flight.airline, Flight.dept_id, Flight.dest_id,\
                     Flight.dept_day, Flight.dept_time, Flight.dest_time, Flight.dur, multi_price[i][0], multi_price[i][1],\
                     multi_price[i][2], multi_price[i][3], multi_price[i][4], multi_price[i][5], multi_price[i][6], multi_price[i][7]))
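
The parser above walks the whole XML response with dom.iter() and dispatches on tag names; a toy illustration of that walk over a made-up document (the tags here are invented for the example):

from lxml import etree

sample = '<FLIGHTS><FLIGHTDATE>20140627</FLIGHTDATE></FLIGHTS>'
dom = etree.fromstring(sample)
for ele in dom.iter():
    if ele.tag == 'FLIGHTDATE':
        print ele.tag, ele.text   # -> FLIGHTDATE 20140627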
Example #48
    result['error'] = 0
    try:
        dept_city, dest_city = taskcontent.split('&')[0].strip(), taskcontent.split('&')[1].strip()
        dept_time = taskcontent.split('&')[2].strip()
        dept_time = dept_time[0:4] + '/' + dept_time[4:6] + '/' + dept_time[6:8]
    except Exception,e:
        logger.info('expediaFlight: wrong content format: ' + str(e))
        result['error'] = TASK_ERROR
        return result
    cj = cookielib.CookieJar()
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
    urllib2.install_opener(opener)
    url_res = get_json_url(dept_city, dest_city, dept_time)
    if url_res != False:
        url = 'http://www.expedia.com.hk/Flight-Search-Outbound?c=' + url_res + '&_=' + str(time.time())
        task_content_proxy = get_proxy(source='expediaFlight')
        if task_content_proxy == None:
            result['error'] = PROXY_NONE
            return result
        html_res = crawl_single_page(url, proxy = task_content_proxy)
        if html_res == '' or html_res == None:
            result['error'] = PROXY_INVALID
            return result
    else:
        result['error'] = TASK_ERROR
        return result
    try:
        json_list = json.loads(html_res)
        if json_list[key_list[0]] == None:
            result['error'] = DATA_NONE
            return result
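
The response here is JSON rather than HTML, so the guard is a key lookup instead of a length check; a small sketch of that pattern (the key name is a stand-in, the real one comes from the module's key_list):

import json

def parse_outbound(html_res, key='legs'):
    # 'legs' is a hypothetical key used only for illustration
    data = json.loads(html_res)
    return data.get(key)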
Example #49
        check_in = check_in_temp[:4] + '-' + check_in_temp[4:6] + '-' + check_in_temp[6:]
        check_out_temp = datetime.datetime(int(check_in_temp[:4]), int(check_in_temp[4:6]), int(check_in_temp[6:]))
        check_out = str(check_out_temp + datetime.timedelta(days=1))[:10]
        hotel_id_temp = hotel_id.split('_')[1]

    except Exception, e:
        logger.error('elongHotelParser: Wrong Content Format with %s'%taskcontent)
        result['error'] = TASK_ERROR
        return result
        
    if hotel_id_temp == '0':
        result['error'] = TASK_ERROR
        return result

    p = get_proxy(source='elongHotel')
    if p == None:
        result['error'] = PROXY_NONE
        return result

    post_data = get_post_data(hotel_id_temp, check_in, check_out)

    page = request_post_data(request_url,data=post_data,proxy=p)
    if page == None or page == '':
        invalid_proxy(proxy=p, source='elongHotel')
        result['error'] = PROXY_INVALID
        return result

    room_list = parseRoom(page,hotel_name,city_name_zh,check_in,check_out,hotel_id)

    if room_list != []:
Example #50
def crawl(city_url,city_id):
    global p
    source = 'daodao'
    #city_url = city_url.replace('Tourism','Restaurants')
    print city_url
    mc = MC()
    mc.set_proxy(p)
    print 'proxy:  %s' % p
    page1 = ''
    page1 = mc.req('get',city_url,html_flag=True, time_out=10)
    count = 0
    while len(page1)<1000:
        invalid_proxy(p,'Platform')
        p = get_proxy(source='Platform')
        print 'proxy: %s' % p
        mc.set_proxy(p)
        page1 = mc.req('get',city_url,html_flag=True , time_out=10)
        count += 1
        if count > 20:
            break
    source_city_id = re.compile(r'-g(\d+)').findall(city_url)[0]
    root = html.fromstring(page1)

    # total number of restaurants in the city
    rating_info = root.find_class('listing')[0].find_class('popIndexDefault')[0].xpath('text()')[0].encode('utf-8').strip().split('(')[1].replace(',','')
    nums = re.compile(r'(\d+)').findall(rating_info)
    res_total = int(nums[0])
    print "total: %s " % res_total

    # restaurant list on the first page
    items = root.find_class('listing')
    data_list = []
    for item in items:
        res_url = 'http://www.tripadvisor.cn' + item.find_class('title')[0].xpath('a/@href')[0].strip().encode('utf-8')
        res_id = re.compile(r'd(\d+)').findall(res_url)[0].encode('utf-8')
        print res_url
        data = (source,city_id,res_id,res_url)
        print data
        data_list.append(data)
    print 'insert',insert_db(data_list)


    print '------------next page------------'
    itag = '10591'  # category id for restaurants
    page = 2
    data_list = []
    for offset in range(30,res_total+1,30):
        print '-----------page %s-------' % page
        page += 1
        next_url = 'http://www.tripadvisor.cn/RestaurantSearch?Action=PAGE&geo=%s&ajax=1&itags=%s&sortOrder=popularity&o=a%s&availSearchEnabled=false' % (source_city_id,itag,offset)
        print next_url

        content2 = mc.req('get', next_url, html_flag=True)
        retry = 0
        while len(content2) < 1000:
            p = get_proxy(source='Platform')
            print 'proxy: %s' % p
            mc.set_proxy(p)  # apply the fresh proxy before retrying, as the first loop does
            content2 = mc.req('get', next_url, html_flag=True)
            retry += 1
            if retry > 20:
                break
        no_count = len(re.compile(r'(该餐馆暂无点评,来写第一条)').findall(content2))
        # if most listings are "no reviews yet for this restaurant, write the first one", stop paging
        if no_count > 29:
            break
        root2 = html.fromstring(content2)
        items = root2.find_class('listing')
        data_list2 = []
        for item in items:
            res_url = 'http://www.tripadvisor.cn' + item.find_class('title')[0].xpath('a/@href')[0].strip().encode('utf-8')
            res_id = re.compile(r'd(\d+)').findall(res_url)[0].encode('utf-8')
            print res_url
            data2 = (source,city_id,res_id,res_url)
            print data2
            data_list2.append(data2)
        print 'insert',insert_db(data_list2)
    print 'city %s ok' % city_id
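
TripAdvisor pages its listings 30 at a time, so the offsets are just multiples of 30 after the first page; a quick illustration with an assumed total:

res_total = 95  # assumed total, for the example only
for offset in range(30, res_total + 1, 30):
    print offset   # -> 30, 60, 90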
Example #51
POST_DATA_STRING = 'CONTROLGROUPAVAILABILTYSEARCHINPUTSCHEDULESELECTVIEW$AvailabilityScheduleSelectView$'


def vueling_task_parser(taskcontent):
    try:
        dept_id, dest_id, dept_date_temp = taskcontent.split('&')[0].strip(), \
                taskcontent.split('&')[1].strip(), \
                taskcontent.split('&')[2].strip()
    except Exception,e:
        logger.error('VuelingFlight: Content Error wrong content format' + str(e))
        return None
    dept_time = dept_date_temp[:4] + '-' + dept_date_temp[4:6] + '-' + dept_date_temp[6:]

    postdata = getPostData(dept_time,dept_id,dest_id)

    p = get_proxy(source='vuelingFlight')

    url = 'http://tickets.vueling.com/ScheduleSelect.aspx'
    Referer = 'http://tickets.vueling.com/ScheduleSelect.aspx'

    content = request_post_data(url,postdata,referer=Referer,proxy=p,\
            Accept="text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8")

    allinfos = []

    if content != '' and len(content) > 100:
        allinfos = vuelingparser(content)
    else:
        logger.error('VuelingFlight: Get web content failed!')

    return allinfos
Example #52
        dept_id = cities_dict[dept_id]
        dest_id = cities_dict[dest_id]
        location = dept_id +  '-' + dest_id

        origday = datetime.datetime(string.atoi(dept_date[0:4]),string.atoi(dept_date[5:7]),string.atoi(dept_date[8:]))
        urlday = (origday - datetime.datetime.today()).days
        #dept_date = orig_date
        #logger.info('contents: %s %s %s %s '%(location,flight_no,dept_date,str(urlday)))
    except Exception,e:
        logger.error(str(e))
        logger.error('Content Error: Wrong content format with %s'%content)
        return result
    
    url = URL%(location,urlday)

    p = get_proxy(source='elongFlight')

    htmlcontent = crawl_single_page(url,n=1,proxy = p)
    if htmlcontent == '':
        invalid_proxy(p)
        logger.error('Proxy Error: htmlcontent is null with proxy: %s'%p)
        return result
    
    # check whether a navigation page came back; a navigation page means the content has no flight info

    # check whether flight info was found; an empty match list means none
    temp_flight_list = flightPattern.findall(htmlcontent)
    if len(temp_flight_list) == 1:
        logger.error('Parser Error: cannot find flights with %s'%location)
        return result
Example #53
        logger.error( "pageParser Error: %s" %str(e))
    return page_num

def wego_task_parser(content):
    content = content.encode('UTF-8').strip()
    try:
        info = content.split('&')
        dept_id = info[0]
        arr_id = info[1]
        dept_date = info[2][:4] + '-' + info[2][4:6] + '-' + info[2][6:]
    except Exception,e:
        logger.error('wegoFlight Content Error: cannot extract information from %s'%content)
        return None

    # get a proxy
    p = get_proxy(type = '')
    
    # build the initial url
    url_temp = get_url(dept_id,arr_id,dept_date)
    search_id = get_search_id(url_temp,proxy = p)

    if search_id == '':
        logger.error('Search_Id Error: get Search_Id failed')
        return None

    trip_id = get_trip_id(dept_id,arr_id,dept_date)
    
    # use the initial url to fetch the result pages; page is the total number of pages
    start_url = get_start_url(search_id,trip_id)
    content_temp = crawl_single_page(start_url,proxy = p, Host="www.wego.cn", Accept="text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8")
    if content_temp == "":
Example #54
        dest_id = infos[1]  # three-letter airport code
        dept_day = infos[2]
        dept_date = dept_day[0:4] + '-' + dept_day[4:6] + '-' + dept_day[6:]
    except Exception, e:
        logger.error('lcairFlight: Wrong Content Format with %s'%content)
        result['error'] = TASK_ERROR
        return result

    if AIRPORT_CITY_DICT.has_key(dept_id) == False or AIRPORT_CITY_DICT.has_key(dest_id) == False:
        logger.warning('lcairFlight: airport not in AIRPORT_CITY_DICT')
        logger.info(dept_id)
        logger.info(dest_id)
        result['error'] = DATA_NONE
        return result

    p = get_proxy(source = 'lcairFlight')

    if p == None:
        result['error'] = PROXY_NONE
        return result
    
    postdata = getPostData(dept_id, dest_id, dept_date)

    if postdata == None:
        result['error'] = UNKNOWN_TYPE
        return result

    #referer = Referer%(AIRPORT_CITY_CN_DICT[dept_id], AIRPORT_CITY_DICT[dept_id], AIRPORT_CITY_CN_DICT[dest_id], AIRPORT_CITY_DICT[dest_id], dept_date)

    uc = UrllibCrawler(p = p)
    #uc.get(referer)
Example #55
    trip_way = 'Oneway'

    searchURL = "http://flights.ctrip.com/booking/%s-%s-day-1.html?DCity1=%s&ACity1=%s&DDate1=%s&passengerQuantity=1&SendTicketCity=undefined&PassengerType=ADU&SearchType=S&RouteIndex=1&RelDDate=&RelRDate="
    interSearchURL = "http://flights.ctrip.com/international/ShowFareFirst.aspx?flighttype=S&relddate=%s&dcity=%s&acity=%s"

    is_inter = False
    searcURL = ""
    if dept_id.lower() in CN_AIRPORTS and dest_id.lower() in CN_AIRPORTS:
        searchURL = searchURL %(dept_id, dest_id, dept_id, dest_id, dept_date)
    else:
        searchURL = interSearchURL %(dept_date, dept_id, dest_id)
        is_inter = True
    refererURL = "http://flights.ctrip.com/booking/"
    cookie = {}

    p = get_proxy()
    resp = crawl_single_page(searchURL, proxy = p, cookie = cookie)
    if resp == None or len(resp) == 0:
        invalid_proxy(p)
        return None

    # 2. parse the page
    tree = etree.HTML(resp)
    if is_inter or GetTextByXpath(tree, "//title/text()").endswith("携程国际机票"):
        # international ticket
        queryLogTransNo = tree.xpath("//input[@id='queryLogTransNo']")[0].get("value")
        # fetch the ticket price page
        resp = GetInterPricePage(queryLogTransNo, cookie, searchURL)#, use_proxy)
        return ParseInterPage(resp)

    else:  # domestic ticket
Example #56
reload(sys)
sys.setdefaultencoding('utf-8')

# -------------------------------

# table to update
BASIC_TABLE = 'qyer'
# task table for links
TASK_TABLE = 'qyer'
# whether to write to the database: True to insert, False to skip
IS_INSERT = True
# debug switch: if True, process just one URL and break
DEBUG = True
# ---------------------------------

PROXY = get_proxy(source="Platform")


def get_task():
    #sql = "select url from "+TASK_TABLE+" where map_info is null "
    #sql = "select * from "+TASK_TABLE
    sql = "select url from qyer where cateid='景点观光' and map_info is null"
    return db_add.QueryBySQL(sql)


class QyerParser():

    def __init__(self):
        pass

    def crawl(self,url):
Example #57
        to_date_temp = datetime.datetime(int(from_date_temp[:4]), int(from_date_temp[4:6]), \
                                         int(from_date_temp[6:8]))
        to_date = str(to_date_temp + datetime.timedelta(days = 1))[:10]
    except Exception,e:
        logger.info('youzhanHotel: Wrong Content Format with %s'%taskcontent)
        result['error'] = TASK_ERROR
        return result

    room = Room()

    price_url = get_price_url(hotel_id,ipathid,from_date,to_date)
    i = 0
    content_len = 0
    while i < 5 and content_len < CONTENT_LEN:
        #p = get_proxy()
        p = get_proxy(source='youzhanHotel')
        #print p
        if p == None:
            result['error'] = PROXY_NONE
            return result

        url = price_url + str(int(time.time() * 1000))
        price_page = crawl_single_page(url,proxy=p,n=1)
        content_len = len(price_page)
        i += 1

    if price_page == None or price_page == '':
        invalid_proxy(proxy=p, source='youzhanHotel')
        result['error'] = PROXY_INVALID
        return result
    #print price_page
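
The price URL above gets a millisecond timestamp appended on every attempt so each poll bypasses any cached response; the same trick in isolation:

import time

def with_cache_buster(base_url):
    # e.g. base_url + '1404806400000'; the changing suffix defeats caching
    return base_url + str(int(time.time() * 1000))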
Example #58
    result = {}
    result['para'] = None
    result['error'] = 0

    try:
        contents = content.split('&')
        dept_id = contents[0]
        dest_id = contents[1]
        dept_date = contents[2][:4] + '-' + contents[2][4:6] + '-' + contents[2][6:]
        ret_date = str(datetime.datetime.strptime(dept_date[2:], '%y-%m-%d') + datetime.timedelta(10)).split(' ')[0].strip()  # do not use this value
    except Exception,e:
        logger.error('ryanairFlight: wrong content format with %s'%content)
        result['error'] = TASK_ERROR
        return result
    
    p = get_proxy(source = 'ryanairFlight')

    if p == None:
        result['error'] = PROXY_NONE
        return result

    trip_type = 'Oneway'
    page = GetData(trip_type, dept_id, dest_id, dept_date, ret_date, proxy = p)

    if page == None:
        invalid_proxy(proxy = p, source='ryanairFlight')
        result['error'] = PROXY_INVALID
        return result

    data = ParsePage(page)
    if data == None:
Example #59
from lxml import html
import codecs
import db_add
import urllib
import json
import math
import httplib
from common.common import get_proxy, invalid_proxy
import time

reload(sys)
sys.setdefaultencoding('utf-8')

CITY_TABLE = 'tp_city'

p = get_proxy(source='Platform')
#p = ''

def insert_db(args):
    sql = 'insert ignore into tp_rest_basic_0707(source, city_id, id, res_url) values(%s,%s,%s,%s)'
    return db_add.ExecuteSQLs(sql,args)

def crawl(city_url,city_id):
    global p
    source = 'daodao'
    #city_url = city_url.replace('Tourism','Restaurants')
    print city_url
    mc = MC()
    mc.set_proxy(p)
    print 'proxy:  %s' % p
    page1 = ''
Example #60
        dest_id = infos[1]  # three-letter airport code
        dept_day = infos[2]
        return_day = infos[3]
        dept_date = dept_day[0:4] + '-' + dept_day[4:6] + '-' + dept_day[6:]
        return_date = return_day[0:4] + '-' + return_day[4:6] + '-' + return_day[6:]

    except Exception,e:
        logger.info('lcairRoundFlight: Wrong Content Format with %s'%content)
        return result

    if AIRPORT_CITY_DICT.has_key(dept_id) == False or AIRPORT_CITY_DICT.has_key(dest_id) == False:
        logger.warning('lcairRoundFlight: airport not in AIRPORT_CITY_DICT')
        result['error'] = DATA_NONE
        return result
    
    p = get_proxy(source = 'lcairRoundFlight')
    if p == None:
        result['error'] = PROXY_NONE
        return result

    postdata = getPostData(dept_id, dest_id, dept_date, return_date)

    if postdata == None:
        result['error'] = UNKNOWN_TYPE
        return result
    
    uc = UrllibCrawler(p = p)
    html = uc.post(SearchURL, postdata, html_flag = True)

    #print html