Example #1
def smartfares_task_parser(taskcontent):
    result = {}
    flights = {}
    tickets = []
    result['para'] = {'flight': flights, 'ticket': tickets}
    result['error'] = 0

    taskcontent = taskcontent.encode('utf-8')
    try:
        dept_id, dest_id, dept_day, dest_day = \
            taskcontent.strip().split('&')[:4]
    except Exception:
        logger.error('smartfaresFlight::Wrong Content Format with %s' %
                     taskcontent)
        result['error'] = TASK_ERROR
        return result

    p = get_proxy(source='smartfaresFlight')
    #p= None
    if p is None:
        result['error'] = PROXY_NONE
        return result

    cj = cookielib.CookieJar()
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
    urllib2.install_opener(opener)

    try:
        search_url = get_search_url(dept_day, dest_day, dept_id, dest_id)
        content = crawl_single_page(search_url, proxy=p, referer=HOST)
        search_id = get_search_id(content)
        if not search_id:
            logger.error('smartfares::Parse search id failed')
            result['error'] = PROXY_INVALID
            return result
    except Exception:
        logger.error('smartfares::Parse search id failed')
        result['error'] = PROXY_INVALID
        return result

    url_real = URL % search_id
    i = 0
    content_len = 0
    while i < 3 and content_len < CONTENT_LEN:
        content_real = crawl_single_page(url=url_real,
                                         proxy=p,
                                         referer=search_url)
        content_len = len(content_real)
        i += 1

    if len(content_real) > 100:
        parser_result = parsePage(content_real)
        tickets = parser_result['ticket']
        flights = parser_result['flight']
        result['para'] = {'flight': flights, 'ticket': tickets}
        return result
    else:
        result['error'] = DATA_NONE
        return result
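
Nearly every parser in this listing repeats the same retry idiom: call crawl_single_page up to three times until the body is long enough to be worth parsing. A minimal standalone sketch of that pattern (fetch stands in for crawl_single_page, whose real signature the examples only hint at):

def crawl_with_retry(fetch, url, min_len, attempts=3, **kwargs):
    # Retry until the response is long enough to be worth parsing.
    content = ''
    for _ in range(attempts):
        content = fetch(url, **kwargs) or ''
        if len(content) >= min_len:
            break
    return content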
Example #2
def get_url(dept_city, dest_city, dept_date, dept_id, dest_id, proxy=None):
    parser_url = ''
    url_temp = 'http://www.ly.com/iflight/flightinterajax.aspx?action=SEARCHURL&airplaneInternatType=1&iOrgPort=' + \
               dept_city + '&iArvPort=' + dest_city + '&idtGoDate=' + dept_date + \
               '&idtBackDate=时间/日期&sel_inCabinType=Y&sel_inPassengersType=1&sel_inAdult=1&sel_inChild=0' + \
               '&iOrgPortMult=城市名&iArvPortMult=城市名&idtGoDateMult=时间/日期&iOrgPortMult=城市名&iArvPortMult=城市名' + \
               '&idtGoDateMult=时间/日期&iOrgPortMult=城市名&iArvPortMult=城市名&idtGoDateMult=时间/日期&iOrgPortMult=城市名' + \
               '&iArvPortMult=城市名&idtGoDateMult=时间/日期&iOrgPortMult=城市名&iArvPortMult=城市名&idtGoDateMult=时间/日期&iOrgPortMult=城市名&iArvPortMult=城市名&idtGoDateMult=时间/日期&callback=tc10805565235'

    page1 = crawl_single_page(url_temp,
                              proxy=proxy,
                              referer='http://www.ly.com')
    print page1

    try:
        num01 = page1.find('(')
        num02 = page1.rfind(')')
        json_content_temp = page1[num01 + 1:num02]
        json_temp1 = json.loads(json_content_temp)
        if json_temp1['state'] == 100:
            url_temp1 = json_temp1['href']
        else:
            return parser_url
    except Exception:
        if page1.find('a5') != -1:
            parser_url = 'proxy_forbidden'
        return parser_url
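
The find('(')/rfind(')') slicing above strips a JSONP wrapper (requested via the callback=tc10805565235 query parameter). The same unwrap as a standalone helper (a sketch, not part of the original code):

import json

def unwrap_jsonp(text):
    # 'tc10805565235({"state": 100, ...})' -> {'state': 100, ...}
    start = text.find('(')
    end = text.rfind(')')
    if start == -1 or end <= start:
        raise ValueError('not a JSONP payload')
    return json.loads(text[start + 1:end])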
Example #3
def GetInterPricePage(queryLogTransNo, cookie, referer,
                      proxy):  #use_proxy = True):
    priceURL = "http://flights.ctrip.com/international/GetSubstepSearchResults.aspx?IsJSON=T&queryLogTransNo=%s&QueryType=1&cityPairAirline=first&withDirectAirline=T&RdNo=2103213618&ind=347,359,356,370" % queryLogTransNo

    #p = proxy
    p = None

    #if use_proxy:
    # If the fetch fails, switch to another proxy IP and retry
    #for i in range(3):
    #p = get_proxy()
    resp = crawl_single_page(priceURL,
                             n=1,
                             referer=referer,
                             proxy=p,
                             cookie=cookie)
    if not resp:
        #invalid_proxy(p)
        pass
    else:
        return resp
        #logger.info('get price page successful')
    #else:
    #resp = crawl_single_page(searchURL, cookie = cookie)

    return
Example #4
def elong_task_parser(content):

    contents = content.split('&')
    if len(contents) != 2:
        logger.error('elongFlight: wrong content format with %s' % content)
        return None
    location, origdate = contents[0].strip(), contents[1].strip()

    origday = datetime.datetime(int(origdate[0:4]),
                                int(origdate[4:6]),
                                int(origdate[6:]))
    urlday = (origday - datetime.datetime.today()).days
    dept_date = str(origday).split(' ')[0].strip()

    url = URL % (location, urlday)

    p = get_proxy()

    htmlcontent = crawl_single_page(url, proxy=p)
    if htmlcontent == '':
        invalid_proxy(p)
        logger.error(
            'elongFlight: Proxy Error: htmlcontent is null with proxy: %s' % p)
        return []

    # Check whether a navigation page came back; a navigation page means the content has no flight info

    # Check whether flight info was found; if not, return []
    temp_flight_list = flightPattern.findall(htmlcontent)
    if len(temp_flight_list) == 1:
        logger.error('elongFlight: Parser Error: cannot find flights with %s' %
                     location)
        return []

    flights = []

    flight_list = temp_flight_list[:-1]

    typ = 0
    for item in flight_list:
        typ = len(typePattern.findall(item))
        if typ == 0:
            pass
        elif typ != 1:
            transfer_info = transferFlight_parser(item, dept_date,
                                                  airports_dict)
            if transfer_info != []:
                flights.append(transfer_info)
        else:
            direct_info = directFlight_parser(item, dept_date, airports_dict)
            if direct_info != []:
                flights.append(direct_info)

    flights = list(set(flights))
    #logger.info('Find %d airlines with %s'%(len(flights),location))

    return flights
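
The list(set(...)) round-trip above deduplicates the flight tuples but scrambles their order; when order matters, an order-preserving variant looks like this (a sketch, not from the source):

def dedup_keep_order(items):
    # Keep the first occurrence of each item in its original position.
    seen = set()
    out = []
    for item in items:
        if item not in seen:
            seen.add(item)
            out.append(item)
    return out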
Example #5
def getPage(url, proxy=None):

    for i in range(2):
        page = crawl_single_page(url, proxy=proxy, n=1)

        # crawl_single_page may return None; guard before len()
        if page and len(page) > 100:
            return page

    return None
Example #6
def get_proxy():
    i = 0
    proxy_len = 0
    while i < 3 and proxy_len < 5:
        # normalize a None response to '' so len() below is safe
        proxy = crawl_single_page(proxy_url) or ''
        proxy_len = len(proxy)
        i += 1
    if not proxy:
        return None
    return proxy
Example #7
def get_json_url(dept_city=None, dest_city=None, dept_time=None):
    html_url = 'http://www.expedia.com.hk/Flights-Search?trip=oneway&leg1=from:' + dept_city + ',to:' + dest_city + ',departure:' + dept_time + 'TANYT&passengers=children:0,adults:1,seniors:0,infantinlap:Y&options=cabinclass:coach&mode=search&'
    html_res = crawl_single_page(html_url)
    regex = re.compile(r'<div id="originalContinuationId">(.*?)</div>',
                       re.M | re.S | re.I)
    match_id = re.search(regex, html_res)
    if match_id:
        return match_id.group(1).strip()
    else:
        logger.info('could not find the originalContinuationId in the json data')
        return False
Example #8
def easyjet_task_parser(taskcontent):
    result = {}
    flights = {}
    tickets = []
    result['para'] = {'flight': flights, 'ticket': tickets}
    result['error'] = 0
    try:
        dept_id, dest_id, dept_day_temp = \
                taskcontent.strip().split('&')[:3]
    except Exception:
        logger.error('easyjet::Wrong Content Format with %s' % taskcontent)
        result['error'] = TASK_ERROR
        return result

    search_url = get_search_url(dept_id, dest_id, dept_day_temp)

    p = get_proxy(source='easyjet')

    time_zone_A = airport[dept_id]
    time_zone_B = airport[dest_id]
    #print p
    #print search_url
    if p is None:
        result['error'] = PROXY_NONE
        return result

    cj = cookielib.CookieJar()
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
    urllib2.install_opener(opener)

    i = 0
    content_len = 0
    while i < 3 and content_len < CONTENT_LEN:
        content = crawl_single_page(url=search_url, proxy=p, n=1, referer=HOST)
        content_len = len(content)
        i += 1

    if not content or len(content) < CONTENT_LEN:
        result['error'] = PROXY_INVALID
        return result

    para = parsePage(content, p, time_zone_A, time_zone_B)

    if para == {'flight': {}, 'ticket': []}:
        result['error'] = DATA_NONE
        return result
    else:
        flights = para['flight']
        tickets = para['ticket']
        result['para'] = {'ticket': tickets, 'flight': flights}
        return result
Example #9
def GetInterPricePage(queryLogTransNo, cookie, referer, use_proxy = True):
    priceURL = "http://flights.ctrip.com/international/GetSubstepSearchResults.aspx?IsJSON=T&queryLogTransNo=%s&QueryType=1&cityPairAirline=first&withDirectAirline=T&RdNo=2103213618&ind=347,359,356,370" % queryLogTransNo
    
    #if use_proxy:
    # If the fetch fails, switch to another proxy IP and retry (retry count is currently 0)
    for i in range(1):
        p = get_proxy()
        resp = crawl_single_page(priceURL, referer=referer, proxy=p, cookie=cookie)
        if not resp:
            invalid_proxy(p)
        else:
            return resp
    #else:
        #resp = crawl_single_page(searchURL, cookie = cookie)

    return 
Example #10
def get_url(dept_city, dest_city, dept_date, dept_id, dest_id, proxy=None):
    parser_url = ''
    url_temp = 'http://www.ly.com/iflight/flightinterajax.aspx?action=SEARCHURL&airplaneInternatType=1&iOrgPort=' + dept_city + '&iArvPort=' + dest_city + '&idtGoDate=' + dept_date + '&idtBackDate=时间/日期&sel_inCabinType=Y&sel_inPassengersType=1&sel_inAdult=1&sel_inChild=0&iOrgPortMult=城市名&iArvPortMult=城市名&idtGoDateMult=时间/日期&iOrgPortMult=城市名&iArvPortMult=城市名&idtGoDateMult=时间/日期&iOrgPortMult=城市名&iArvPortMult=城市名&idtGoDateMult=时间/日期&iOrgPortMult=城市名&iArvPortMult=城市名&idtGoDateMult=时间/日期&iOrgPortMult=城市名&iArvPortMult=城市名&idtGoDateMult=时间/日期&iOrgPortMult=城市名&iArvPortMult=城市名&idtGoDateMult=时间/日期&callback=tc10805565235'

    page1 = crawl_single_page(url_temp, proxy=proxy, n=1,  Accept='text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8')
    
    try:
        num01 = page1.find('(')
        num02 = page1.rfind(')')
        json_content_temp = page1[num01+1:num02]
        json_temp1 = json.loads(json_content_temp)
        if json_temp1['state'] == 100:
            url_temp1 = json_temp1['href']
        else:
            return parser_url
    except Exception:
        logger.error('Can not get url temp 1!')
        return parser_url
Example #11
def get_url(dept_city, dest_city, dept_date, dept_id, dest_id, proxy):
    parser_url = ''
    url_temp = 'http://www.ly.com/iflight/flightinterajax.aspx?action=SEARCHURL&airplaneInternatType=1&iOrgPort=' + dept_city + '&iArvPort=' + dest_city + '&idtGoDate=' + dept_date + '&idtBackDate=时间/日期&sel_inCabinType=Y&sel_inPassengersType=1&sel_inAdult=1&sel_inChild=0&iOrgPortMult=城市名&iArvPortMult=城市名&idtGoDateMult=时间/日期&iOrgPortMult=城市名&iArvPortMult=城市名&idtGoDateMult=时间/日期&iOrgPortMult=城市名&iArvPortMult=城市名&idtGoDateMult=时间/日期&iOrgPortMult=城市名&iArvPortMult=城市名&idtGoDateMult=时间/日期&iOrgPortMult=城市名&iArvPortMult=城市名&idtGoDateMult=时间/日期&iOrgPortMult=城市名&iArvPortMult=城市名&idtGoDateMult=时间/日期&callback=tc10805565235'

    page1 = crawl_single_page(url_temp, proxy=proxy)#, n=1,  Accept='text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8')
    
    try:
        num01 = page1.find('(')
        num02 = page1.rfind(')')
        json_content_temp = page1[num01+1:num02]
        json_temp1 = json.loads(json_content_temp)
        if json_temp1['state'] == 100:
            url_temp1 = json_temp1['href']
        else:
            return parser_url
    except Exception:
        #logger.error('Can not get url temp 1!')
        return parser_url
Example #12
def csair_task_parser(taskcontent):
    result = {}
    multi_ticket = []
    one_flight = {}
    result['para'] = {'flight': one_flight, 'ticket': multi_ticket}
    result['error'] = 0
    try:
        param_list = taskcontent.strip().split('&')
        url = 'http://b2c.csair.com/B2C40/detail-' + param_list[0] + param_list[1] + \
              '-' + param_list[2] + '-1-0-0-0-1-0-1-0-1-0.g2c'
    except Exception:
        logger.info('url param is not valid')
        result['error'] = TASK_ERROR
        return result
    # Initialize all params
    dic_flightdate = {}
    multi_price = []
    select_time = 0
    Flag1 = False
    Flag2 = False
    page_flag = False
    cj = cookielib.CookieJar()
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
    urllib2.install_opener(opener)
    task_content_proxy = get_proxy(source='csairFlight')
    if task_content_proxy is None:
        result['error'] = PROXY_NONE
        return result
    html = crawl_single_page(url, proxy=task_content_proxy)
    if not html:
        result['error'] = PROXY_INVALID
        return result
    pattern = re.compile(r'\s*<FLIGHTS>\s*')
    match = pattern.search(html)
    if match and len(html) > CONTENT_LEN:
        dom = etree.fromstring(html)
        etree.tostring(dom)
        for ele in dom.iter():
            if ele.tag is not None:
                if ele.tag in word_flightdate:
                    #print ele.tag, ele.text
                    dic_flightdate[ele.tag] = ele.text
                elif ele.tag in word_parent_list:
                    page_flag = True  # DateFlight node
                    multi_flight = []
                    Flight = nanhang_flight()
                    select_time += 1
                    flight_num = 0
                    ticket_dur_list = []
                    for word in ele:
                        if word.tag in word_list[0]:
                            flight_num += 1
                            dic_flight = {}
                            EachFlight = nanhang_eachflight()
                            for word_child in word:
                                if word_child.tag in word_child_list[0]:
                                    Flag1 = True
                                    dic_flight[
                                        word_child.
                                        tag] = word_child.text  #each flight
                            if Flag1:
                                try:
                                    Flag1 = False
                                    EachFlight.flight_no = dic_flight[
                                        word_child_list[0][0]]
                                    EachFlight.dept_id = dic_flight[
                                        word_child_list[0][1]]
                                    EachFlight.dest_id = dic_flight[
                                        word_child_list[0][2]]
                                    EachFlight.flight_key = EachFlight.flight_no + '_' + EachFlight.dept_id + '_' + EachFlight.dest_id
                                    dept_time = dic_flight[word_child_list[0]
                                                           [3]]
                                    EachFlight.dept_time = dept_time[
                                        0:10] + 'T' + dept_time[
                                            -5:len(dept_time)]
                                    dest_time = dic_flight[word_child_list[0]
                                                           [4]]
                                    EachFlight.dest_time = dest_time[
                                        0:10] + 'T' + dest_time[
                                            -5:len(dest_time)]
                                    EachFlight.dur = get_duration(
                                        dest_time, EachFlight.dest_id,
                                        dept_time, EachFlight.dept_id)
                                    EachFlight.dept_time = EachFlight.dept_time + ':00'
                                    EachFlight.dest_time = EachFlight.dest_time + ':00'
                                    ticket_dur_list.append(EachFlight.dur)
                                    EachFlight.airline = '南方航空公司'
                                    EachFlight.plane_no = dic_flight[
                                        word_child_list[0]
                                        [5]]  # rebuild and compute flight
                                except KeyError as e:
                                    print e
                                else:
                                    one_flight[EachFlight.flight_key] = (
                                        EachFlight.flight_no,
                                        EachFlight.airline,
                                        EachFlight.plane_no,
                                        EachFlight.dept_id, EachFlight.dest_id,
                                        EachFlight.dept_time,
                                        EachFlight.dest_time, EachFlight.dur)
                                    multi_flight.append((
                                        EachFlight.flight_key,
                                        EachFlight.flight_no,
                                        EachFlight.airline,
                                        EachFlight.plane_no,
                                        EachFlight.dept_id, EachFlight.dest_id,
                                        EachFlight.dept_time,
                                        EachFlight.dest_time,
                                        EachFlight.dur))  #list of multi flight
                        elif word.tag in word_list[1]:
                            multi_price = []  #node of price
                            for word_child in word:
                                if word_child.tag in word_next_list:
                                    dic_ticket = {}
                                    for word_next_child in word_child:
                                        if word_next_child.tag in word_child_list[
                                                1]:
                                            Flag2 = True
                                            dic_ticket[
                                                word_next_child.
                                                tag] = word_next_child.text
                                    if Flag2:
                                        try:
                                            Flag2 = False
                                            price_fields = word_child_list[1]
                                            Flight.price = float(
                                                dic_ticket[price_fields[0]])
                                            Flight.tax = float(
                                                dic_ticket[price_fields[1]]) + float(
                                                dic_ticket[price_fields[2]]) + float(
                                                dic_ticket[price_fields[3]])
                                            Flight.currency = dic_ticket[
                                                price_fields[4]]
                                            Flight.seat_type = dic_ticket[
                                                price_fields[5]]
                                            if Flight.seat_type == 'ECONOMY':
                                                Flight.seat_type = '经济舱'
                                            if Flight.seat_type == 'BUSINESS':
                                                Flight.seat_type = '商务舱'
                                            if Flight.seat_type == 'FIRST':
                                                Flight.seat_type = '头等舱'
                                            if Flight.seat_type == 'PREMIUMECONOMY':
                                                Flight.seat_type = '超经济舱'
                                            Flight.return_rule = 'NULL'
                                            Flight.stop = flight_num - 1
                                            Flight.surcharge = -1
                                            Flight.source = 'csair::csair'
                                        except KeyError as e:
                                            print e
                                        else:
                                            multi_price.append(
                                                (Flight.price, Flight.tax,
                                                 Flight.surcharge,
                                                 Flight.currency,
                                                 Flight.seat_type,
                                                 Flight.source,
                                                 Flight.return_rule,
                                                 Flight.stop))
                    if select_time != 0:
                        if multi_flight != []:
                            Flight.flight_no = '_'.join(
                                [item[1] for item in multi_flight])
                            Flight.plane_no = '_'.join(
                                [item[3] for item in multi_flight])
                            Flight.airline = '_'.join(
                                [item[2] for item in multi_flight])
                            Flight.dept_id = multi_flight[0][4]
                            Flight.dest_id = multi_flight[len(multi_flight) -
                                                          1][5]
                            flight_date = dic_flightdate[word_flightdate[0]]
                            Flight.dept_day = flight_date[0:4] + '-' + \
                                flight_date[4:6] + '-' + flight_date[6:8]
                            Flight.dept_time = multi_flight[0][6]
                            Flight.dest_time = multi_flight[len(multi_flight) -
                                                            1][7]
                            Flight.dur = get_duration(Flight.dest_time,
                                                      Flight.dest_id,
                                                      Flight.dept_time,
                                                      Flight.dept_id)
                            for i in range(len(multi_price)):
                                multi_ticket.append((Flight.flight_no,Flight.plane_no,Flight.airline,Flight.dept_id,Flight.dest_id,\
                                  Flight.dept_day,Flight.dept_time, Flight.dest_time,Flight.dur, multi_price[i][0], multi_price[i][1],\
                                  multi_price[i][2], multi_price[i][3],multi_price[i][4],multi_price[i][5],multi_price[i][6], multi_price[i][7]))
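
The csair parser above is one long walk over dom.iter() with flags and nested loops; its core traversal pattern, reduced to a standalone sketch (the tag set is supplied by the caller; word_list and the other lookup tables are not reproduced here):

from lxml import etree

def iter_tagged(xml_text, wanted_tags):
    # Walk every element and yield a {tag: text} dict for each element
    # whose tag is in wanted_tags.
    dom = etree.fromstring(xml_text)
    for ele in dom.iter():
        if ele.tag in wanted_tags:
            yield dict((child.tag, child.text) for child in ele)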
Example #13
    except Exception as e:
        logger.error('Parse taskcontent failed with ' + str(e))
        return []

    check_in = check_in_temp[:4] + '-' + check_in_temp[4:6] + '-' + check_in_temp[6:]
    check_out_temp = datetime.datetime(int(check_in_temp[:4]), int(check_in_temp[4:6]), \
            int(check_in_temp[6:]))
    check_out = str(check_out_temp + datetime.timedelta(days=1))[:10]

    hotel_url = get_hotel_url(url_hotel_name,check_in,check_out)
    print hotel_url
    p = get_proxy()
    i = 0
    content_len = 0
    while i < 3 and content_len < CONTENT_LEN:
        content = crawl_single_page(hotel_url, p)
        content_len = len(content)
        if content_len > CONTENT_LEN:
            break
        i += 1
    print 'Content len :' + str(content_len)
    room_info = parseRoom(content, check_in, check_out, hotel_id)

    return room_info


def parseRoom(content, check_in, check_out, hotel_id):
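
The check-out date above is built by slicing str(datetime + timedelta); datetime.date.isoformat() yields the same 'YYYY-MM-DD' string without the slice (a sketch assuming the '20140505' input format used here):

import datetime

def next_day(yyyymmdd):
    # '20140505' -> '2014-05-06': parse, add one day, format as an ISO date.
    day = datetime.datetime.strptime(yyyymmdd, '%Y%m%d').date()
    return (day + datetime.timedelta(days=1)).isoformat()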
Example #14
        return result
   
    p = get_proxy(source='biyiHotel')
    print p
    if p is None:
        result['error'] = PROXY_NONE
        return result

    first_url = 'http://www.biyi.cn/'
    url = get_url(hotel_name, city_name_en, check_in_day, check_out_day)
    
    cj = cookielib.CookieJar()
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
    urllib2.install_opener(opener)

    resp = crawl_single_page(url=first_url, proxy=p, Accept=accept, referer=first_url, n=1)
    #for x in cj:
    #    print x
    #page = with_cookie_crawler(first_url=first_url, second_url=url, proxy=p, min_page_len = 3000)
    #cj = cookielib.CookieJar()
    #opener2 = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
    urllib2.install_opener(opener)
    resp2 = crawl_single_page(url=url, proxy=p, Accept=accept, referer=first_url, n=1)

    for y in cj:
        print y
    #print cj
    #cj2.update(cj)
    print '----------------'
    i = 0
    content_len = 0
Example #15
def youzhan_task_parser(taskcontent):
    all_info = []
    room_list = []
    taskcontent = taskcontent.encode('utf-8').strip()
    parts = taskcontent.split('&')
    hotel_id = parts[0]
    ipathid = parts[1]
    star = parts[2]
    city = parts[3]
    country = parts[4]
    #room_type = parts[3]
    from_date_temp = parts[5]
    from_date = from_date_temp[:4] + '-' + from_date_temp[4:6] + '-' \
                + from_date_temp[6:]
    to_date_temp = datetime.datetime(int(from_date_temp[:4]), int(from_date_temp[4:6]), \
                                     int(from_date_temp[6:]))
    to_date = str(to_date_temp + datetime.timedelta(days=1))[:10]

    # get a proxy

    p = get_proxy()

    #if p == "":
    #logger.error("get proxy failed")
    #return None

    hotel = Hotel()
    room = Room()

    rating_url = get_rating_url(hotel_id)
    rating_page = crawl_single_page(rating_url, proxy=p)

    grade_str = grade_parser(rating_page)

    if grade_str != '':
        hotel.grade = grade_str[:-1]
    else:
        pass
        #logger.error('Error: No grade_str found!')

    map_url = get_map_url(hotel_id)
    map_page = crawl_single_page(map_url, proxy=p)
    #print map_page
    map_info_list = staticmap_parser(map_page)
    if map_info_list != []:
        hotel.hotel_name = map_info_list[1]
        if is_alphabet(hotel.hotel_name.decode('utf-8')):
            hotel.hotel_name_en = hotel.hotel_name
        else:
            hotel.hotel_name_en = 'NULL'
        hotel.map_info = map_info_list[0]
    else:
        logger.error(
            'youzhanHotel: Map info do not have hotel name and map_info')
        return []

    info_url = get_info_url(hotel_id, from_date, to_date)
    info_page = crawl_single_page(info_url, proxy=p)
    if info_page == '':
        #invalid_proxy(p)
        return []
    info_list = info_parser(info_page)

    if info_list != []:
        hotel.country = country
        hotel.city = city
        hotel.address = info_list[1]
        hotel_desc_temp = info_list[3].replace('&lt;br/&gt;',
                                               '').replace('&#039;', '')
        if hotel_desc_temp != '':
            hotel.description = hotel_desc_temp
        else:
            hotel.description = 'NULL'
        hotel.service = info_list[4]

        if '停车场' in hotel.service:
            hotel.has_parking = 'Yes'
        if '无线网络' in hotel.service or 'wifi' in hotel.service:
            hotel.has_wifi = 'Yes'
    else:
        return []

    hotel.source = 'youzhan'
    hotel.source_id = hotel_id
    hotel.star = star

    price_url = get_price_url(hotel_id, ipathid, from_date, to_date)
    price_page = crawl_single_page(price_url, proxy=p)
    price_list = price_parser(price_page, hotel_id)
    #print '********'
    #print price_list
    if price_list != []:
        for each_room in price_list:
            if len(each_room) > 3:
                room.city = city
                room.occupancy = 2
                room.hotel_name = hotel.hotel_name
                #print '******'
                #print each_room
                room.room_desc = each_room[3]
                room.real_source = each_room[2]

                num = each_room[3].find('-')
                if num > 0:
                    if len(each_room[3][:num]) < 20:
                        room.room_type = each_room[3][:num]
                    else:
                        room.room_type = 'NULL'
                else:
                    if len(each_room[3]) < 20:
                        room.room_type = each_room[3]
                    else:
                        room.room_type = 'NULL'

                if each_room[0] != u'nbsp;':
                    room.price = each_room[0]
                room.has_breakfast = each_room[1]

                if '免费WiFi' in room.room_desc:
                    hotel.is_wifi_free = 'Yes'

                if '免费取消' in room.room_desc:
                    hotel.is_cancel_free = 'Yes'

                room.currency = 'CNY'
                room.source = 'youzhan'
                room.source_hotelid = hotel_id
                room.check_in = from_date
                room.check_out = to_date

                room_tuple = (room.hotel_name,room.city,room.source,room.source_hotelid,\
                    room.source_roomid,room.real_source,room.room_type,room.occupancy,\
                    room.bed_type,room.size,room.floor,room.check_in,room.check_out,room.price,\
                    room.tax,room.currency,room.is_extrabed,room.is_extrabed_free,room.has_breakfast,\
                    room.is_breakfast_free,room.is_cancel_free,room.room_desc)
                room_list.append(room_tuple)

    hotel_tuple = (hotel.hotel_name, hotel.hotel_name_en,hotel.source,hotel.source_id,hotel.brand_name,\
        hotel.map_info,hotel.address,hotel.city,hotel.country,hotel.postal_code, \
        hotel.star,hotel.grade,hotel.has_wifi,hotel.is_wifi_free,hotel.has_parking,\
        hotel.is_parking_free,hotel.service,hotel.img_items,hotel.description)
    hotel_list = []
    hotel_list.append(hotel_tuple)
    all_info.append(hotel_list)
    all_info.append(room_list)

    return all_info
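
The 22-element room tuple above is easy to mis-order when a field changes; collections.namedtuple keeps the same flat shape while naming each slot (a sketch with an abbreviated, illustrative field list):

import collections

# Field list abbreviated for illustration; the tuple above carries 22 fields.
RoomRecord = collections.namedtuple(
    'RoomRecord', 'hotel_name city source source_hotelid price currency')

record = RoomRecord(hotel_name='example', city='Shanghai', source='youzhan',
                    source_hotelid='12345', price='99', currency='CNY')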
Example #16
    #p = crawl_single_page('http://114.215.168.168:8086/proxy')
    p = get_proxy(source='jijitongFlight')
    #print p
    #p = None
    
    if p is None:
        result['error'] = PROXY_NONE
        return result

    first_url = FIRST_URL % (dept_city_en, dest_city_en, dept_day_temp, dest_day_temp)
    cj = cookielib.CookieJar()
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
    urllib2.install_opener(opener)

    resp = crawl_single_page(first_url, proxy=p, Accept=accept, referer=HOST, n=1)
    #resp = ''
    #print '________________'
    if resp.find('404错误') < 0:
        url = get_url(dept_city_zh,dest_city_zh,dept_day,dest_day)

        i = 0
        content_len = 0
        while i < 3 and content_len < CONTENT_LEN:
            page = crawl_single_page(url, proxy=p, referer=first_url, n=1)
            content_len = len(page)
            i += 1


        if page and len(page) > CONTENT_LEN:
            post_data = get_post_data(page, dept_day, dest_day)
Example #17
    search_id = get_search_id(url_temp, proxy=p)

    if search_id == '':
        logger.error('Search_Id Error: get Search_Id failed')
        invalid_proxy(proxy=p, source='wegoFlight')
        result['error'] = PROXY_FORBIDDEN
        return result

    trip_id = get_trip_id(dept_id, arr_id, dept_date)

    # Use the initial url to fetch the result pages; page_num is the total page count
    start_url = get_start_url(search_id, trip_id)
    content_temp = crawl_single_page(
        start_url,
        proxy=p,
        Host="www.wego.cn",
        Accept=
        "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"
    )
    if content_temp == "":
        logger.error('Proxy Error: htmlcontent is null with proxy: %s' % p)
        # report the proxy as invalid
        invalid_proxy(proxy=p, source='wegoFlight')
        result['error'] = PROXY_INVALID
        return result

    page_num = pageParser(content_temp)
    page_num_get = 0

    if page_num == 0:
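
The fragment breaks off right after computing page_num; the step the fuller variant (Example #22) announces next, 'build the urls to crawl', is a simple format loop. A sketch of that step (PAGE_URL is an assumed template, not the site's real one):

PAGE_URL = 'http://www.wego.cn/flights/results?search_id=%s&page=%d'  # assumed template

def build_page_urls(search_id, page_num):
    # One URL per result page, pages numbered from 1.
    return [PAGE_URL % (search_id, i) for i in range(1, page_num + 1)]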
Example #18
    #p = crawl_single_page('http://114.215.168.168:8086/proxy')
    #p = None

    if p is None:
        result['error'] = PROXY_NONE
        return result

    url = get_url(dept_city, dest_city, dept_day, dept_id, dest_id, p)

    if url == 'proxy_forbidden':
        invalid_proxy(proxy=p, source='tongchengFlight')
        result['error'] = PROXY_FORBIDDEN
        return result

    if url != '':
        page = crawl_single_page(url, proxy=p)
        print page
    else:
        logger.error('tongchengFlight: Get url failed!')
        invalid_proxy(proxy=p, source='tongchengFlight')
        result['error'] = PROXY_INVALID
        return result

    if page != '' and len(page) > CONTENT_LEN:
        flights = ParsePage(page)
    else:
        logger.error('tongchengFlight: Crawl page failed!')
        invalid_proxy(proxy=p, source='tongchengFlight')
        result['error'] = PROXY_INVALID
        return result
Example #19
    except Exception:
        logger.error('jijitongFlight: Wrong Content Format with %s' % taskcontent)
        result['error'] = TASK_ERROR
        return result
    
    p = get_proxy(source='jijitongFlight')
    if p is None:
        result['error'] = PROXY_NONE
        return result

    first_url = FIRST_URL % (dept_city_en, dest_city_en, dept_day_temp)
    cj = cookielib.CookieJar()
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
    urllib2.install_opener(opener)

    resp = crawl_single_page(first_url, proxy=p, \
         Accept='text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', n=1)

    if resp.find('404错误') < 0:
        url = get_url(dept_city_zh, dest_city_zh, dept_day)
        page = crawl_single_page(url, proxy=p, referer=first_url)

        if page != '' and len(page) > 300:
            post_data = get_post_data(page, dept_day)
            price_url = PRICE_URL % str(time.time() * 1000)

            price_page = request_post_data(price_url, data=post_data, referer=first_url, \
                                           n=1, proxy=p)
            price_dict = parsePrice(price_page)
            #print price_dict
            time.sleep(5)
            flights = parse_page(page, price_dict)
Example #20
        location = dept_id +  '-' + dest_id

        origday = datetime.datetime(int(dept_date[0:4]), int(dept_date[5:7]), int(dept_date[8:]))
        urlday = (origday - datetime.datetime.today()).days
        #dept_date = orig_date
        #logger.info('contents: %s %s %s %s '%(location,flight_no,dept_date,str(urlday)))
    except Exception as e:
        logger.error(str(e))
        logger.error('Content Error: Wrong content format with %s' % content)
        return result
    
    url = URL % (location, urlday)

    p = get_proxy(source='elongFlight')

    htmlcontent = crawl_single_page(url, n=1, proxy=p)
    if htmlcontent == '':
        invalid_proxy(p)
        logger.error('Proxy Error: htmlcontent is null with proxy: %s' % p)
        return result
    
    # Check whether a navigation page came back; a navigation page means the content has no flight info

    # Check whether flight info was found; if not, return []
    temp_flight_list = flightPattern.findall(htmlcontent)
    if len(temp_flight_list) == 1:
        logger.error('Parser Error: cannot find flights with %s' % location)
        return result

    flight_list = temp_flight_list[:-1]
    
Example #21
    searchURL = "http://flights.ctrip.com/booking/%s-%s-day-1.html?DCity1=%s&ACity1=%s&DDate1=%s&passengerQuantity=1&SendTicketCity=undefined&PassengerType=ADU&SearchType=S&RouteIndex=1&RelDDate=&RelRDate="
    interSearchURL = "http://flights.ctrip.com/international/ShowFareFirst.aspx?flighttype=S&relddate=%s&dcity=%s&acity=%s"

    is_inter = False
    searcURL = ""
    if dept_id.lower() in CN_AIRPORTS and dest_id.lower() in CN_AIRPORTS:
        searchURL = searchURL % (dept_id, dest_id, dept_id, dest_id, dept_date)
    else:
        searchURL = interSearchURL % (dept_date, dept_id, dest_id)
        is_inter = True
    refererURL = "http://flights.ctrip.com/booking/"
    cookie = {}

    p = get_proxy()
    resp = crawl_single_page(searchURL, proxy=p, cookie=cookie)
    if not resp:
        invalid_proxy(p)
        return -1
    # 2. Parse the page
    try:
        tree = etree.HTML(resp)
    except Exception as e:
        logger.info('etree error: %s' % str(e))
        return -1

    if is_inter or GetTextByXpath(tree, "//title/text()").endswith("携程国际机票"):
        # international tickets
        if len(tree.xpath("//input[@id='queryLogTransNo']")) > 0:
            queryLogTransNo = tree.xpath(
                "//input[@id='queryLogTransNo']")[0].get("value")
Example #22
    # get a proxy
    p = get_proxy(type='')

    # build the initial url
    url_temp = get_url(dept_id, arr_id, dept_date)
    search_id = get_search_id(url_temp, proxy=p)

    if search_id == '':
        logger.error('Search_Id Error: get Search_Id failed')
        return None

    trip_id = get_trip_id(dept_id, arr_id, dept_date)

    # Use the initial url to fetch the result pages; page_num is the total page count
    start_url = get_start_url(search_id, trip_id)
    content_temp = crawl_single_page(start_url, proxy=p, Host="www.wego.cn",
                                     Accept="text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8")
    if content_temp == "":
        logger.error('Proxy Error: htmlcontent is null with proxy: %s' % p)
        # report the proxy as invalid
        invalid_proxy(p)
        return None

    page_num = pageParser(content_temp)
    page_num_get = 0

    if page_num == 0:
        logger.info('Parser Error: cannot find flights with %s - %s' % (dept_id, arr_id))
        return None

    # build the urls to crawl
Example #23
    searchURL = "http://flights.ctrip.com/booking/%s-%s-day-1.html?DCity1=%s&ACity1=%s&DDate1=%s&passengerQuantity=1&SendTicketCity=undefined&PassengerType=ADU&SearchType=S&RouteIndex=1&RelDDate=&RelRDate="
    interSearchURL = "http://flights.ctrip.com/international/ShowFareFirst.aspx?flighttype=S&relddate=%s&dcity=%s&acity=%s"

    is_inter = False
    searcURL = ""
    if dept_id.lower() in CN_AIRPORTS and dest_id.lower() in CN_AIRPORTS:
        searchURL = searchURL % (dept_id, dest_id, dept_id, dest_id, dept_date)
    else:
        searchURL = interSearchURL % (dept_date, dept_id, dest_id)
        is_inter = True
    refererURL = "http://flights.ctrip.com/booking/"
    cookie = {}

    p = get_proxy()
    resp = crawl_single_page(searchURL, proxy=p, cookie=cookie)
    if not resp:
        invalid_proxy(p)
        return None

    # 2. Parse the page
    tree = etree.HTML(resp)
    if is_inter or GetTextByXpath(tree, "//title/text()").endswith("携程国际机票"):
        # international tickets
        queryLogTransNo = tree.xpath("//input[@id='queryLogTransNo']")[0].get("value")
        # fetch the ticket price page
        resp = GetInterPricePage(queryLogTransNo, cookie, searchURL)  #, use_proxy)
        return ParseInterPage(resp)

    else:   # domestic tickets
        return []
Example #24
     dept_time = dept_time[0:4] + '/' + dept_time[4:6] + '/' + dept_time[6:8]
 except Exception as e:
     logger.info('url id wrong: ' + str(e))
     result['error'] = TASK_ERROR
     return result
 cj = cookielib.CookieJar()
 opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
 urllib2.install_opener(opener)
 url_res = get_json_url(dept_city, dest_city, dept_time)
 if url_res != False:
     url = 'http://www.expedia.com.hk/Flight-Search-Outbound?c=' + url_res + '&_=' + str(time.time())
     task_content_proxy = get_proxy(source='expediaFlight')
     if task_content_proxy is None:
         result['error'] = PROXY_NONE
         return result
     html_res = crawl_single_page(url, proxy=task_content_proxy)
     if not html_res:
         result['error'] = PROXY_INVALID
         return result
 else:
     result['error'] = TASK_ERROR
     return result
 try:
     json_list = json.loads(html_res)
     if json_list[key_list[0]] is None:
         result['error'] = DATA_NONE
         return result
     search_legs = json_list[key_list[0]][key_list[1]]
     for legs_list in search_legs:
         for legs_key in legs_list:
             if legs_key == key_list[2]:  # legs list
Example #25
        dept_city = city_dict[dept_id]
        dest_city = city_dict[dest_id]
        dept_city_cn = city_dict_cn[dept_id].encode('utf-8')
        dest_city_cn = city_dict_cn[dest_id].encode('utf-8')
    except Exception, e:
        logger.error('tongchengFlight: wrong content format with %s' % taskcontent + str(e))
        return -1

    dept_date = dept_day[:4] + '-' + dept_day[4:6] + '-' + dept_day[6:]  # 2014-05-10
    dept_time = dept_date + 'T' + dept_minute
    
    #p = get_proxy()  
    url = get_url(dept_city_cn, dest_city_cn, dept_date, dept_city, dest_city)

    if url != '':
        page = crawl_single_page(url, proxy = None, Accept='text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8')
    else:
        return -1
    
    if page != '' and len(page) > 100:
        result = ParsePage(page,flight_no,dept_time)
    else:
        #invalid_proxy(p)
        return -1

    return result


def get_url(dept_city, dest_city, dept_date, dept_id, dest_id, proxy = None):
    parser_url = ''
    # Note: the literal Chinese placeholders in the URL below ('时间/日期' =
    # 'time/date', '城市名' = 'city name') appear to be the search form's
    # defaults for the unused multi-city fields and are sent verbatim.
    url_temp = 'http://www.ly.com/iflight/flightinterajax.aspx?action=SEARCHURL&airplaneInternatType=1&iOrgPort=' + dept_city + '&iArvPort=' + dest_city + '&idtGoDate=' + dept_date + '&idtBackDate=时间/日期&sel_inCabinType=Y&sel_inPassengersType=1&sel_inAdult=1&sel_inChild=0&iOrgPortMult=城市名&iArvPortMult=城市名&idtGoDateMult=时间/日期&iOrgPortMult=城市名&iArvPortMult=城市名&idtGoDateMult=时间/日期&iOrgPortMult=城市名&iArvPortMult=城市名&idtGoDateMult=时间/日期&iOrgPortMult=城市名&iArvPortMult=城市名&idtGoDateMult=时间/日期&iOrgPortMult=城市名&iArvPortMult=城市名&idtGoDateMult=时间/日期&iOrgPortMult=城市名&iArvPortMult=城市名&idtGoDateMult=时间/日期&callback=tc10805565235'
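The hand-concatenated query string above is easy to get wrong. Below is a sketch of the same URL assembled with urllib.urlencode (Python 2); the parameter names and literal placeholder values are simply read off the string above, not a verified description of the ly.com API.

import urllib

def build_flight_url(dept_city, dest_city, dept_date):
    # Parameter list read off the hand-built string above; the Chinese
    # placeholder values are kept verbatim since the site seems to expect them.
    params = [
        ('action', 'SEARCHURL'),
        ('airplaneInternatType', '1'),
        ('iOrgPort', dept_city),
        ('iArvPort', dest_city),
        ('idtGoDate', dept_date),
        ('idtBackDate', '时间/日期'),
        ('sel_inCabinType', 'Y'),
        ('sel_inPassengersType', '1'),
        ('sel_inAdult', '1'),
        ('sel_inChild', '0'),
    ]
    # The original repeats the three *Mult placeholders six times; a list
    # of pairs preserves duplicate keys and their order.
    params += [('iOrgPortMult', '城市名'),
               ('iArvPortMult', '城市名'),
               ('idtGoDateMult', '时间/日期')] * 6
    params.append(('callback', 'tc10805565235'))
    return 'http://www.ly.com/iflight/flightinterajax.aspx?' + urllib.urlencode(params)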
Example #36
0
        #logger.info(ipathid)
        room_type = infos[2]
        checkin_date = infos[3].split('-')[0]  #format:2014-05-05
        checkout_date = infos[3].split('-')[1]  #format:2014-05-06
        real_source = infos[4].split('::')[-1]
        #logger.info('type' + room_type + ' source' + real_source)
    except Exception, e:
        logger.error('wrong content format' + str(e))
        return -1

    p = get_proxy()

    room = Room()

    price_url = get_price_url(hotel_id, ipathid, checkin_date, checkout_date)
    price_page = crawl_single_page(price_url, n=1, proxy=p)
    price_list = price_parser(price_page, hotel_id)

    result = 1000000  # initialize to a very large sentinel value

    if price_list != []:
        for each_room in price_list:
            if len(each_room) > 3:
                #room.city = city
                #room.occupancy = 1
                #room.hotel_name = hotel.hotel_name
                #print each_room
                #room.room_desc = each_room[3]
                room.real_source = each_room[2]

                num = each_room[3].find('-')
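The room-type rule used here and in the youzhan snippet above (take the text before the first '-', but only when it is shorter than 20 characters) is compact enough to factor out. A sketch, assuming the same 20-character cap:

def extract_room_type(room_desc, max_len=20):
    # Text before the first '-' is treated as the room type; fall back to
    # the whole description, and report 'NULL' when the candidate is too long.
    num = room_desc.find('-')
    candidate = room_desc[:num] if num > 0 else room_desc
    return candidate if len(candidate) < max_len else 'NULL'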
Example #37
0
    #print p
    #p = None

    if p == None:
        result['error'] = PROXY_NONE
        return result

    first_url = FIRST_URL % (dept_city_en, dest_city_en, dept_day_temp,
                             dest_day_temp)
    cj = cookielib.CookieJar()
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
    urllib2.install_opener(opener)

    resp = crawl_single_page(first_url,
                             proxy=p,
                             Accept=accept,
                             referer=HOST,
                             n=1)
    #resp = ''
    #print '________________'
    if resp.find('404错误') < 0:  # '404错误' = '404 error' page marker
        url = get_url(dept_city_zh, dest_city_zh, dept_day, dest_day)

        i = 0
        content_len = 0
        while i < 3 and content_len < CONTENT_LEN:
            page = crawl_single_page(url, proxy=p, referer=first_url, n=1)
            content_len = len(page)
            i += 1

        if page != '' and page != None and len(page) > CONTENT_LEN:
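The retry idiom above (re-crawl until the response is longer than CONTENT_LEN, at most three times) recurs across these snippets. A sketch of it as a single helper, assuming crawl_single_page and CONTENT_LEN as defined in these modules:

def crawl_until_long_enough(url, proxy=None, referer=None, attempts=3):
    # Re-request the page until it looks complete (longer than CONTENT_LEN)
    # or the attempt budget runs out; the last response is returned either way.
    page = ''
    for _ in range(attempts):
        page = crawl_single_page(url, proxy=proxy, referer=referer, n=1)
        if page and len(page) >= CONTENT_LEN:
            break
    return page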
Example #38
0
        check_out = str(check_out_temp + datetime.timedelta(days=1))[:10]

    except Exception, e:
        logger.error('bookingHotel: Wrong Content Format with %s' %
                     taskcontent)
        result['error'] = TASK_ERROR
        return result

    hotel_url = get_hotel_url(url_hotel_name, check_in, check_out)

    p = get_proxy(source='bookingHotel')
    if p == None:
        result['error'] = PROXY_NONE
        return result

    page = crawl_single_page(hotel_url, proxy=p)
    if page == None or page == '':
        invalid_proxy(proxy=p, source='bookingHotel')
        result['error'] = PROXY_INVALID
        return result

    if len(page) > CONTENT_LEN:
        room_info = parseRoom(page, check_in, check_out, hotel_id)
    else:
        result['error'] = UNKNOWN_TYPE
        return result

    if room_info != []:
        result['para'] = room_info
        return result
    else:
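These task parsers share one return contract: a dict with an integer 'error' code and a 'para' payload, where error == 0 means usable data. A sketch of a caller honoring that contract; the constant values and the run_task dispatcher are illustrative assumptions, only the dict shape comes from the snippets.

# Illustrative values only; the real constants are defined elsewhere in
# these modules and their numeric values are not shown here.
TASK_ERROR, PROXY_NONE, PROXY_INVALID, UNKNOWN_TYPE, DATA_NONE = range(1, 6)

def run_task(parser, taskcontent):
    # Every *_task_parser returns {'error': <code>, 'para': <payload>}.
    result = parser(taskcontent)
    if result['error'] == 0:
        return result['para']
    # Non-zero codes let the scheduler decide between retrying with a
    # fresh proxy (PROXY_*) and dropping the task (TASK_ERROR, DATA_NONE).
    return None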
Example #39
0
        origday = datetime.datetime(string.atoi(dept_date[0:4]),
                                    string.atoi(dept_date[5:7]),
                                    string.atoi(dept_date[8:]))
        urlday = (origday - datetime.datetime.today()).days
        #dept_date = orig_date
        #logger.info('contents: %s %s %s %s '%(location,flight_no,dept_date,str(urlday)))
    except Exception, e:
        logger.error(str(e))
        logger.error('Content Error: Wrong content format with %s' % content)
        return result

    url = URL % (location, urlday)

    p = get_proxy(source='elongFlight')

    htmlcontent = crawl_single_page(url, n=1, proxy=p)
    if htmlcontent == '':
        invalid_proxy(p)
        logger.error('Proxy Error: htmlcontent is null with proxy: %s' % p)
        return result

    # Check whether a navigation page came back; a navigation page means the content has no flight info

    # Check whether flight info was found; if none, return []
    temp_flight_list = flightPattern.findall(htmlcontent)
    if len(temp_flight_list) == 1:
        logger.error('Parser Error: cannot find flights with %s' % location)
        return result

    # The last findall match is page boilerplate rather than a flight, so drop it.
    flight_list = temp_flight_list[:-1]
Example #40
0
    
    # Build the initial URL and obtain the search id
    url_temp = get_url(dept_id, arr_id, dept_date)
    search_id = get_search_id(url_temp, proxy=p)

    if search_id == '':
        logger.error('Search_Id Error: get Search_Id failed')
        invalid_proxy(proxy=p, source='wegoFlight')
        result['error'] = PROXY_FORBIDDEN
        return result

    trip_id = get_trip_id(dept_id, arr_id, dept_date)

    # Use the initial URL to fetch the first result page; page_num below is
    # the total number of result pages.
    start_url = get_start_url(search_id, trip_id)
    content_temp = crawl_single_page(start_url, proxy=p, Host="www.wego.cn", Accept="text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8")
    if content_temp == "":
        logger.error('Proxy Error: htmlcontent is null with proxy: %s' % p)
        # Report the proxy as invalid
        invalid_proxy(proxy=p, source='wegoFlight')
        result['error'] = PROXY_INVALID
        return result

    page_num = pageParser(content_temp)
    page_num_get = 0

    if page_num == 0:
        logger.info('Parser Error: cannot find flights with %s - %s'%(dept_id,arr_id))
        result['error'] = DATA_NONE
        return result
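Once pageParser has reported the page count, the remaining result pages presumably get fetched one by one. A sketch of that loop; get_page_url is an assumed helper, not shown in the snippet:

def crawl_all_pages(search_id, trip_id, page_num, proxy):
    # Fetch each result page and collect the raw contents for parsing.
    pages = []
    for page in range(1, page_num + 1):
        url = get_page_url(search_id, trip_id, page)  # assumed helper
        content = crawl_single_page(url, proxy=proxy)
        if content:
            pages.append(content)
    return pages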
Example #41
0
        check_out_temp = datetime.datetime(int(check_in_temp[:4]), int(check_in_temp[4:6]), int(check_in_temp[6:]))
        check_out = str(check_out_temp + datetime.timedelta(days=1))[:10]

    except Exception, e:
        logger.error('bookingHotel: Wrong Content Format with %s' % taskcontent)
        result['error'] = TASK_ERROR
        return result

    hotel_url = get_hotel_url(url_hotel_name, check_in, check_out)

    p = get_proxy(source='bookingHotel')
    if p == None:
        result['error'] = PROXY_NONE
        return result

    page = crawl_single_page(hotel_url, proxy=p)
    if page == None or page == '':
        invalid_proxy(proxy=p, source='bookingHotel')
        result['error'] = PROXY_INVALID
        return result
    
    if len(page) > CONTENT_LEN:
        room_info = parseRoom(page, check_in, check_out, hotel_id)
    else:
        result['error'] = UNKNOWN_TYPE
        return result
    
    if room_info != []:
        result['para'] = room_info
        return result
    else:
Example #42
0
    if p == None:
        result['error'] = PROXY_NONE
        return result

    # Build the URL and check that it is usable
    url = get_url(dept_code, dest_code, dept_day)

    if url == '' or url == None:
        logger.error('feifanFlight: Get url failed!')
        result['error'] = UNKNOWN_TYPE
        return result

    # Fetch the page and check that it is usable.
    # feifan often needs a refresh before returning content, so crawl up to 3 times.
    for i in range(3):
        page = crawl_single_page(url, proxy=p)

        if page != '' and len(page) > 300:
            flights = parsePage(page, dept_year)
            if flights == []:
                if page.find('非凡旅行网-抱歉,您没有权限访问') != -1:  # "sorry, you have no permission to access" ban page
                    invalid_proxy(proxy=p, source='feifanFlight')
                    result['error'] = PROXY_FORBIDDEN
                else:
                    result['error'] = DATA_NONE
                return result
            else:
                result['para'] = flights
                return result
        else:
            continue
Example #43
0
        if len(info_list) < 5:
            return []
    except Exception, e:
        logger.error('tongchengFlight: wrong content format with %s' % taskcontent)
        return []

    dept_id, dest_id, dept_city, dest_city, dept_date_temp = info_list[0], info_list[1], \
            info_list[2], info_list[3], info_list[4]

    dept_day = dept_date_temp[:4] + '-' + dept_date_temp[4:6] + '-' + dept_date_temp[6:]
    
    p = get_proxy()
        
    url = get_url(dept_city, dest_city, dept_day, dept_id, dest_id, p)

    if url != '':
        page = crawl_single_page(url, proxy = p)#, Accept='text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8')
    else:
        logger.error('tongchengFlight: Get url failed!')
        return flights
    
    if page != '' and len(page) > 100:
        flights = ParsePage(page)
    else:
        logger.error('tongchengFlight: Crawl page failed!')
        return flights

    return flights


def get_url(dept_city, dest_city, dept_date, dept_id, dest_id, proxy):
    parser_url = ''
Example #44
0
    room = Room()

    price_url = get_price_url(hotel_id,ipathid,from_date,to_date)
    i = 0
    content_len = 0
    while i < 5 and content_len < CONTENT_LEN:
        #p = get_proxy()
        p = get_proxy(source='youzhanHotel')
        #print p
        if p == None:
            result['error'] = PROXY_NONE
            return result

        url = price_url + str(int(time.time() * 1000))  # millisecond timestamp appended as a cache-buster
        price_page = crawl_single_page(url,proxy=p,n=1)
        content_len = len(price_page)
        i += 1

    if price_page == None or price_page == '':
        invalid_proxy(proxy=p, source='youzhanHotel')
        result['error'] = PROXY_INVALID
        return result
    #print price_page
    price_list = price_parser(price_page,hotel_id)

    if price_list != []:
        for each_room in price_list:
            if len(each_room) > 3:
                room.city = city
                room.occupancy = 2
Example #45
0
        dept_date = dept_day[0:4] + '-' + dept_day[4:6] + '-' + dept_day[6:]
        dept_year = dept_date[:4]

        orig_dept_time = dept_date + 'T' + dept_hour + ':00'
    except Exception, e:
        logger.error('feifanFlight: wrong content format with %s' % content + str(e))
        return -1

    # Get a proxy
    #p = get_proxy()

    # Build the URL and check that it is usable
    url = get_url(dept_id, dest_id, dept_date)

    if url != '' and url != None:
        page = crawl_single_page(url, proxy=None)
    else:
        logger.error('feifanFlight: Get url failed!')
        return -1
    # Fetch the page and check that it is usable
    if page != '' and len(page) > 300:
        result = parsePage(page, dept_year,flight_no, orig_dept_time)
    else:
        logger.error('feifanFlight: Get page content failed!')
        return -1
        
    return result

def parsePage(content,dept_year, flight_no, orig_dept_time):

    result = -1
Example #46
0
        return []

    check_in = check_in_temp[:4] + '-' + check_in_temp[
        4:6] + '-' + check_in_temp[6:]
    check_out_temp = datetime.datetime(int(check_in_temp[:4]), int(check_in_temp[4:6]), \
            int(check_in_temp[6:]))
    check_out = str(check_out_temp + datetime.timedelta(days=1))[:10]

    hotel_url = get_hotel_url(url_hotel_name, check_in, check_out)
    logger.info(hotel_url)
    p = get_proxy()
    logger.info(p)
    i = 0
    content_len = 0
    while i < 3 and content_len < CONTENT_LEN:
        content = crawl_single_page(hotel_url, proxy=p)
        content_len = len(content) if content else 0  # track the length so the retry loop can stop early
        if content_len > CONTENT_LEN:
            break
        i += 1
    logger.info('Content len: ' + str(content_len))
    room_info = parseRoom(content, check_in, check_out, hotel_id)

    return room_info


def parseRoom(content, check_in, check_out, hotel_id):
Example #47
0
def csair_task_parser(taskcontent):
  result = {}
  multi_ticket = []
  one_flight = {}
  result['para'] = {'flight':one_flight, 'ticket':multi_ticket}
  result['error'] = 0
  try:
      param_list = taskcontent.strip().split('&')
      url = 'http://b2c.csair.com/B2C40/detail-' + param_list[0] + param_list[1] + '-' \
          + param_list[2] + '-1-0-0-0-1-0-1-0-1-0.g2c'
  except:
      logger.info('url param is not valid\n')
      result['error'] = TASK_ERROR
      return result
  # Initialize all params
  dic_flightdate = {}
  multi_price = []
  select_time = 0
  Flag1 = False
  Flag2 = False
  page_flag = False
  cj = cookielib.CookieJar()
  opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
  urllib2.install_opener(opener)
  task_content_proxy = get_proxy(source='csairFlight')
  if task_content_proxy == None:
    result['error'] = PROXY_NONE
    return result
  html = crawl_single_page(url, proxy=task_content_proxy)
  if html == '' or html == None:
    result['error'] = PROXY_INVALID
    return result
  pattern = re.compile(r'\s*<FLIGHTS>\s*')
  match = pattern.search(html)
  if match and len(html) > CONTENT_LEN:
    dom = etree.fromstring(html)
    for ele in dom.iter():
         if ele.tag is not None:
           if ele.tag in word_flightdate:
              dic_flightdate[ele.tag] = ele.text
           elif ele.tag in word_parent_list:
            page_flag = True  # a DateFlight node
            multi_flight = []
            Flight = nanhang_flight()
            select_time += 1
            flight_num = 0
            ticket_dur_list = []
            for word in ele:
              if word.tag in word_list[0]:
                flight_num += 1
                dic_flight = {}
                EachFlight = nanhang_eachflight()
                for word_child in word:
                  if word_child.tag in word_child_list[0]:
                    Flag1 = True
                    dic_flight[word_child.tag] = word_child.text  # each flight field
                if Flag1 == True:
                    try:
                       Flag1 = False
                       EachFlight.flight_no = dic_flight[word_child_list[0][0]]
                       EachFlight.dept_id = dic_flight[word_child_list[0][1]]
                       EachFlight.dest_id = dic_flight[word_child_list[0][2]]
                       EachFlight.flight_key = EachFlight.flight_no + '_' + EachFlight.dept_id + '_' + EachFlight.dest_id
                       dept_time = dic_flight[word_child_list[0][3]]
                       EachFlight.dept_time = dept_time[0:10] + 'T' + dept_time[-5:len(dept_time)]
                       dest_time = dic_flight[word_child_list[0][4]]
                       EachFlight.dest_time = dest_time[0:10] + 'T' + dest_time[-5:len(dest_time)]
                       EachFlight.dur = get_duration(dest_time, EachFlight.dest_id, dept_time, EachFlight.dept_id)
                       EachFlight.dept_time = EachFlight.dept_time + ':00'
                       EachFlight.dest_time = EachFlight.dest_time + ':00'
                       ticket_dur_list.append(EachFlight.dur)
                       EachFlight.airline = '南方航空公司'  # China Southern Airlines
                       EachFlight.plane_no = dic_flight[word_child_list[0][5]]  # rebuild and compute flight
                    except KeyError, e:
                        print e
                    else:
                        one_flight[EachFlight.flight_key] = (EachFlight.flight_no, EachFlight.airline, EachFlight.plane_no,EachFlight.dept_id,EachFlight.dest_id,EachFlight.dept_time, EachFlight.dest_time,EachFlight.dur)
                        multi_flight.append((EachFlight.flight_key,EachFlight.flight_no, EachFlight.airline, EachFlight.plane_no,EachFlight.dept_id,EachFlight.dest_id,EachFlight.dept_time, EachFlight.dest_time,EachFlight.dur)) #list of multi flight
              elif word.tag in word_list[1]:
                multi_price = []  # node of price
                for word_child in word:
                  if word_child.tag in word_next_list:
                      dic_ticket = {}
                      for word_next_child in word_child:
                        if word_next_child.tag in word_child_list[1]:
                          Flag2 = True
                          dic_ticket[word_next_child.tag] = word_next_child.text
                      if Flag2 == True:
                          try:
                            Flag2 = False
                            Flight.price = string.atof(dic_ticket[word_child_list[1][0]])
                            Flight.tax = string.atof(dic_ticket[word_child_list[1][1]]) + string.atof(dic_ticket[word_child_list[1][2]]) + string.atof(dic_ticket[word_child_list[1][3]])
                            Flight.currency = dic_ticket[word_child_list[1][4]]
                            Flight.seat_type = dic_ticket[word_child_list[1][5]]
                            # Map cabin codes to their Chinese display names
                            if Flight.seat_type == 'ECONOMY':
                              Flight.seat_type = '经济舱'
                            if Flight.seat_type == 'BUSINESS':
                              Flight.seat_type = '商务舱'
                            if Flight.seat_type == 'FIRST':
                              Flight.seat_type = '头等舱'
                            if Flight.seat_type == 'PREMIUMECONOMY':
                              Flight.seat_type = '超经济舱'
                            Flight.return_rule = 'NULL'
                            Flight.stop = flight_num - 1
                            Flight.surcharge = -1
                            Flight.source = 'csair::csair'
                          except KeyError, e:
                              print e
                          else:
                              multi_price.append((Flight.price, Flight.tax, Flight.surcharge, Flight.currency,Flight.seat_type, Flight.source, Flight.return_rule, Flight.stop))
            if select_time != 0:
               if multi_flight != []:
                 Flight.flight_no = '_'.join([item[1] for item in multi_flight])
                 Flight.plane_no = '_'.join([item[3] for item in multi_flight])
                 Flight.airline = '_'.join([item[2] for item in multi_flight])
                 Flight.dept_id = multi_flight[0][4]
                 Flight.dest_id = multi_flight[-1][5]
                 # dept_day: 'YYYYMMDD' -> 'YYYY-MM-DD'
                 Flight.dept_day = dic_flightdate[word_flightdate[0]][0:4] + '-' + dic_flightdate[word_flightdate[0]][4:6] + '-' + dic_flightdate[word_flightdate[0]][6:8]
                 Flight.dept_time = multi_flight[0][6]
                 Flight.dest_time = multi_flight[-1][7]
                 Flight.dur = get_duration(Flight.dest_time, Flight.dest_id, Flight.dept_time, Flight.dept_id)
                 for i in range(len(multi_price)):
                   multi_ticket.append((Flight.flight_no, Flight.plane_no, Flight.airline, Flight.dept_id, Flight.dest_id,\
                     Flight.dept_day, Flight.dept_time, Flight.dest_time, Flight.dur, multi_price[i][0], multi_price[i][1],\
                     multi_price[i][2], multi_price[i][3], multi_price[i][4], multi_price[i][5], multi_price[i][6], multi_price[i][7]))
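The 'YYYYMMDD' to 'YYYY-MM-DD' slicing above appears in several of these snippets (tongcheng, feifan, and the dept_day handling here). A small helper with the same behavior, a sketch rather than anything the original modules define:

def format_day(day8):
    # '20140510' -> '2014-05-10', exactly the slicing used inline above.
    return day8[:4] + '-' + day8[4:6] + '-' + day8[6:]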