def ParseInterPage(page):
    """Parse an international flight-list page into ticket tuples.

    ``page`` is decoded as GBK (undecodable bytes are ignored) and read with
    ``jsonlib.read``.  One tuple is produced for every entry of
    ``data["FlightList"]``, laid out as::

        (flight_no, plane_no, airline, dept_id, dest_id, dept_day,
         dept_time, dest_time, dur, price, tax, surcharge, currency,
         seat_type, source, return_rule, stop)

    Per-segment flight numbers, airline names and craft types are joined
    with ``_``.  ``return_rule`` is whatever ``Flight()`` initialises it to.

    Raises IndexError when an entry has an empty ``FlightDetail`` list, and
    ValueError when a timestamp does not match ``%Y-%m-%d %H:%M:%S``.
    """
    stamp_format = '%Y-%m-%d %H:%M:%S'
    payload = jsonlib.read(page.decode("GBK", "ignore"))
    rows = []
    for entry in payload["FlightList"]:
        departure = datetime.datetime.strptime(entry["DepartTime"], stamp_format)
        departure = str(departure).replace(' ', 'T')
        arrival = datetime.datetime.strptime(entry["ArrivalTime"], stamp_format)
        arrival = str(arrival).replace(' ', 'T')

        # Flight information: fold every segment into a single record.
        record = Flight()
        flight_nos = []
        airlines = []
        crafts = []
        departure_ports = []
        for segment in entry["FlightDetail"]:
            flight_nos.append(segment["FlightNo"])
            airlines.append(segment["AirlineName"])
            crafts.append(segment["CraftType"])
            departure_ports.append(segment["DPort"])
            # The last segment's arrival port ends up as the destination.
            record.dest_id = segment["APort"]

        # NOTE(review): this stores the number of segments, not segments - 1
        # as other parsers in this file do -- confirm which is intended.
        record.stop = len(departure_ports)
        record.dept_id = departure_ports[0]
        record.flight_no = '_'.join(flight_nos)
        record.airline = '_'.join(airlines)
        record.plane_no = '_'.join(crafts)

        record.dept_time = departure
        record.dest_time = arrival
        record.dept_day = record.dept_time.split('T')[0]

        record.price = int(entry["Price"])
        record.surcharge = int(
            GetPriceByClass(entry["OilFeeImage"], TaxPriceClasses))
        record.tax = int(GetPriceByClass(entry["TaxImage"], TaxPriceClasses))
        # FlightTime is multiplied by 60 to give the duration in seconds.
        record.dur = int(entry["FlightTime"]) * 60
        record.currency = "CNY"
        record.source = "ctrip::ctrip"
        record.seat_type = entry["ClassName"]

        rows.append((
            record.flight_no, record.plane_no, record.airline,
            record.dept_id, record.dest_id, record.dept_day,
            record.dept_time, record.dest_time, record.dur,
            record.price, record.tax, record.surcharge,
            record.currency, record.seat_type, record.source,
            record.return_rule, record.stop,
        ))
    return rows
def DictTuple(self, Dict_list):
    """Convert a list of ticket dicts into fixed-order ticket tuples.

    Every dict in ``Dict_list`` must provide all keys listed in
    ``copied_fields`` below (KeyError otherwise).  The values are copied onto
    one shared ``Flight`` object and then read back in the order given by
    ``tuple_layout``; ``currency`` and ``source`` are never assigned here, so
    they carry whatever ``Flight()`` initialises them to.

    Returns a list with one 23-element tuple per input dict.
    """
    # (Flight attribute, source dict key) pairs, in the order they are copied.
    copied_fields = (
        ('plane_type', 'plane_type'),
        ('flight_no', 'flight_no'),
        ('flight_corp', 'flight_corp'),
        ('dept_day', 'dept_day'),
        ('stop_time', 'stoptime'),
        ('dept_time', 'dept_time'),
        ('dest_time', 'dest_time'),
        ('stop_id', 'stop_id'),
        ('dept_id', 'dept_id'),
        ('dest_id', 'dest_id'),
        ('dur', 'dur'),
        ('rest', 'rest'),
        ('stop', 'stop'),
        ('return_rule', 'return_rule'),
        ('seat_type', 'seat_type'),
        ('real_class', 'real_class'),
        ('surcharge', 'surcharge'),
        ('promotion', 'promotion'),
        ('package', 'package'),
        ('daydiff', 'daydiff'),
        ('price', 'price'),
        ('tax', 'tax'),
    )
    # Attribute order of each output tuple ('package' is copied but not emitted).
    tuple_layout = (
        'flight_no', 'plane_type', 'flight_corp', 'dept_id', 'dest_id',
        'dept_day', 'dept_time', 'dest_time', 'dur', 'rest', 'price', 'tax',
        'surcharge', 'promotion', 'currency', 'seat_type', 'real_class',
        'stop_id', 'stop_time', 'daydiff', 'source', 'return_rule', 'stop',
    )

    record = Flight()
    rows = []
    for item in Dict_list:
        for attribute, key in copied_fields:
            setattr(record, attribute, item[key])
        rows.append(tuple(getattr(record, name) for name in tuple_layout))
    return rows
def parseFlightAndTicket(content_temp, time_zone_A, time_zone_B):
    """Parse one easyJet JSON response into flight and ticket tuples.

    ``content_temp`` is UTF-8 encoded, the pound sign is replaced by ``GBP``
    and the result is parsed as JSON; the flight details are then pulled out
    of its ``'Html'`` member with the module-level ``*_pat`` regexes.
    ``time_zone_A`` and ``time_zone_B`` are only forwarded to ``durCal`` to
    compute the duration.

    Returns a dict ``{'ticket': [...], 'flight': {...}}``:

    * ``'flight'`` maps ``"<flight_no>_<dept_id>_<dest_id>"`` to
      (flight_no, airline, plane_no, dept_id, dest_id, dept_time, dest_time,
      dur);
    * ``'ticket'`` holds a single 17-element ticket tuple.

    Any exception raised while parsing is printed and swallowed, in which
    case the (possibly still empty) result built so far is returned.
    """
    content = content_temp.encode('utf-8')
    content = content.replace('£', 'GBP')

    result = {'ticket': [], 'flight': {}}
    flight = Flight()
    eachflight = EachFlight()

    try:
        content_json = json.loads(content)
        flight_content = content_json['Html'].replace('\n', '')

        flight.flight_no = 'EZY' + flight_no_pat.findall(flight_content)[0]
        flight.airline = 'easyjet'
        flight.dept_id = dept_id_pat.findall(flight_content)[0]
        flight.dest_id = dest_id_pat.findall(flight_content)[0]
        # Matched times are turned into ISO form: space -> 'T', seconds appended.
        flight.dept_time = dept_time_pat.findall(flight_content)[0].replace(' ', 'T') + ':00'
        flight.dest_time = dest_time_pat.findall(flight_content)[0].replace(' ', 'T') + ':00'
        flight.price = price_pat.findall(flight_content)[0]
        flight.seat_type = '经济舱'
        flight.source = 'easyjet::easyjet'
        flight.currency = currency_pat.findall(flight_content)[0]
        flight.stop = 0
        flight.dept_day = flight.dept_time.split('T')[0]
        flight.dur = durCal(flight.dept_time, flight.dest_time,
                            time_zone_A, time_zone_B)

        eachflight.flight_key = '_'.join(
            (flight.flight_no, flight.dept_id, flight.dest_id))
        eachflight.flight_no = flight.flight_no
        eachflight.airline = 'easyjet'
        eachflight.dept_id = flight.dept_id
        eachflight.dest_id = flight.dest_id
        eachflight.dept_time = flight.dept_time
        eachflight.dest_time = flight.dest_time
        eachflight.dur = flight.dur

        # plane_no is never assigned: it is whatever EachFlight() defaults to.
        result['flight'][eachflight.flight_key] = (
            eachflight.flight_no, eachflight.airline, eachflight.plane_no,
            eachflight.dept_id, eachflight.dest_id, eachflight.dept_time,
            eachflight.dest_time, eachflight.dur,
        )
        # plane_no, tax, surcharge and return_rule are Flight() defaults.
        result['ticket'] = [(
            flight.flight_no, flight.plane_no, flight.airline,
            flight.dept_id, flight.dest_id, flight.dept_day,
            flight.dept_time, flight.dest_time, flight.dur, flight.price,
            flight.tax, flight.surcharge, flight.currency, flight.seat_type,
            flight.source, flight.return_rule, flight.stop,
        )]
    except Exception as e:
        # Best effort: report the failure and return what was collected.
        print(str(e))

    return result
def parseFlightAndTicket(content_temp, time_zone_A, time_zone_B):
    """Parse one easyJet JSON response into flight and ticket tuples.

    ``content_temp`` is UTF-8 encoded, the pound sign is replaced by ``GBP``
    and the result is parsed as JSON; the flight details are then pulled out
    of its ``'Html'`` member with the module-level ``*_pat`` regexes.
    ``time_zone_A`` and ``time_zone_B`` are only forwarded to ``durCal`` to
    compute the duration.

    Returns a dict ``{'ticket': [...], 'flight': {...}}``:

    * ``'flight'`` maps ``"<flight_no>_<dept_id>_<dest_id>"`` to
      (flight_no, airline, plane_no, dept_id, dest_id, dept_time, dest_time,
      dur);
    * ``'ticket'`` holds a single 17-element ticket tuple.

    Any exception raised while parsing is printed and swallowed, in which
    case the (possibly still empty) result built so far is returned.
    """
    content = content_temp.encode('utf-8')
    content = content.replace('£', 'GBP')

    result = {'ticket': [], 'flight': {}}
    flight = Flight()
    eachflight = EachFlight()

    try:
        content_json = json.loads(content)
        flight_content = content_json['Html'].replace('\n', '')

        flight.flight_no = 'EZY' + flight_no_pat.findall(flight_content)[0]
        flight.airline = 'easyjet'
        flight.dept_id = dept_id_pat.findall(flight_content)[0]
        flight.dest_id = dest_id_pat.findall(flight_content)[0]
        # Matched times are turned into ISO form: space -> 'T', seconds appended.
        flight.dept_time = dept_time_pat.findall(flight_content)[0].replace(' ', 'T') + ':00'
        flight.dest_time = dest_time_pat.findall(flight_content)[0].replace(' ', 'T') + ':00'
        flight.price = price_pat.findall(flight_content)[0]
        flight.seat_type = '经济舱'
        flight.source = 'easyjet::easyjet'
        flight.currency = currency_pat.findall(flight_content)[0]
        flight.stop = 0
        flight.dept_day = flight.dept_time.split('T')[0]
        flight.dur = durCal(flight.dept_time, flight.dest_time,
                            time_zone_A, time_zone_B)

        eachflight.flight_key = '_'.join(
            (flight.flight_no, flight.dept_id, flight.dest_id))
        eachflight.flight_no = flight.flight_no
        eachflight.airline = 'easyjet'
        eachflight.dept_id = flight.dept_id
        eachflight.dest_id = flight.dest_id
        eachflight.dept_time = flight.dept_time
        eachflight.dest_time = flight.dest_time
        eachflight.dur = flight.dur

        # plane_no is never assigned: it is whatever EachFlight() defaults to.
        result['flight'][eachflight.flight_key] = (
            eachflight.flight_no, eachflight.airline, eachflight.plane_no,
            eachflight.dept_id, eachflight.dest_id, eachflight.dept_time,
            eachflight.dest_time, eachflight.dur,
        )
        # plane_no, tax, surcharge and return_rule are Flight() defaults.
        result['ticket'] = [(
            flight.flight_no, flight.plane_no, flight.airline,
            flight.dept_id, flight.dest_id, flight.dept_day,
            flight.dept_time, flight.dest_time, flight.dur, flight.price,
            flight.tax, flight.surcharge, flight.currency, flight.seat_type,
            flight.source, flight.return_rule, flight.stop,
        )]
    except Exception as e:
        # Best effort: report the failure and return what was collected.
        print(str(e))

    return result
def ParseInterPage(page):
    """Parse an international flight-list page into ticket tuples.

    ``page`` is decoded as GBK (undecodable bytes are ignored) and read with
    ``jsonlib.read``.  One tuple is produced for every entry of
    ``data["FlightList"]``, laid out as::

        (flight_no, plane_no, airline, dept_id, dest_id, dept_day,
         dept_time, dest_time, dur, price, tax, surcharge, currency,
         seat_type, source, return_rule, stop)

    Per-segment flight numbers, airline names and craft types are joined
    with ``_``.  ``return_rule`` is whatever ``Flight()`` initialises it to.

    Raises IndexError when an entry has an empty ``FlightDetail`` list, and
    ValueError when a timestamp does not match ``%Y-%m-%d %H:%M:%S``.
    """
    stamp_format = '%Y-%m-%d %H:%M:%S'
    payload = jsonlib.read(page.decode("GBK", "ignore"))
    rows = []
    for entry in payload["FlightList"]:
        departure = datetime.datetime.strptime(entry["DepartTime"], stamp_format)
        departure = str(departure).replace(' ', 'T')
        arrival = datetime.datetime.strptime(entry["ArrivalTime"], stamp_format)
        arrival = str(arrival).replace(' ', 'T')

        # Flight information: fold every segment into a single record.
        record = Flight()
        flight_nos = []
        airlines = []
        crafts = []
        departure_ports = []
        for segment in entry["FlightDetail"]:
            flight_nos.append(segment["FlightNo"])
            airlines.append(segment["AirlineName"])
            crafts.append(segment["CraftType"])
            departure_ports.append(segment["DPort"])
            # The last segment's arrival port ends up as the destination.
            record.dest_id = segment["APort"]

        # NOTE(review): this stores the number of segments, not segments - 1
        # as other parsers in this file do -- confirm which is intended.
        record.stop = len(departure_ports)
        record.dept_id = departure_ports[0]
        record.flight_no = '_'.join(flight_nos)
        record.airline = '_'.join(airlines)
        record.plane_no = '_'.join(crafts)

        record.dept_time = departure
        record.dest_time = arrival
        record.dept_day = record.dept_time.split('T')[0]

        record.price = int(entry["Price"])
        record.surcharge = int(
            GetPriceByClass(entry["OilFeeImage"], TaxPriceClasses))
        record.tax = int(GetPriceByClass(entry["TaxImage"], TaxPriceClasses))
        # FlightTime is multiplied by 60 to give the duration in seconds.
        record.dur = int(entry["FlightTime"]) * 60
        record.currency = "CNY"
        record.source = "ctrip::ctrip"
        record.seat_type = entry["ClassName"]

        rows.append((
            record.flight_no, record.plane_no, record.airline,
            record.dept_id, record.dest_id, record.dept_day,
            record.dept_time, record.dest_time, record.dur,
            record.price, record.tax, record.surcharge,
            record.currency, record.seat_type, record.source,
            record.return_rule, record.stop,
        ))
    return rows
def ParsePage(tree, params):
    """Parse a ctrip search-result tree into ticket tuples.

    Every ``<div class="search_box">`` describes one flight; its ``data``
    attribute is a ``|``-separated record whose fields 0-3 are read as
    departure time, arrival time, departure id and destination id.  Each row
    of the flight's ``search_table`` contributes one ticket tuple:

        (flight_no, plane_no, airline, dept_id, dest_id, dept_day,
         dept_time, dest_time, dur, price, tax, surcharge, currency,
         seat_type, source, return_rule, stop)

    ``dur``, ``return_rule`` and ``stop`` are ``Flight()`` defaults.
    ``params`` is accepted for interface compatibility and is not used.

    Returns the list of tuples (empty when nothing matches).
    """
    stamp_format = '%Y-%m-%d %H:%M:%S'
    allinfo = []
    for node in tree.xpath("//div[@class='search_box']"):
        # Flight information
        flight = Flight()
        flight.flight_no = GetFlightNo(node.get("id"))
        strs = node.get("data").split("|")
        flight.dept_id = strs[2]
        flight.dest_id = strs[3]
        flight.airline = GetTextByXpath(
            node, "table[1]/tr/td[1]/div[1]/span/text()")
        flight.plane_no = GetAlphanumeric(
            GetAllText(node.xpath("table[1]/tr/td[1]/div[2]/span")[0]))
        airport_tax, fuel_surcharge = GetTax(
            GetTextByXpath(node, "table[1]/tr/td[5]/div[1]/text()"))

        priceNodes = node.xpath("table[@class='search_table']/tr")
        if not priceNodes:
            # No fares for this flight: nothing to emit.
            continue

        # These values are identical for every fare row, so compute them once.
        flight.dept_time = str(
            datetime.datetime.strptime(strs[0], stamp_format)).replace(' ', 'T')
        flight.dest_time = str(
            datetime.datetime.strptime(strs[1], stamp_format)).replace(' ', 'T')
        # dept_time is a string here, so take its date part directly
        # (calling .strftime() on it raised AttributeError).
        flight.dept_day = flight.dept_time.split('T')[0]
        flight.tax = int(airport_tax)
        flight.surcharge = int(fuel_surcharge)
        flight.currency = "CNY"
        flight.source = "ctrip::ctrip"

        for priceNode in priceNodes:
            # Ticket information
            flight.price = int(GetTextByXpath(priceNode, "td[7]/span/text()"))
            flight.seat_type = GetAllText(priceNode.xpath("td[2]")[0])
            allinfo.append((
                flight.flight_no, flight.plane_no, flight.airline,
                flight.dept_id, flight.dest_id, flight.dept_day,
                flight.dept_time, flight.dest_time, flight.dur,
                flight.price, flight.tax, flight.surcharge,
                flight.currency, flight.seat_type, flight.source,
                flight.return_rule, flight.stop,
            ))
    return allinfo
def ValidatePage(content, dept_year, flight_no, orig_dept_time):
    """Look up the current price of one specific itinerary on a result page.

    The page is split into itineraries with ``each_flight_content_pat``.  For
    each one the joined flight number (segments separated by ``_``) and the
    departure time (``<dept_year>-MM-DDTHH:MM:00``) are rebuilt and compared
    with ``flight_no`` and ``orig_dept_time``.

    Returns the integer price of the first matching itinerary, or ``-1`` when
    none matches.  Itineraries that fail to parse are skipped.
    """
    stamp_format = '%Y-%m-%dT%H:%M:%S'
    result = -1
    for each_flight_text in each_flight_content_pat.findall(content):
        flight = Flight()
        try:
            t_price = all_price_pat.findall(each_flight_text)[0]
            each_flight_text_temp = each_flight_content_temp_pat.findall(
                each_flight_text)[0]
            each_part_flight = each_part_flight_pat.findall(
                each_flight_text_temp)
            if not each_part_flight:
                continue

            first_part = each_part_flight[0]
            last_part = each_part_flight[-1]
            # [1:-1] drops the first and last character of the matched airport.
            flight.dept_id = airport_pat.findall(first_part)[0][1:-1]
            flight.dest_id = airport_pat.findall(last_part)[-1][1:-1]

            dept_time_temp = dept_time_temp_pat.findall(first_part)[0]
            dest_time_temp = dest_time_temp_pat.findall(last_part)[-1]
            month = dept_time_temp[0].strip()
            flight.dept_day = dept_year + '-' + month + '-' + dept_time_temp[1].strip()
            flight.dept_time = flight.dept_day + 'T' + dept_time_temp[2].strip() + ':00'
            # NOTE(review): the arrival date reuses the departure month --
            # confirm this is right for arrivals in the following month.
            flight.dest_time = (dept_year + '-' + month + '-' +
                                dest_time_temp[0].strip() + 'T' +
                                dest_time_temp[1].strip()[-5:] + ':00')
            # Validation only: a malformed timestamp raises ValueError and
            # the itinerary is skipped by the handler below.
            datetime.datetime.strptime(flight.dept_time, stamp_format)
            datetime.datetime.strptime(flight.dest_time, stamp_format)

            flight.price = price_pat.findall(each_flight_text)[0]
            if len(flight.price) > 1:
                flight.price = int(flight.price[0])
            else:
                flight.price = int(t_price)

            flight.flight_no = '_'.join(
                flight_no_pat.findall(part)[0][:8].replace(' ', '')
                for part in each_part_flight)

            if flight.flight_no == flight_no and flight.dept_time == orig_dept_time:
                result = flight.price
                break
        except Exception:
            # Best effort: an itinerary that cannot be parsed is ignored.
            continue
    return result
def ParsePage(tree):
    """Parse a ctrip search-result tree into ticket tuples.

    Every ``<div class="search_box">`` describes one flight; its ``data``
    attribute is a ``|``-separated record whose fields 0-3 are read as
    departure time, arrival time, departure id and destination id.  Each row
    of the flight's ``search_table`` contributes one ticket tuple:

        (flight_no, plane_no, airline, dept_id, dest_id, dept_day,
         dept_time, dest_time, dur, price, tax, surcharge, currency,
         seat_type, source, return_rule, stop)

    ``dur``, ``return_rule`` and ``stop`` are ``Flight()`` defaults.

    Returns the list of tuples (empty when nothing matches).
    """
    stamp_format = '%Y-%m-%d %H:%M:%S'
    allinfo = []
    for node in tree.xpath("//div[@class='search_box']"):
        # Flight information
        flight = Flight()
        flight.flight_no = GetFlightNo(node.get("id"))
        strs = node.get("data").split("|")
        flight.dept_id = strs[2]
        flight.dest_id = strs[3]
        flight.airline = GetTextByXpath(
            node, "table[1]/tr/td[1]/div[1]/span/text()")
        flight.plane_no = GetAlphanumeric(
            GetAllText(node.xpath("table[1]/tr/td[1]/div[2]/span")[0]))
        airport_tax, fuel_surcharge = GetTax(
            GetTextByXpath(node, "table[1]/tr/td[5]/div[1]/text()"))

        priceNodes = node.xpath("table[@class='search_table']/tr")
        if not priceNodes:
            # No fares for this flight: nothing to emit.
            continue

        # These values are identical for every fare row, so compute them once.
        flight.dept_time = str(
            datetime.datetime.strptime(strs[0], stamp_format)).replace(' ', 'T')
        flight.dest_time = str(
            datetime.datetime.strptime(strs[1], stamp_format)).replace(' ', 'T')
        # dept_time is a string here, so take its date part directly
        # (calling .strftime() on it raised AttributeError).
        flight.dept_day = flight.dept_time.split('T')[0]
        flight.tax = int(airport_tax)
        flight.surcharge = int(fuel_surcharge)
        flight.currency = "CNY"
        flight.source = "ctrip::ctrip"

        for priceNode in priceNodes:
            # Ticket information
            flight.price = int(GetTextByXpath(priceNode, "td[7]/span/text()"))
            flight.seat_type = GetAllText(priceNode.xpath("td[2]")[0])
            allinfo.append((
                flight.flight_no, flight.plane_no, flight.airline,
                flight.dept_id, flight.dest_id, flight.dept_day,
                flight.dept_time, flight.dest_time, flight.dur,
                flight.price, flight.tax, flight.surcharge,
                flight.currency, flight.seat_type, flight.source,
                flight.return_rule, flight.stop,
            ))
    return allinfo
def vuelingvalidate(content, flight_no, req_dept_time):
    """Look up the current price of one specific Vueling fare.

    Flight numbers, airports, times and prices are extracted from ``content``
    with the module-level ``*_pat`` regexes.  Prices are consumed in groups
    of three per flight (one per entry of ``seat_type_list``), so price ``i``
    belongs to flight ``i // 3``.

    Returns the price string of the first fare whose flight number equals
    ``flight_no`` and whose departure time equals ``req_dept_time``; returns
    ``-1`` when no fare matches.

    Raises IndexError when a price has no corresponding flight entry.
    """
    # Sentinel for "no matching fare" (same convention as ValidatePage).
    result = -1

    flight_num_list = []
    for flight_num_info in flight_no_pat.findall(content):
        first_at = flight_num_info.find('|')
        second_at = flight_num_info.rfind('~^')
        flight_num = flight_num_info[first_at + 1:first_at + 8].replace('~', '')
        if second_at > 0:
            # A second flight number follows the '~^' marker.
            flight_num += '_' + flight_num_info[second_at + 2:second_at + 9].replace('~', '')
        flight_num_list.append(flight_num)

    dept_id_list = []
    dest_id_list = []
    for station_temp_a in station_temp_pat.findall(content):
        station_info = station_temp_a.replace('\n', '').replace(' ', '')
        # The three characters before '):' and before the last ')' are the ids.
        dept_id_num = station_info.find('):')
        dest_id_num = station_info.rfind(')')
        dept_id_list.append(station_info[dept_id_num - 3:dept_id_num])
        dest_id_list.append(station_info[dest_id_num - 3:dest_id_num])

    dept_time_list = []
    dest_time_list = []
    stops_list = []
    for time_temp in flight_time_pat.findall(content):
        dept_time_list.append(dept_time_pat.findall(time_temp)[0])
        dest_time_list.append(dest_time_pat.findall(time_temp)[0])
        stops_list.append(flight_num_pat.findall(time_temp)[0])

    price_list = []
    for price_temp in price_pat.findall(content):
        price_temp_num = price_temp.rfind('>') + 1
        # Drop the trailing three characters and use '.' as decimal separator.
        price_list.append(price_temp[price_temp_num:-3].replace(',', '.'))

    seat_type_list = ['经济舱', '超经济舱', '公务舱']
    fares_per_flight = len(seat_type_list)

    for i, price in enumerate(price_list):
        j = i // fares_per_flight

        flight = Flight()
        flight.flight_no = flight_num_list[j]
        flight.plane_no = 'NULL'
        flight.airline = 'vueling'
        flight.dept_id = dept_id_list[j]
        flight.dest_id = dest_id_list[j]
        flight.dept_time = dept_time_list[j]
        flight.dest_time = dest_time_list[j]

        # Split 'YYYY-MM-DDTHH:MM:SS' into its fields and pad with three
        # zeros before handing it to date_handle.
        dept_time_c = str(flight.dept_time).replace('T', ',').replace(
            '-', ',').replace(':', ',').split(',') + [0, 0, 0]
        dept_time_t = date_handle(dept_time_c)
        dest_time_c = str(flight.dest_time).replace('T', ',').replace(
            '-', ',').replace(':', ',').split(',') + [0, 0, 0]
        dest_time_t = date_handle(dest_time_c)
        flight.dur = int(time.mktime(dest_time_t)) - int(time.mktime(dept_time_t))

        flight.price = price
        flight.dept_day = flight.dept_time[:10]
        flight.currency = 'EUR'
        flight.seat_type = seat_type_list[i % fares_per_flight]
        flight.source = 'vueling:vueling'
        flight.stop = stops_list[j]

        if flight.flight_no == flight_no and flight.dept_time == req_dept_time:
            return flight.price

    return result
def parser(content, year='2014'):
    """Parse a wego result page into ticket tuples.

    The page is split into flight sections with ``section_pat``; every
    section yields one tuple per ticket offer found by ``tickets_info_pat``:

        (flight_no, plane_no, airline, dept_id, dest_id, dept_day,
         dept_time, dest_time, dur, price, tax, surcharge, currency,
         seat_type, source, return_rule, stop)

    Args:
        content: page text to parse.
        year: year prepended to the page's day/month timestamps, which carry
            no year themselves.  Defaults to ``'2014'``, the value that used
            to be hard-coded.

    Returns:
        List of ticket tuples.  ``plane_no`` and ``return_rule`` are
        ``'NULL'``, ``tax`` and ``surcharge`` are ``-1.0``; ``dur`` is in
        seconds, or an empty list when the section has no duration.

    Raises:
        IndexError: a mandatory field is missing from a section.
        ValueError: a timestamp does not match ``%Y %d %b %I:%M %p``.
    """
    stamp_format = '%Y %d %b %I:%M %p'
    flights = []

    for temp in section_pat.findall(content):
        # Flight number: ':'-separated segments, each cut at its first '-'.
        # Segments are joined with '_'.  (The single-segment case used to
        # lose its first character to a slice meant to strip a leading '_'.)
        flight_no = '_'.join(
            part[:part.find('-')]
            for part in flight_no_pat.findall(temp)[0].split(':'))

        airline_name = airline_name_pat.findall(temp)[0]
        # First departure code and last arrival code span the whole trip.
        departure_code = departure_code_pat.findall(temp)[0]
        arrival_code = arrival_code_pat.findall(temp)[-1]

        # The first four characters of each raw timestamp are skipped and
        # commas removed before parsing.
        dep_raw = departure_time_pat.findall(temp)[0]
        dep_text = str(year) + ' ' + dep_raw[4:].replace(',', '')
        departure_time = str(
            datetime.strptime(dep_text, stamp_format)).replace(' ', 'T')

        arr_raw = arrival_time_pat.findall(temp)[-1]
        arr_text = str(year) + ' ' + arr_raw[4:].replace(',', '')
        arrival_time = str(
            datetime.strptime(arr_text, stamp_format)).replace(' ', 'T')

        # Duration in seconds; the last duration found in the section wins.
        flight_dur = []
        for each_time in flight_duration_pat.findall(temp):
            day_num = day_pat.findall(each_time)
            hour_num = hour_pat.findall(each_time)
            min_num = min_pat.findall(each_time)
            days = int(day_num[0]) if day_num else 0
            hours = int(hour_num[0]) if hour_num else 0
            minutes = int(min_num[0]) if min_num else 0
            flight_dur = days * 86400 + hours * 3600 + minutes * 60

        # Number of matches of stops_pat inside the joined flight number.
        stops = len(stops_pat.findall(flight_no))

        for each_ticket in tickets_info_pat.findall(temp):
            price_text = tickets_price_pat.findall(each_ticket)[0]
            ticket_price = price_text[price_text.find('>') + 1:].replace(',', '')

            # Source site: last path component without extension, '-' -> '_',
            # cut at the first remaining '.'.
            ticket_web = tickets_web_pat.findall(each_ticket)[0]
            site = ticket_web[ticket_web.rfind('/') + 1:ticket_web.rfind('.')]
            site = site.replace('-', '_')
            dot_at = site.find('.')
            if dot_at > 0:
                site = site[:dot_at]

            flight = Flight()
            flight.flight_no = flight_no
            flight.plane_no = 'NULL'
            flight.airline = airline_name
            flight.dept_id = departure_code
            flight.dest_id = arrival_code
            flight.dept_time = departure_time
            flight.dest_time = arrival_time
            flight.dur = flight_dur
            flight.price = ticket_price
            flight.tax = -1.0
            flight.surcharge = -1.0
            flight.currency = "CNY"
            flight.seat_type = '经济舱'
            flight.source = 'wego::' + site
            flight.return_rule = 'NULL'
            flight.stop = stops
            if 'T' in flight.dept_time:
                flight.dept_day = flight.dept_time.split('T')[0]

            flights.append((
                flight.flight_no, flight.plane_no, flight.airline,
                flight.dept_id, flight.dest_id, flight.dept_day,
                flight.dept_time, flight.dest_time, flight.dur,
                flight.price, flight.tax, flight.surcharge,
                flight.currency, flight.seat_type, flight.source,
                flight.return_rule, flight.stop,
            ))

    return flights
def _fly_time_seconds(fly_time):
    """Convert one segment's ``FlyTime`` label into seconds.

    The label is UTF-8 encoded and split at the hour marker ('小时' when '小'
    is present, otherwise '时') and at '分'.  Missing hour or minute parts
    count as zero; non-numeric parts raise ValueError.
    """
    text = fly_time.encode('utf8')
    # The widths are the byte lengths of the UTF-8 encoded markers.
    if text.find('小') >= 0:
        marker, width = '小时', 6
    else:
        marker, width = '时', 3
    marker_at = text.find(marker)
    hours = text[:marker_at].strip()
    minutes = text[marker_at + width:text.find('分')].strip()

    seconds = 0
    if hours != '':
        seconds += int(hours) * 3600
    if minutes != '':
        seconds += int(minutes) * 60
    return seconds


def ParsePage(content):
    """Parse a tongcheng JSON result page into ticket tuples.

    Every entry of ``content_json['OriginDestinationOption']`` yields one
    tuple:

        (flight_no, plane_no, airline, dept_id, dest_id, dept_day,
         dept_time, dest_time, dur, price, tax, surcharge, currency,
         seat_type, source, return_rule, stop)

    ``dur`` is the sum of the segments' fly times plus the layovers between
    consecutive segments, in seconds.  ``surcharge`` and ``return_rule`` are
    ``Flight()`` defaults.

    Returns an empty list (and logs an error) when ``content`` is 100
    characters or shorter or has no ``OriginDestinationOption`` member.
    Entries that fail to parse are logged and skipped.
    """
    flights = []
    if content == '' or len(content) <= 100:
        logger.error('tongchengFlight: Crawl this page failed!')
        return flights

    content_json = json.loads(content)
    if 'OriginDestinationOption' not in content_json:
        logger.error('tongchengFlight: Crawl this page failed!')
        return flights

    for each_flight_json in content_json['OriginDestinationOption']:
        try:
            flight = Flight()
            segments = each_flight_json['FlightSegment']
            flight_nums = len(segments)

            flight.flight_no = each_flight_json['FlightNos'].replace('-', '_')
            flight.dept_id = each_flight_json['AirPorts'][:3]
            flight.dest_id = each_flight_json['AirPorts'][-3:]

            # [6:-2] strips a fixed-width wrapper around the numeric stamp.
            # presumably '/Date(<ms>)/' -- TODO confirm against real payloads
            dept_time_tamp = segments[0]['DepartureDate'][6:-2]
            dest_time_tamp = segments[-1]['ArrivalDate'][6:-2]

            # Time in the air, summed over all segments ...
            flight.dur = 0
            for segment in segments:
                flight.dur += _fly_time_seconds(segment['FlyTime'])
            # ... plus the wait between one arrival and the next departure
            # (stamp difference divided by 1000).
            for i in range(1, flight_nums):
                next_departure = segments[i]['DepartureDate'][6:-2]
                previous_arrival = segments[i - 1]['ArrivalDate'][6:-2]
                flight.dur += (int(next_departure) - int(previous_arrival)) // 1000

            # The last three digits of each stamp are dropped before it is
            # used as a local-time epoch value.
            flight.dept_time = time.strftime(
                '%Y-%m-%dT%H:%M:%S',
                time.localtime(float(str(dept_time_tamp)[:-3])))
            flight.dest_time = time.strftime(
                '%Y-%m-%dT%H:%M:%S',
                time.localtime(float(str(dest_time_tamp)[:-3])))
            flight.dept_day = flight.dept_time.split('T')[0]

            flight.source = 'tongcheng::tongcheng'
            flight.stop = int(flight_nums) - 1
            flight.currency = 'CNY'
            flight.price = each_flight_json['FareInfo'][0]['TCPrice_Audlt']
            flight.tax = each_flight_json['FareInfo'][0]['TaxPrice_Audlt']

            flight.plane_no = '_'.join(
                segment['Equipment'] for segment in segments)
            flight.airline = '_'.join(
                segment['AirCompanyName'] for segment in segments)
            flight.seat_type = '经济舱'

            flights.append((
                flight.flight_no, flight.plane_no, flight.airline,
                flight.dept_id, flight.dest_id, flight.dept_day,
                flight.dept_time, flight.dest_time, flight.dur,
                flight.price, flight.tax, flight.surcharge,
                flight.currency, flight.seat_type, flight.source,
                flight.return_rule, flight.stop,
            ))
        except Exception as e:
            logger.info('tongchengFlight: Parse this flight failed!' + str(e))
            continue

    return flights
def parsePage(content, dept_year):
    """Parse a feifan result page into ticket tuples.

    The page is split into itineraries with ``each_flight_content_pat`` and
    each itinerary into flight parts with ``each_part_flight_pat``.  Every
    itinerary that parses cleanly yields one tuple:

        (flight_no, plane_no, airline, dept_id, dest_id, dept_day,
         dept_time, dest_time, dur, price, tax, surcharge, currency,
         seat_type, source, return_rule, stop)

    Args:
        content: page text to parse.
        dept_year: year string prepended to the page's month/day values.

    Returns:
        List of ticket tuples; itineraries that fail to parse are skipped.
        ``surcharge`` is the ``Flight()`` default.
    """
    stamp_format = '%Y-%m-%dT%H:%M:%S'
    flights = []
    for each_flight_text in each_flight_content_pat.findall(content):
        flight = Flight()
        try:
            t_price = all_price_pat.findall(each_flight_text)[0]
            each_flight_text_temp = each_flight_content_temp_pat.findall(
                each_flight_text)[0]
            each_part_flight = each_part_flight_pat.findall(
                each_flight_text_temp)
            if not each_part_flight:
                continue

            # NOTE(review): one-second pause per itinerary kept from the
            # original; it does no parsing work -- confirm it is still needed.
            time.sleep(1)

            first_part = each_part_flight[0]
            last_part = each_part_flight[-1]
            # [1:-1] drops the first and last character of the matched airport.
            flight.dept_id = airport_pat.findall(first_part)[0][1:-1]
            flight.dest_id = airport_pat.findall(last_part)[-1][1:-1]

            dept_time_temp = dept_time_temp_pat.findall(first_part)[0]
            dest_time_temp = dest_time_temp_pat.findall(last_part)[-1]
            month = dept_time_temp[0].strip()
            flight.dept_day = dept_year + '-' + month + '-' + dept_time_temp[1].strip()
            flight.dept_time = flight.dept_day + 'T' + dept_time_temp[2].strip() + ':00'
            # NOTE(review): the arrival date reuses the departure month --
            # confirm this is right for arrivals in the following month.
            flight.dest_time = (dept_year + '-' + month + '-' +
                                dest_time_temp[0].strip() + 'T' +
                                dest_time_temp[1].strip()[-5:] + ':00')

            dept_epoch = int(time.mktime(datetime.datetime.strptime(
                flight.dept_time, stamp_format).timetuple()))
            dest_epoch = int(time.mktime(datetime.datetime.strptime(
                flight.dest_time, stamp_format).timetuple()))
            # NOTE(review): the extra 3600 s looks like a fixed one-hour
            # offset -- confirm its purpose.
            flight.dur = dest_epoch - dept_epoch + 3600
            flight.stop = len(each_part_flight) - 1

            flight.price = price_pat.findall(each_flight_text)[0]
            if len(flight.price) > 1:
                flight.price = int(flight.price[0])
            else:
                flight.price = int(t_price)

            # Tax is the difference between the total and the base price.
            try:
                flight.tax = int(t_price) - flight.price
            except (TypeError, ValueError):
                flight.tax = -1.0
                logger.info('feifanFlight: Can not parse tax info!')

            flight_nos = []
            plane_nos = []
            airlines = []
            for each_flight_text_t in each_part_flight:
                flight_nos.append(
                    flight_no_pat.findall(each_flight_text_t)[0][:8].replace(' ', ''))
                plane_nos.append(
                    plane_no_pat.findall(each_flight_text_t)[0].replace(' ', ''))
                airlines.append(
                    airline_pat.findall(each_flight_text_t)[0].replace(' ', ''))
            flight.flight_no = '_'.join(flight_nos)
            flight.plane_no = '_'.join(plane_nos)
            flight.airline = '_'.join(airlines)

            flight.return_rule = return_rule_pat.findall(each_flight_text)[0].replace('<p>','').replace('\n','') \
                    .replace('。','').replace('</p>','。').strip().replace(' ','')
            flight.currency = 'CNY'
            flight.source = 'feifan::feifan'
            flight.seat_type = '经济舱'

            flights.append((
                flight.flight_no, flight.plane_no, flight.airline,
                flight.dept_id, flight.dest_id, flight.dept_day,
                flight.dept_time, flight.dest_time, flight.dur,
                flight.price, flight.tax, flight.surcharge,
                flight.currency, flight.seat_type, flight.source,
                flight.return_rule, flight.stop,
            ))
        except Exception:
            # Best effort: an itinerary that cannot be parsed is ignored.
            continue
    return flights
# Flight information: fold every segment of node["FlightDetail"] into one
# Flight record.  Expects ``node``, ``dept_time`` and ``dest_time`` to be
# defined by the surrounding code.
flight = Flight()
flight.flight_no = ''
flight.plane_no = ''
flight.airline = ''
dept_id_list = []
for flightNode in node["FlightDetail"]:
    flight.flight_no += flightNode["FlightNo"] + '_'
    flight.airline += flightNode["AirlineName"] + '_'
    flight.plane_no += flightNode["CraftType"] + '_'
    dept_id_list.append(flightNode["DPort"])
    # The last segment's arrival port ends up as the destination.
    flight.dest_id = flightNode["APort"]

# First segment's departure port is the origin (IndexError if no segments).
flight.dept_id = dept_id_list[0]
# Only flight_no has its trailing '_' trimmed here; airline and plane_no
# keep theirs.
flight.flight_no = flight.flight_no[:-1]

flight.dept_time = dept_time
flight.dest_time = dest_time
flight.dept_day = flight.dept_time.split('T')[0]
flight.price = int(node["Price"])
def _first_match(pattern, text):
    """Return the first capture of ``pattern`` (matched with re.S) in ``text``.

    Raises IndexError when the pattern does not match.
    """
    return re.compile(pattern, re.S).findall(text)[0]


def directFlight_parser(flightstring, date, airports_dict):
    """Parse the HTML of one direct elong flight into a ticket tuple.

    Args:
        flightstring: HTML of one flight row, containing ``<td>`` cells with
            classes ``cols01`` .. ``cols06``.
        date: departure date as ``YYYY-MM-DD``.
        airports_dict: maps airport names found on the page to airport ids;
            names without an entry are used unchanged.

    Returns:
        (flight_no, plane_no, airline, dept_id, dest_id, dept_day,
         dept_time, dest_time, dur, price, tax, surcharge, currency,
         seat_type, source, return_rule, stop)

    Raises:
        IndexError: an expected cell or field is missing.
        ValueError: ``date``, the price or the tax is not numeric.
    """
    flight = Flight()

    # A direct flight yields exactly one match per cell.
    cols01 = _first_match(r'<td class="cols01">(.*?)</td>', flightstring)
    cols02 = _first_match(r'<td class="cols02">(.*?)</td>', flightstring)
    cols03 = _first_match(r'<td class="cols03">(.*?)</td>', flightstring)
    cols04 = _first_match(r'<td class="cols04">(.*?)</td>', flightstring)
    # cols05/cols06 are not used, but are still required so that rows
    # missing them keep raising IndexError as before.
    _first_match(r'<td class="cols05">(.*?)</td>', flightstring)
    _first_match(r'<td class="cols06">(.*?)</td>', flightstring)

    aircorp = _first_match(r'</span>(.*?)<br />', cols01).strip()
    flight_no = _first_match(r'<br />(.*?) ', cols01).strip()
    plane_type = _first_match(r'method="PlaneType" >(.*?)</a>', cols01).strip()

    dept_airport = _first_match(r'</span>(.*?)<br />', cols02).strip()
    dept_time = _first_match(
        r'<span class=" t14 bold black">(.*?)</span>', cols02).strip()
    arr_time_airport = _first_match(r'<br />(.*?)$', cols02).strip()

    # The arrival text starts with the time and ends with the airport; a
    # '+N天' marker means the flight lands N days after departure.
    arr_tokens = arr_time_airport.split(' ')
    arr_time = arr_tokens[0].strip()
    arr_airport = arr_tokens[-1].strip()
    days = 0
    day_offset = re.search(r'\+(\d+)天', arr_time_airport)
    if day_offset:
        days = int(day_offset.group(1))
        # Drop the parenthesised marker that trails the time.
        arr_time = arr_time.split('(')[0].strip()

    during_time = _first_match(r'(.*?)<br />', cols03).strip()
    during = timeshifter([dept_time, arr_time, during_time])

    dept_date = datetime.datetime(int(date[0:4]), int(date[5:7]), int(date[8:]))
    dest_date = dept_date + datetime.timedelta(days)
    dept_daytime = date + 'T' + dept_time + ':00'
    dest_daytime = str(dest_date).split(' ')[0] + 'T' + arr_time + ':00'

    price = _first_match(r'</span>(.*?)</span>', cols04).strip()
    tax = _first_match(r'参考税 ¥(.*?)<div class', cols04).strip()

    flight.flight_no = flight_no
    flight.plane_no = plane_type
    flight.airline = aircorp
    # Fall back to the raw airport text when no id is known for it.
    flight.dept_id = airports_dict.get(dept_airport, dept_airport)
    flight.dest_id = airports_dict.get(arr_airport, arr_airport)
    flight.dept_day = date
    flight.dept_time = dept_daytime
    flight.dest_time = dest_daytime
    flight.dur = during
    flight.price = float(price)
    flight.tax = float(tax)
    flight.surcharge = -1.0
    flight.currency = 'CNY'
    flight.seat_type = '经济舱'
    flight.source = 'elong::elong'
    flight.return_rule = 'NULL'
    flight.stop = 0

    return (
        flight.flight_no, flight.plane_no, flight.airline, flight.dept_id,
        flight.dest_id, flight.dept_day, flight.dept_time, flight.dest_time,
        flight.dur, flight.price, flight.tax, flight.surcharge,
        flight.currency, flight.seat_type, flight.source, flight.return_rule,
        flight.stop,
    )
def parsePage(content, dept_year):
    """Parse a feifan result page into a list of flight tuples.

    Args:
        content: Page text; split into per-itinerary chunks by
            ``each_flight_content_pat``.
        dept_year: Year string prepended to the month/day text found in the
            page when building departure and arrival timestamps.

    Returns:
        A list of 17-item tuples (flight_no, plane_no, airline, dept_id,
        dest_id, dept_day, dept_time, dest_time, dur, price, tax, surcharge,
        currency, seat_type, source, return_rule, stop).  Itineraries that
        fail to parse are skipped; the list is empty when nothing parses.
    """
    flights = []
    for each_flight_text in each_flight_content_pat.findall(content):
        flight = Flight()
        try:
            t_price = all_price_pat.findall(each_flight_text)[0]
            each_flight_text_temp = each_flight_content_temp_pat.findall(
                each_flight_text)[0]
            each_part_flight = each_part_flight_pat.findall(
                each_flight_text_temp)
            if len(each_part_flight) < 1:
                continue

            # NOTE(review): one-second pause per itinerary, kept from the
            # original; its purpose is not visible here -- confirm it is
            # still wanted.
            time.sleep(1)

            # The itinerary starts at the first leg and ends at the last.
            flight.dept_id = airport_pat.findall(
                each_part_flight[0])[0][1:-1]
            flight.dest_id = airport_pat.findall(
                each_part_flight[-1])[-1][1:-1]
            dept_time_temp = dept_time_temp_pat.findall(
                each_part_flight[0])[0]
            dest_time_temp = dest_time_temp_pat.findall(
                each_part_flight[-1])[-1]

            flight.dept_day = (dept_year + '-' + dept_time_temp[0].strip() +
                               '-' + dept_time_temp[1].strip())
            flight.dept_time = (flight.dept_day + 'T' +
                                dept_time_temp[2].strip() + ':00')
            # NOTE(review): the arrival timestamp reuses the departure's
            # year and first field (dept_time_temp[0]) -- confirm this is
            # right for itineraries that cross a month or year boundary.
            flight.dest_time = (dept_year + '-' + dept_time_temp[0].strip() +
                                '-' + dest_time_temp[0].strip() + 'T' +
                                dest_time_temp[1].strip()[-5:] + ':00')

            dept_time = int(time.mktime(datetime.datetime.strptime(
                flight.dept_time, '%Y-%m-%dT%H:%M:%S').timetuple()))
            dest_time = int(time.mktime(datetime.datetime.strptime(
                flight.dest_time, '%Y-%m-%dT%H:%M:%S').timetuple()))
            # NOTE(review): the extra 3600 seconds is kept from the
            # original; its reason is not visible here.
            flight.dur = dest_time - dept_time + 3600
            flight.stop = len(each_part_flight) - 1

            flight.price = price_pat.findall(each_flight_text)[0]
            if len(flight.price) > 1:
                flight.price = int(flight.price[0])
            else:
                flight.price = int(t_price)

            try:
                flight.tax = int(t_price) - flight.price
            except (TypeError, ValueError):
                flight.tax = -1.0
                logger.info('feifanFlight: Can not parse tax info!')

            # Per-leg values are joined with '_' in leg order.
            flight_nos = []
            plane_nos = []
            airlines = []
            for each_flight_text_t in each_part_flight:
                flight_nos.append(flight_no_pat.findall(
                    each_flight_text_t)[0][:8].replace(' ', ''))
                plane_nos.append(plane_no_pat.findall(
                    each_flight_text_t)[0].replace(' ', ''))
                airlines.append(airline_pat.findall(
                    each_flight_text_t)[0].replace(' ', ''))
            flight.flight_no = '_'.join(flight_nos)
            flight.plane_no = '_'.join(plane_nos)
            flight.airline = '_'.join(airlines)

            flight.return_rule = return_rule_pat.findall(each_flight_text)[0].replace('<p>','').replace('\n','') \
                .replace('。','').replace('</p>','。').strip().replace(' ','')
            flight.currency = 'CNY'
            flight.source = 'feifan::feifan'
            flight.seat_type = '经济舱'

            flights.append((
                flight.flight_no, flight.plane_no, flight.airline,
                flight.dept_id, flight.dest_id, flight.dept_day,
                flight.dept_time, flight.dest_time, flight.dur, flight.price,
                flight.tax, flight.surcharge, flight.currency,
                flight.seat_type, flight.source, flight.return_rule,
                flight.stop))
        except Exception:
            # Best effort: an itinerary that cannot be parsed is skipped
            # without logging, as in the original.
            continue
    # The original fell off the end here and returned None.
    return flights
def elong_page_parser(htmlcontent):
    """Parse an elong result page into ticket tuples and per-leg records.

    Args:
        htmlcontent: Page text.  The flight JSON is taken from the first
            match of ``flightsPattern`` and read via its ``FlightLegList``.

    Returns:
        A ``(tickets, flights)`` pair.  ``tickets`` is a list of 17-item
        tuples (flight_no, plane_no, airline, dept_id, dest_id, dept_day,
        dept_time, dest_time, dur, price, tax, surcharge, currency,
        seat_type, source, return_rule, stop), one per itinerary.
        ``flights`` maps ``'<fltno>_<dport>_<aport>'`` to an 8-item tuple
        (flight_no, airline, plane_no, dept_id, dest_id, dept_time,
        dest_time, dur), one per leg.  Both are empty when the page is the
        "page not found" error page or when any parsing error occurs.
    """
    tickets = []
    flights = {}
    if htmlcontent.find('您访问的页面不存在或暂时无法访问') != -1:
        return tickets, flights

    try:
        flights_json = flightsPattern.findall(htmlcontent)[0]
        allflights = json.loads(flights_json)['FlightLegList']
        for flightInfo in allflights:
            segs = flightInfo['segs']
            flight = Flight()
            flight.currency = 'CNY'
            flight.seat_type = '经济舱'
            flight.stop = len(segs) - 1
            flight.price = int(flightInfo['cabs'][0]['oprice'])
            flight.tax = int(flightInfo['tax'])
            flight.source = 'elong::elong'
            flight.dur = 0

            airlines = []
            plane_nos = []
            flight_nos = []
            for singleflightInfo in segs:
                eachFlight = EachFlight()
                eachFlight.flight_no = singleflightInfo['fltno']
                eachFlight.plane_no = singleflightInfo['plane']
                eachFlight.airline = singleflightInfo['corpn']
                eachFlight.dept_id = singleflightInfo['dport']
                eachFlight.dest_id = singleflightInfo['aport']
                # Original comment: convert to 2014-07-11T12:06:00
                eachFlight.dept_time = time_shifter(singleflightInfo['dtime'])
                eachFlight.dest_time = time_shifter(singleflightInfo['atime'])
                # 'ftime' is multiplied by 60 to get seconds.
                eachFlight.dur = int(singleflightInfo['ftime']) * 60
                eachFlight.flight_key = (eachFlight.flight_no + '_' +
                                         eachFlight.dept_id + '_' +
                                         eachFlight.dest_id)
                flights[eachFlight.flight_key] = (
                    eachFlight.flight_no, eachFlight.airline,
                    eachFlight.plane_no, eachFlight.dept_id,
                    eachFlight.dest_id, eachFlight.dept_time,
                    eachFlight.dest_time, eachFlight.dur)

                airlines.append(eachFlight.airline)
                plane_nos.append(eachFlight.plane_no)
                flight_nos.append(eachFlight.flight_no)
                flight.dur += eachFlight.dur

            # Add the waiting time between each pair of consecutive legs.
            for prev_seg, next_seg in zip(segs, segs[1:]):
                flight.dur += cal_wait_time(time_shifter(prev_seg['atime']),
                                            time_shifter(next_seg['dtime']))

            flight.flight_no = '_'.join(flight_nos)
            flight.plane_no = '_'.join(plane_nos)
            flight.airline = '_'.join(airlines)
            flight.dept_id = segs[0]['dport']
            flight.dest_id = segs[-1]['aport']
            flight.dept_time = time_shifter(segs[0]['dtime'])
            flight.dest_time = time_shifter(segs[-1]['atime'])
            flight.dept_day = flight.dept_time.split('T')[0]

            tickets.append((
                flight.flight_no, flight.plane_no, flight.airline,
                flight.dept_id, flight.dest_id, flight.dept_day,
                flight.dept_time, flight.dest_time, flight.dur, flight.price,
                flight.tax, flight.surcharge, flight.currency,
                flight.seat_type, flight.source, flight.return_rule,
                flight.stop))
    except Exception as e:
        # Any failure discards partial results, as in the original.
        logger.info(str(e))
        return [], {}

    # The original fell off the end here and returned None on success.
    return tickets, flights
def parse_page(content, price_dict):
    """Parse a jijitong JSON response into ticket tuples and per-leg records.

    Args:
        content: JSON text with a ``Status`` field and, when the status is
            ``'SUCCESS'``, a ``datalist`` of itineraries.
        price_dict: Mapping from an itinerary's ``Key`` (used as the flight
            number) to its price.

    Returns:
        A dict ``{'ticket': tickets, 'flight': flights}``.  ``tickets`` is a
        list of 17-item tuples (flight_no, plane_no, airline, dept_id,
        dest_id, dept_day, dept_time, dest_time, dur, price, tax, surcharge,
        currency, seat_type, source, return_rule, stop).  ``flights`` maps
        ``'<flight_no>_<dept_id>_<dest_id>'`` to an 8-item per-leg tuple.
        Both are empty when the content is not valid JSON or the status is
        not ``'SUCCESS'``.
    """
    flights = {}
    tickets = []
    result = {'ticket': tickets, 'flight': flights}

    try:
        json_temp = json.loads(content)
    except (TypeError, ValueError):
        return result

    if json_temp['Status'] == 'SUCCESS':
        for each_flight_json in json_temp['datalist']:
            flight = Flight()
            try:
                legs = each_flight_json['ODO']
                flight.flight_no = each_flight_json['Key']
                flight.stop = int(each_flight_json['OW'])
                # Original comment: "error price".
                flight.price = price_dict[flight.flight_no]
                flight.tax = each_flight_json['AIP'][0]['TX']
                flight.dept_id = legs[0]['OL']
                flight.dest_id = legs[-1]['DL']
                flight.dept_time = legs[0]['DD'] + ':00'
                flight.dest_time = legs[-1]['AD'] + ':00'
                flight.currency = 'CNY'
                flight.source = 'jijitong::jijitong'
                flight.seat_type = '经济舱'
                flight.dept_day = flight.dept_time.split('T')[0]

                # Duration = flying time of the first flight_num legs plus
                # the waits between consecutive legs.  For one leg this is
                # just that leg's 'ET' * 60.
                flight_num = len(flight.flight_no.split('_'))
                fly_secs = 0
                for dept_content in legs[:flight_num]:
                    fly_secs += int(dept_content['ET']) * 60
                wait_secs = 0
                for x in range(1, flight_num):
                    wait_secs += durCal(legs[x - 1]['AD'], legs[x]['DD'])
                flight.dur = fly_secs + wait_secs

                plane_no = ''
                airline = ''
                for each_json_temp in legs:
                    plane_no += each_json_temp['EQ'] + '_'
                    airline += each_json_temp['COA'] + '_'
                    try:
                        eachflight = EachFlight()
                        eachflight.flight_no = each_json_temp['MA']
                        eachflight.dept_id = each_json_temp['OL']
                        eachflight.dest_id = each_json_temp['DL']
                        eachflight.airline = each_json_temp['COA']
                        eachflight.plane_no = each_json_temp['EQ']
                        eachflight.dept_time = each_json_temp['DD'] + ':00'
                        eachflight.dest_time = each_json_temp['AD'] + ':00'
                        eachflight.dur = int(each_json_temp['ET']) * 60
                        eachflight.flight_key = (eachflight.flight_no + '_' +
                                                 eachflight.dept_id + '_' +
                                                 eachflight.dest_id)
                        flights[eachflight.flight_key] = (
                            eachflight.flight_no, eachflight.airline,
                            eachflight.plane_no, eachflight.dept_id,
                            eachflight.dest_id, eachflight.dept_time,
                            eachflight.dest_time, eachflight.dur)
                    except Exception as e:
                        # A bad leg record is reported and skipped; the
                        # itinerary's ticket tuple is still produced.
                        print(str(e))
                        continue
                flight.plane_no = plane_no[:-1]
                flight.airline = airline[:-1]

                tickets.append((
                    flight.flight_no, flight.plane_no, flight.airline,
                    flight.dept_id, flight.dest_id, flight.dept_day,
                    flight.dept_time, flight.dest_time, flight.dur,
                    flight.price, flight.tax, flight.surcharge,
                    flight.currency, flight.seat_type, flight.source,
                    flight.return_rule, flight.stop))
            except Exception as e:
                logger.error('Can not parse flight info!' + str(e))
                continue

    # The original fell off the end here and returned None.
    return result
result['flight'] = flights result['ticket'] = tickets return result for each_flight_json in flight_json[1]: flight = Flight() try: flight.price = int(each_flight_json[4][0][2]) + 1 flight.tax = int(each_flight_json[4][0][3]) + 1 flight_info_list = each_flight_json[5] flight.dur = int(flight_info_list[0][5]) * 60 flight.dept_id = flight_info_list[0][1] flight.dest_id = flight_info_list[0][3] dept_day_temp = flight_info_list[0][4] flight.dept_day = day_calculator(dept_day_temp) each_flight_list = flight_info_list[0][7] dept_time_mins = int(each_flight_list[0][5]) flight.dept_time = time_calculator(flight.dept_day, dept_time_mins) dest_time_day = each_flight_list[-1][6] dest_time_mins = each_flight_list[-1][7] dest_time_day = day_calculator(dest_time_day) flight.dest_time = time_calculator(dest_time_day, dest_time_mins) flight_no = ''
def parser(content, year='2014'):
    """Parse a wego result page into one flight tuple per ticket offer.

    Args:
        content: Page text; split into per-itinerary sections by
            ``section_pat``.
        year: Year prepended to the departure/arrival text before it is
            parsed with ``'%Y %d %b %I:%M %p'``.  Defaults to ``'2014'``,
            the value that used to be hard-coded.

    Returns:
        A list of 17-item tuples (flight_no, plane_no, airline, dept_id,
        dest_id, dept_day, dept_time, dest_time, dur, price, tax, surcharge,
        currency, seat_type, source, return_rule, stop).  Each itinerary
        contributes one tuple per ticket offer found in its section; the
        price is the offer's price text with commas removed.

    Raises:
        IndexError: If a section lacks one of the expected fields.
        ValueError: If the time text does not match the expected format.
    """
    flights = []
    for temp in section_pat.findall(content):
        # Flight numbers: the matched text is colon-separated, one entry per
        # leg, and the part before the first '-' of each entry is kept.  The
        # original stripped the first character of the joined string, which
        # removed the leading '_' for multi-leg itineraries but truncated
        # the flight number of single-leg ones.
        legs = flight_no_pat.findall(temp)[0].split(':')
        flight_no = '_'.join(leg[:leg.find('-')] for leg in legs)

        airline_name = airline_name_pat.findall(temp)[0]
        dept_id = departure_code_pat.findall(temp)[0]
        dest_id = arrival_code_pat.findall(temp)[-1]

        # The first four characters of the time text are dropped, then
        # commas are removed.
        dep_text = (str(year) + ' ' +
                    departure_time_pat.findall(temp)[0][4:].replace(',', ''))
        dept_time = str(datetime.strptime(
            dep_text, '%Y %d %b %I:%M %p')).replace(' ', 'T')
        arr_text = (str(year) + ' ' +
                    arrival_time_pat.findall(temp)[-1][4:].replace(',', ''))
        dest_time = str(datetime.strptime(
            arr_text, '%Y %d %b %I:%M %p')).replace(' ', 'T')

        # Duration in seconds.  The last duration match wins; the value
        # stays [] when the section has none, as in the original.
        flight_dur = []
        for each_time in flight_duration_pat.findall(temp):
            day_num = day_pat.findall(each_time)
            hour_num = hour_pat.findall(each_time)
            min_num = min_pat.findall(each_time)
            day_count = int(day_num[0]) if day_num else 0
            hour_count = int(hour_num[0]) if hour_num else 0
            min_count = int(min_num[0]) if min_num else 0
            flight_dur = (day_count * 86400 + hour_count * 3600 +
                          min_count * 60)

        # One (price, source) offer per ticket block in the section.
        offers = []
        for each_ticket in tickets_info_pat.findall(temp):
            price_text = tickets_price_pat.findall(each_ticket)[0]
            ticket_price = price_text[price_text.find('>') + 1:].replace(
                ',', '')

            # The source name is the text between the last '/' and the last
            # '.', with '-' replaced by '_' and cut at any remaining '.'.
            ticket_web = tickets_web_pat.findall(each_ticket)[0]
            web_name = ticket_web[ticket_web.rfind('/') + 1:
                                  ticket_web.rfind('.')].replace('-', '_')
            dot = web_name.find('.')
            if dot > 0:
                web_name = web_name[:dot]

            # The link itself is unused, but an offer without one still
            # raises IndexError as before.
            tickets_links_pat.findall(each_ticket)[0]

            offers.append((ticket_price, 'wego::' + web_name))

        stops = len(stops_pat.findall(flight_no))

        for ticket_price, ticket_source in offers:
            flight = Flight()
            flight.flight_no = flight_no
            flight.plane_no = 'NULL'
            flight.airline = airline_name
            flight.dept_id = dept_id
            flight.dest_id = dest_id
            flight.dept_time = dept_time
            flight.dest_time = dest_time
            flight.dur = flight_dur
            flight.price = ticket_price
            flight.tax = -1.0
            flight.surcharge = -1.0
            flight.currency = "CNY"
            flight.seat_type = '经济舱'
            flight.source = ticket_source
            flight.return_rule = 'NULL'
            flight.stop = stops
            if 'T' in flight.dept_time:
                flight.dept_day = flight.dept_time.split('T')[0]

            flights.append((
                flight.flight_no, flight.plane_no, flight.airline,
                flight.dept_id, flight.dest_id, flight.dept_day,
                flight.dept_time, flight.dest_time, flight.dur, flight.price,
                flight.tax, flight.surcharge, flight.currency,
                flight.seat_type, flight.source, flight.return_rule,
                flight.stop))
    return flights
flight = Flight() flight.currency = flight_info['coinType'] flight.price = int(flight_info['totalFare']) flight.tax = int(flight_info['totalTax']) flight.seat_type = '经济舱' flight.stop = int(flight_info['transfer']) if flight.stop > 1: print 'found a flight whose transfer_times > 1' continue flight.source = 'lcair::lcair' flight.dept_id, flight.dest_id = flight_info['routeStr'].split( '-')[0], flight_info['routeStr'].split('-')[-1] flight.dept_day = flight_info['fromDate'] flight.flight_no = '' flight.airline = '' flight.plane_no = '' flight_dur = 0 #direct if flight.stop == 0: for single_flight in segments[0]['flights']: flight.flight_no = single_flight['flightNumber'] try: flight.airline = Airline[single_flight['airCo']]
currency = GetCurrency(page) allinfo = [] data = jsonlib.read(data) for k, v in data.items(): for one_day_flights in v: for one_day_flight in one_day_flights[1]: flight = Flight() flight.dept_day = one_day_flights[0] strs = one_day_flight[1].split("~") if len(strs) != 9: continue flight.flight_no = strs[0].strip() + strs[1].strip() flight.dept_id = strs[4].strip() flight.dest_id = strs[6].strip() flight.airline = "ryanair" flight.source = "ryanair::ryanair" dept_time = datetime.datetime.strptime(strs[5], '%m/%d/%Y %H:%M') dest_time = datetime.datetime.strptime(strs[7], '%m/%d/%Y %H:%M') flight.dept_time = str(dept_time).replace(' ', 'T') flight.dest_time = str(dest_time).replace(' ', 'T') flight.stop = 0 days = (dest_time - dept_time).days dur = (dest_time.hour - dept_time.hour) * 3600 + ( dest_time.minute - dept_time.minute) * 60 + days * 86400
currency = GetCurrency(page) allinfo = [] data = jsonlib.read(data) for k, v in data.items(): for one_day_flights in v: for one_day_flight in one_day_flights[1]: flight = Flight() flight.dept_day = one_day_flights[0] strs = one_day_flight[1].split("~") if len(strs) != 9: continue flight.flight_no = strs[0].strip() + strs[1].strip() flight.dept_id = strs[4].strip() flight.dest_id = strs[6].strip() flight.airline = "ryanair" flight.source = "ryanair::ryanair" dept_time = datetime.datetime.strptime(strs[5], '%m/%d/%Y %H:%M') dest_time = datetime.datetime.strptime(strs[7], '%m/%d/%Y %H:%M') flight.dept_time = str(dept_time).replace(' ','T') flight.dest_time = str(dest_time).replace(' ','T') flight.stop = 0 days = (dest_time - dept_time).days dur = (dest_time.hour - dept_time.hour) * 3600 + (dest_time.minute - dept_time.minute) * 60 + days * 86400 flight.dur = dur flight.price = int(GetPrice(one_day_flight[4]))
flight_infos = all_info['airTicketListResponse']['routings'] for flight_info in flight_infos: flight = Flight() flight_aircorp = '' flight_plane = '' flight_no = '' flight.price = int(float(flight_info['adultSalesPrice']) + 1)#解析出数据是小数,取int加1 flight.tax = int(float(flight_info['adultTax']) + 1) flight.dur = int(flight_info['tripTime']) * 60 segments = flight_info['trips'][0]['segments'] flight.dept_id = segments[0]['departureAirportCode'] flight.dest_id = segments[-1]['arrivalAirportCode'] flight.dept_time = timeshifter(segments[0]['departureTime']) flight.dest_time = timeshifter(segments[-1]['arrivalTime']) flight.dept_day = flight.dept_time.split('T')[0] flight.currency = currency flight.seat_type = '经济舱' flight.stop = len(segments) - 1 flight.source = source for segment in segments: flight_aircorp += segment['airlineName'] + '_' flight_plane += segment['aircraftCode'].split(' ')[-1] + '_' #Airbus A330 -> A330 flight_no += segment['airlineCode'] + segment['flightNumber'] + '_' #拼接航空公司代码和航班代码
def transferFlight_parser(flightstring, date, airports_dict):
    """Parse the HTML of one elong transfer (multi-leg) itinerary.

    Args:
        flightstring: HTML fragment of the itinerary.  The ``cols01``-
            ``cols03`` cells occur once per leg; ``cols04``-``cols06`` occur
            once for the whole itinerary.
        date: Departure date string.  It is sliced as ``date[0:4]``,
            ``date[5:7]`` and ``date[8:]`` for year, month and day, and is
            also used verbatim as the departure day.
        airports_dict: Mapping from the airport text found in the page to an
            airport id.  Text that is not a key is used as the id unchanged.

    Returns:
        The populated ``Flight`` object for itineraries with one or two
        transfers; an empty list for any other number of legs.  Per-leg
        flight numbers, plane types and airlines are joined with ``'_'``.
        The duration is set to 0.

    Raises:
        IndexError: If an expected cell or field is missing from the HTML.
        ValueError: If the date, price or tax text is not numeric.
    """
    flight = Flight()

    def cells(css_class):
        # All cells of the given column class, in page order.
        return re.compile(r'<td class="%s">(.*?)</td>' % css_class,
                          re.S).findall(flightstring)

    def first(pattern, text):
        # First capture of `pattern` in `text`; IndexError when absent.
        return re.compile(pattern, re.S).findall(text)[0]

    cols01 = cells('cols01')
    cols02 = cells('cols02')
    cols03 = cells('cols03')
    cols04 = cells('cols04')[0]
    # cols05/cols06 are not used below, but an itinerary lacking them is
    # still rejected with IndexError exactly as before.
    for required in ('cols05', 'cols06'):
        cells(required)[0]

    flight.stop = len(cols01) - 1
    if flight.stop > 2:
        return []  # for now, itineraries with more than two transfers are dropped

    aircorps = []
    flight_nos = []
    plane_types = []
    airports = []
    timeinfo = []
    days = 0
    last_leg = len(cols01) - 1
    for i, leg_cell in enumerate(cols01):
        aircorp = first(r'</span>(.*?)<br />', leg_cell).strip()
        flight_no = first(r'<br />(.*?) ', leg_cell).strip()
        plane_type = first(r'method="PlaneType" >(.*?)</a>',
                           leg_cell).strip()

        dept_airport = first(r'</span>(.*?)<br />', cols02[i]).strip()
        if dept_airport.find('+2天') != -1:
            days += 2
        elif dept_airport.find('+1天') != -1:
            days += 1

        arr_time_airport = first(r'<br />(.*?)$', cols02[i]).strip()
        dept_time = first(r'<span class=" t14 bold black">(.*?)</span>',
                          cols02[i]).strip()
        arr_parts = arr_time_airport.split(' ')
        arr_time = arr_parts[0].strip()
        arr_airport = arr_parts[-1].strip()
        if arr_time_airport.find('+1天') != -1:
            arr_time = arr_time.split('(')[0].strip()
            # Only a next-day arrival on the final leg moves the arrival date.
            if i == last_leg:
                days += 1

        during_time = first(r'(.*?)<br />', cols03[i]).strip()

        aircorps.append(aircorp)
        flight_nos.append(flight_no)
        plane_types.append(plane_type)
        airports.append(dept_airport)
        airports.append(arr_airport)
        # Three entries per leg: departure time, arrival time, duration.
        timeinfo.append(dept_time)
        timeinfo.append(arr_time)
        timeinfo.append(during_time)

    during = 0  # the timeshifter(timeinfo) call is disabled in this variant

    dept_date = datetime.datetime(int(date[0:4]), int(date[5:7]),
                                  int(date[8:]))
    dest_date = dept_date + datetime.timedelta(days)
    dept_daytime = date + 'T' + timeinfo[0] + ':00'
    # timeinfo[-2] is the arrival time of the last leg.
    dest_daytime = str(dest_date).split(' ')[0] + 'T' + timeinfo[-2] + ':00'

    price = first(r'</span>(.*?)</span>', cols04).strip()
    tax = first(r'参考税 ¥(.*?)<div class', cols04).strip()

    if flight.stop not in (1, 2):
        return []

    # Join the per-leg values.  The original appended flight_nos[2] to the
    # plane-type string for two-transfer itineraries; the third plane type
    # is used here.
    flight.flight_no = '_'.join(flight_nos)
    flight.plane_no = '_'.join(plane_types)
    flight.airline = '_'.join(aircorps)  # could also be changed to "multiple airlines"

    # Fall back to the raw airport text when it has no known id.
    if airports[0] in airports_dict:
        flight.dept_id = airports_dict[airports[0]]
    else:
        flight.dept_id = airports[0]
    if airports[-1] in airports_dict:
        flight.dest_id = airports_dict[airports[-1]]
    else:
        flight.dest_id = airports[-1]

    flight.dept_day = date
    flight.dept_time = dept_daytime
    flight.dest_time = dest_daytime
    flight.dur = during
    flight.price = int(price)
    flight.tax = int(tax)
    flight.surcharge = -1.0
    flight.currency = 'CNY'
    flight.seat_type = '经济舱'
    flight.source = 'elong::elong'
    flight.return_rule = 'NULL'
    return flight
else: return flights except Exception, e: logger.error('tongchengFlight:: Crawl this page failed' + str(e)) return flights if 'OriginDestinationOption' in content_json.keys(): for each_flight_json in content_json['OriginDestinationOption']: try: flight = Flight() flight_nums = len(each_flight_json['FlightSegment']) flight.flight_no = each_flight_json['FlightNos'].replace( '-', '_') flight.dept_id = each_flight_json['AirPorts'][:3] flight.dest_id = each_flight_json['AirPorts'][-3:] dept_time_tamp = each_flight_json['FlightSegment'][0][ 'DepartureDate'][6:-2] dest_time_tamp = each_flight_json['FlightSegment'][-1][ 'ArrivalDate'][6:-2] flight_time_json = each_flight_json['FlightSegment'] #parse eachflight content for each_flight_content in flight_time_json: try: eachflight = EachFlight() eachflight.airline = each_flight_content[ 'AirCompanyName'] #print eachflight.airline
def elong_page_parser(htmlcontent):
    """Parse an elong result page into ticket tuples and per-leg records.

    Args:
        htmlcontent: Page text.  The flight JSON is taken from the first
            match of ``flightsPattern`` and read via its ``FlightLegList``.

    Returns:
        A ``(tickets, flights)`` pair.  ``tickets`` is a list of 17-item
        tuples (flight_no, plane_no, airline, dept_id, dest_id, dept_day,
        dept_time, dest_time, dur, price, tax, surcharge, currency,
        seat_type, source, return_rule, stop), one per itinerary.
        ``flights`` maps ``'<fltno>_<dport>_<aport>'`` to an 8-item tuple
        (flight_no, airline, plane_no, dept_id, dest_id, dept_time,
        dest_time, dur), one per leg.  Both are empty when the page is the
        "page not found" error page or when any parsing error occurs.
    """
    tickets = []
    flights = {}
    if htmlcontent.find('您访问的页面不存在或暂时无法访问') != -1:
        return tickets, flights

    try:
        flights_json = flightsPattern.findall(htmlcontent)[0]
        allflights = json.loads(flights_json)['FlightLegList']
        for flightInfo in allflights:
            segs = flightInfo['segs']
            flight = Flight()
            flight.currency = 'CNY'
            flight.seat_type = '经济舱'
            flight.stop = len(segs) - 1
            flight.price = int(flightInfo['cabs'][0]['oprice'])
            flight.tax = int(flightInfo['tax'])
            flight.source = 'elong::elong'
            flight.dur = 0

            airlines = []
            plane_nos = []
            flight_nos = []
            for singleflightInfo in segs:
                eachFlight = EachFlight()
                eachFlight.flight_no = singleflightInfo['fltno']
                eachFlight.plane_no = singleflightInfo['plane']
                eachFlight.airline = singleflightInfo['corpn']
                eachFlight.dept_id = singleflightInfo['dport']
                eachFlight.dest_id = singleflightInfo['aport']
                # Original comment: convert to 2014-07-11T12:06:00
                eachFlight.dept_time = time_shifter(singleflightInfo['dtime'])
                eachFlight.dest_time = time_shifter(singleflightInfo['atime'])
                # 'ftime' is multiplied by 60 to get seconds.
                eachFlight.dur = int(singleflightInfo['ftime']) * 60
                eachFlight.flight_key = (eachFlight.flight_no + '_' +
                                         eachFlight.dept_id + '_' +
                                         eachFlight.dest_id)
                flights[eachFlight.flight_key] = (
                    eachFlight.flight_no, eachFlight.airline,
                    eachFlight.plane_no, eachFlight.dept_id,
                    eachFlight.dest_id, eachFlight.dept_time,
                    eachFlight.dest_time, eachFlight.dur)

                airlines.append(eachFlight.airline)
                plane_nos.append(eachFlight.plane_no)
                flight_nos.append(eachFlight.flight_no)
                flight.dur += eachFlight.dur

            # Add the waiting time between each pair of consecutive legs.
            for prev_seg, next_seg in zip(segs, segs[1:]):
                flight.dur += cal_wait_time(time_shifter(prev_seg['atime']),
                                            time_shifter(next_seg['dtime']))

            flight.flight_no = '_'.join(flight_nos)
            flight.plane_no = '_'.join(plane_nos)
            flight.airline = '_'.join(airlines)
            flight.dept_id = segs[0]['dport']
            flight.dest_id = segs[-1]['aport']
            flight.dept_time = time_shifter(segs[0]['dtime'])
            flight.dest_time = time_shifter(segs[-1]['atime'])
            flight.dept_day = flight.dept_time.split('T')[0]

            tickets.append((
                flight.flight_no, flight.plane_no, flight.airline,
                flight.dept_id, flight.dest_id, flight.dept_day,
                flight.dept_time, flight.dest_time, flight.dur, flight.price,
                flight.tax, flight.surcharge, flight.currency,
                flight.seat_type, flight.source, flight.return_rule,
                flight.stop))
    except Exception as e:
        # Any failure discards partial results, as in the original.
        logger.info(str(e))
        return [], {}

    # The original fell off the end here and returned None on success.
    return tickets, flights
def transferFlight_parser(flightstring, date, airports_dict):
    """Parse the HTML of one elong transfer (multi-leg) itinerary.

    Args:
        flightstring: HTML fragment of the itinerary.  The ``cols01``-
            ``cols03`` cells occur once per leg; ``cols04``-``cols06`` occur
            once for the whole itinerary.
        date: Departure date string.  It is sliced as ``date[0:4]``,
            ``date[5:7]`` and ``date[8:]`` for year, month and day, and is
            also used verbatim as the departure day.
        airports_dict: Mapping from the airport text found in the page to an
            airport id.  Text that is not a key is used as the id unchanged.

    Returns:
        For itineraries with one or two transfers, a 17-item tuple
        (flight_no, plane_no, airline, dept_id, dest_id, dept_day,
        dept_time, dest_time, dur, price, tax, surcharge, currency,
        seat_type, source, return_rule, stop), with per-leg flight numbers,
        plane types and airlines joined by ``'_'``.  An empty list for any
        other number of legs.

    Raises:
        IndexError: If an expected cell or field is missing from the HTML.
        ValueError: If the date, price or tax text is not numeric.
    """
    flight = Flight()

    def cells(css_class):
        # All cells of the given column class, in page order.
        return re.compile(r'<td class="%s">(.*?)</td>' % css_class,
                          re.S).findall(flightstring)

    def first(pattern, text):
        # First capture of `pattern` in `text`; IndexError when absent.
        return re.compile(pattern, re.S).findall(text)[0]

    cols01 = cells('cols01')
    cols02 = cells('cols02')
    cols03 = cells('cols03')
    cols04 = cells('cols04')[0]
    # cols05/cols06 are not used below, but an itinerary lacking them is
    # still rejected with IndexError exactly as before.
    for required in ('cols05', 'cols06'):
        cells(required)[0]

    flight.stop = len(cols01) - 1
    if flight.stop > 2:
        return []  # for now, itineraries with more than two transfers are dropped

    aircorps = []
    flight_nos = []
    plane_types = []
    airports = []
    timeinfo = []
    days = 0
    last_leg = len(cols01) - 1
    for i, leg_cell in enumerate(cols01):
        aircorp = first(r'</span>(.*?)<br />', leg_cell).strip()
        flight_no = first(r'<br />(.*?) ', leg_cell).strip()
        plane_type = first(r'method="PlaneType" >(.*?)</a>',
                           leg_cell).strip()

        dept_airport = first(r'</span>(.*?)<br />', cols02[i]).strip()
        if dept_airport.find('+2天') != -1:
            days += 2
        elif dept_airport.find('+1天') != -1:
            days += 1

        arr_time_airport = first(r'<br />(.*?)$', cols02[i]).strip()
        dept_time = first(r'<span class=" t14 bold black">(.*?)</span>',
                          cols02[i]).strip()
        arr_parts = arr_time_airport.split(' ')
        arr_time = arr_parts[0].strip()
        arr_airport = arr_parts[-1].strip()
        if arr_time_airport.find('+1天') != -1:
            arr_time = arr_time.split('(')[0].strip()
            # Only a next-day arrival on the final leg moves the arrival date.
            if i == last_leg:
                days += 1

        during_time = first(r'(.*?)<br />', cols03[i]).strip()

        aircorps.append(aircorp)
        flight_nos.append(flight_no)
        plane_types.append(plane_type)
        airports.append(dept_airport)
        airports.append(arr_airport)
        # Three entries per leg: departure time, arrival time, duration.
        timeinfo.append(dept_time)
        timeinfo.append(arr_time)
        timeinfo.append(during_time)

    during = timeshifter(timeinfo)

    dept_date = datetime.datetime(int(date[0:4]), int(date[5:7]),
                                  int(date[8:]))
    dest_date = dept_date + datetime.timedelta(days)
    dept_daytime = date + 'T' + timeinfo[0] + ':00'
    # timeinfo[-2] is the arrival time of the last leg.
    dest_daytime = str(dest_date).split(' ')[0] + 'T' + timeinfo[-2] + ':00'

    price = first(r'</span>(.*?)</span>', cols04).strip()
    tax = first(r'参考税 ¥(.*?)<div class', cols04).strip()

    if flight.stop not in (1, 2):
        return []

    # Join the per-leg values.  The original appended flight_nos[2] to the
    # plane-type string for two-transfer itineraries; the third plane type
    # is used here.
    flight.flight_no = '_'.join(flight_nos)
    flight.plane_no = '_'.join(plane_types)
    flight.airline = '_'.join(aircorps)  # could also be changed to "multiple airlines"

    # Fall back to the raw airport text when it has no known id.
    if airports[0] in airports_dict:
        flight.dept_id = airports_dict[airports[0]]
    else:
        flight.dept_id = airports[0]
    if airports[-1] in airports_dict:
        flight.dest_id = airports_dict[airports[-1]]
    else:
        flight.dest_id = airports[-1]

    flight.dept_day = date
    flight.dept_time = dept_daytime
    flight.dest_time = dest_daytime
    flight.dur = during
    flight.price = int(price)
    flight.tax = int(tax)
    flight.surcharge = -1.0
    flight.currency = 'CNY'
    flight.seat_type = '经济舱'
    flight.source = 'elong::elong'
    flight.return_rule = 'NULL'

    return (flight.flight_no, flight.plane_no, flight.airline,
            flight.dept_id, flight.dest_id, flight.dept_day,
            flight.dept_time, flight.dest_time, flight.dur, flight.price,
            flight.tax, flight.surcharge, flight.currency, flight.seat_type,
            flight.source, flight.return_rule, flight.stop)
def ceair_page_parser(content):
    """Parse a ceair JSON response into ticket tuples and per-leg records.

    Args:
        content: Response text; everything from the first ``'{'`` onward is
            decoded as JSON.

    Returns:
        A ``(tickets, flights)`` pair.  ``tickets`` holds one 17-item tuple
        (flight_no, plane_no, airline, dept_id, dest_id, dept_day,
        dept_time, dest_time, dur, price, tax, surcharge, currency,
        seat_type, source, return_rule, stop) per non-empty economy or
        business price of each routing.  ``flights`` maps
        ``'<flightNo>_<deptCd>_<arrCd>'`` to an 8-item per-leg tuple.  Both
        are empty when ``resultMsg`` is not the empty string.
    """
    flights = {}
    tickets = []

    infos = json.loads(content[content.find('{'):])
    if infos['resultMsg'] != '':
        return tickets, flights

    currency = infos['currency']
    for one_flight in infos['tripItemList'][0]['airRoutingList']:
        legs = one_flight['flightList']
        first_leg = legs[0] if False else None  # placeholder removed below
        flight = Flight()
        flight.source = 'ceair::ceair'
        flight.stop = len(legs) - 1
        flight.currency = currency
        flight.dept_id = legs[0]['deptCd']
        flight.dest_id = legs[-1]['arrCd']
        flight.dept_time = standard_timeformatter(legs[0]['deptTime'])
        flight.dest_time = standard_timeformatter(legs[-1]['arrTime'])
        flight.dept_day = legs[0]['deptTime'].split(' ')[0]

        leg_numbers = []
        leg_planes = []
        leg_airlines = []
        leg_durations = []
        leg_waits = []
        for item in legs:
            leg = EachFlight()
            leg.flight_no = item['flightNo']
            leg.airline = '东方航空'
            leg.plane_no = item['acfamily']
            leg.dept_id = item['deptCd']
            leg.dest_id = item['arrCd']
            leg.dept_time = standard_timeformatter(item['deptTime'])
            leg.dest_time = standard_timeformatter(item['arrTime'])
            leg.dur = hm_to_sec(item['duration'])
            leg.flight_key = leg.flight_no + '_' + leg.dept_id + '_' + leg.dest_id
            flights[leg.flight_key] = (leg.flight_no, leg.airline,
                                       leg.plane_no, leg.dept_id,
                                       leg.dest_id, leg.dept_time,
                                       leg.dest_time, leg.dur)

            leg_numbers.append(leg.flight_no)
            leg_planes.append(leg.plane_no)
            leg_airlines.append(leg.airline)
            leg_durations.append(leg.dur)
            leg_waits.append(hm_to_sec(item['stayTime']))

        # Per-leg values are joined with '_' in leg order.
        flight.flight_no = '_'.join(leg_numbers)
        flight.plane_no = '_'.join(leg_planes)
        flight.airline = '_'.join(leg_airlines)
        # Total duration: all leg durations first, then all stay times.
        flight.dur = sum(leg_waits, sum(leg_durations))

        def snapshot():
            # Current state of `flight` as the 17-item ticket tuple.
            return (flight.flight_no, flight.plane_no, flight.airline,
                    flight.dept_id, flight.dest_id, flight.dept_day,
                    flight.dept_time, flight.dest_time, flight.dur,
                    flight.price, flight.tax, flight.surcharge,
                    flight.currency, flight.seat_type, flight.source,
                    flight.return_rule, flight.stop)

        prices = one_flight['priceDisp']
        if prices['economy'] != '':
            flight.seat_type = '经济舱'
            flight.price = int(one_flight['priceDisp']['economy'])
            tickets.append(snapshot())
        if one_flight['priceDisp']['business'] != '':
            flight.seat_type = '商务舱'
            flight.price = int(one_flight['priceDisp']['business'])
            tickets.append(snapshot())

    return tickets, flights
[1].xpath('text()')[0]) for p in price_list: price += p except Exception, e: logger.error('airfranceFlight :: price_value class not found!') result['error'] = PARSE_ERROR return result price = float(price) flight = Flight() flight.tax = 0 flight.flight_no = flight_no flight.plane_type = plane_type flight.flight_corp = flight_corp flight.dept_id = dept_id flight.dest_id = dest_id flight.dept_day = dept_day flight.dept_time = dept_time flight.dest_time = dest_time flight.dur = dur flight.price = price flight.currency = currency flight.seat_type = seat_type flight.real_class = real_class flight.stop_id = stop_id flight.stop_time = stop_time flight.daydiff = daydiff flight.source = source flight.stop = stop flight_tuple = (flight.flight_no,flight.plane_type,flight.flight_corp,flight.dept_id,flight.dest_id,flight.dept_day,\
flight = Flight() flight.currency = flight_info['coinType'] flight.price = int(flight_info['totalFare']) flight.tax = int(flight_info['totalTax']) flight.seat_type = '经济舱' flight.stop = int(flight_info['transfer']) if flight.stop > 1: print 'found a flight whose transfer_times > 1' continue flight.source = 'lcair::lcair' flight.dept_id, flight.dest_id = flight_info['routeStr'].split('-')[0], flight_info['routeStr'].split('-')[-1] flight.dept_day = flight_info['fromDate'] flight.flight_no = '' flight.airline = '' flight.plane_no = '' flight_dur = 0 #direct if flight.stop == 0: for single_flight in segments[0]['flights']: flight.flight_no = single_flight['flightNumber'] try: flight.airline = Airline[single_flight['airCo']]
def ValidatePage(content, dept_year, flight_no, orig_dept_time):
    '''Look up the price of one specific itinerary in a result page.

    Every itinerary matched by ``each_flight_content_pat`` is parsed in turn;
    the first one whose joined flight number equals ``flight_no`` and whose
    departure timestamp equals ``orig_dept_time`` supplies the result.

    Args:
        content: Raw page text to scan.
        dept_year: Year as a string; it is prefixed to the month and day
            captured from the page to build ``YYYY-MM-DD`` dates.
        flight_no: Flight number to look for, segments joined with ``_``.
        orig_dept_time: Departure time to look for, formatted as
            ``YYYY-MM-DDTHH:MM:SS``.

    Returns:
        The matching itinerary's price as an int, or -1 when nothing matches.
    '''
    time_format = '%Y-%m-%dT%H:%M:%S'
    result = -1

    for each_flight_text in each_flight_content_pat.findall(content):
        flight = Flight()
        try:
            t_price = all_price_pat.findall(each_flight_text)[0]
            each_flight_text_temp = each_flight_content_temp_pat.findall(
                each_flight_text)[0]
            each_part_flight = each_part_flight_pat.findall(
                each_flight_text_temp)
            if not each_part_flight:
                continue

            # Origin comes from the first segment, destination from the last.
            flight.dept_id = airport_pat.findall(
                each_part_flight[0])[0][1:-1]
            flight.dest_id = airport_pat.findall(
                each_part_flight[-1])[-1][1:-1]

            dept_time_temp = dept_time_temp_pat.findall(
                each_part_flight[0])[0]
            dest_time_temp = dest_time_temp_pat.findall(
                each_part_flight[-1])[-1]
            month = dept_time_temp[0].strip()
            flight.dept_day = dept_year + '-' + month + '-' + \
                dept_time_temp[1].strip()
            flight.dept_time = flight.dept_day + 'T' + \
                dept_time_temp[2].strip() + ':00'
            # NOTE(review): the arrival date reuses the departure year and
            # month, so an arrival in the following month would be mis-dated
            # - confirm against the page format.
            flight.dest_time = dept_year + '-' + month + '-' + \
                dest_time_temp[0].strip() + 'T' + \
                dest_time_temp[1].strip()[-5:] + ':00'

            # The parsed values are not needed here; parsing is kept purely
            # as validation so a malformed timestamp raises and the
            # itinerary is skipped by the handler below.
            datetime.datetime.strptime(flight.dept_time, time_format)
            datetime.datetime.strptime(flight.dest_time, time_format)

            # When the price_pat match has more than one element its first
            # element is the price; otherwise fall back to the all_price_pat
            # value.
            price_match = price_pat.findall(each_flight_text)[0]
            if len(price_match) > 1:
                flight.price = int(price_match[0])
            else:
                flight.price = int(t_price)

            flight.flight_no = '_'.join(
                [flight_no_pat.findall(segment)[0][:8].replace(' ', '')
                 for segment in each_part_flight])

            if flight.flight_no == flight_no and \
                    flight.dept_time == orig_dept_time:
                result = flight.price
                break
        except Exception:
            # Best effort: an itinerary that cannot be parsed is skipped.
            continue

    return result
def directFlight_parser(flightstring,date,airports_dict):
    '''Parse the HTML of one direct-flight result row into a Flight.

    Args:
        flightstring: HTML of a single result row; it must contain the
            ``cols01`` .. ``cols06`` table cells.
        date: Departure date string. Characters 0-4, 5-7 and 8 onward are
            read as year, month and day.
        airports_dict: Mapping from airport names as shown on the page to
            airport ids. Names missing from the mapping are used unchanged.

    Returns:
        A Flight with flight number, aircraft, airline, airports, times,
        price and tax filled in. ``dur`` is left at 0 and ``stop`` at 0.

    Raises:
        IndexError: if an expected cell or field is missing from the row.
    '''
    def first_match(pattern, text):
        # First capture of ``pattern`` in ``text``; IndexError when absent.
        return re.findall(pattern, text, re.S)[0]

    # For a direct flight each cell pattern yields a single match.
    # All six cells are extracted even though only the first four are read,
    # so a row missing any cell still raises IndexError as before.
    cells = [first_match(r'<td class="cols0%d">(.*?)</td>' % number, flightstring)
             for number in range(1, 7)]
    cols01, cols02, cols03, cols04 = cells[:4]

    aircorp = first_match(r'</span>(.*?)<br />', cols01).strip()
    flight_no = first_match(r'<br />(.*?) ', cols01).strip()
    plane_type = first_match(r'method="PlaneType" >(.*?)</a>', cols01).strip()

    dept_airport = first_match(r'</span>(.*?)<br />', cols02).strip()
    dept_time = first_match(r'<span class=" t14 bold black">(.*?)</span>', cols02).strip()
    arr_time_airport = first_match(r'<br />(.*?)$', cols02).strip()

    # The arrival text is "<time> ... <airport>"; a next-day arrival carries
    # a '+1天' marker and a parenthesised suffix on the time token.
    arrival_tokens = arr_time_airport.split(' ')
    arr_time = arrival_tokens[0].strip()
    arr_airport = arrival_tokens[-1].strip()
    days = 0
    if '+1天' in arr_time_airport:
        days = 1
        arr_time = arr_time.split('(')[0].strip()

    # The duration text is not used (dur stays 0), but the lookup is kept so
    # a cols03 cell without '<br />' still raises IndexError.
    first_match(r'(.*?)<br />', cols03)

    dept_date = datetime.datetime(int(date[0:4]), int(date[5:7]), int(date[8:]))
    dest_date = dept_date + datetime.timedelta(days)
    dept_daytime = date + 'T' + dept_time + ':00'
    dest_daytime = dest_date.date().isoformat() + 'T' + arr_time + ':00'

    price = first_match(r'</span>(.*?)</span>', cols04).strip()
    tax = first_match(r'参考税 ¥(.*?)<div class', cols04).strip()

    flight = Flight()
    flight.flight_no = flight_no
    flight.plane_no = plane_type
    flight.airline = aircorp
    # Fall back to the displayed name when no id is known for an airport.
    flight.dept_id = airports_dict.get(dept_airport, dept_airport)
    flight.dest_id = airports_dict.get(arr_airport, arr_airport)
    flight.dept_day = date
    flight.dept_time = dept_daytime
    flight.dest_time = dest_daytime
    flight.dur = 0
    flight.price = float(price)
    flight.tax = float(tax)
    flight.surcharge = -1.0
    flight.currency = 'CNY'
    flight.seat_type = '经济舱'
    flight.source = 'elong::elong'
    flight.return_rule = 'NULL'
    flight.stop = 0
    return flight
def vuelingparser(content,flight_no,req_dept_time):
    '''Return the price of one specific flight found in a Vueling page.

    The page is scanned for flight numbers, stations, times and prices.
    Each itinerary is assumed to carry three fares (economy, super economy,
    business, in that order), so itinerary data is repeated three times to
    line up with the flat price list.

    Args:
        content: Raw page text to scan.
        flight_no: Flight number to look for; two legs are joined with ``_``.
        req_dept_time: Departure time to look for, compared verbatim with
            the value captured by ``dept_time_pat``.

    Returns:
        The price string of the first fare whose flight number and
        departure time both match, or -1 when there is no such fare.

    Raises:
        IndexError: if the extracted lists are inconsistent in length.
    '''
    flight_num_info_temp = flight_no_pat.findall(content)
    if not flight_num_info_temp:
        return -1

    # Flight number per itinerary: the text after '|' and, for connecting
    # flights, the text after the last '~^'.
    flight_num_list = []
    for flight_num_info in flight_num_info_temp:
        first_leg_at = flight_num_info.find('|')
        second_leg_at = flight_num_info.rfind('~^')
        flight_num = flight_num_info[first_leg_at + 1:first_leg_at + 8].replace('~', '')
        if second_leg_at > 0:
            flight_num += '_' + \
                flight_num_info[second_leg_at + 2:second_leg_at + 9].replace('~', '')
        flight_num_list.append(flight_num)

    # Station codes are the three characters before '):' and the last ')'.
    dept_id_list = []
    dest_id_list = []
    for station_temp_a in station_temp_pat.findall(content):
        station_info = station_temp_a.replace('\n', '').replace(' ', '')
        dept_id_num = station_info.find('):')
        dest_id_num = station_info.rfind(')')
        dept_id_list.append(station_info[dept_id_num - 3:dept_id_num])
        dest_id_list.append(station_info[dest_id_num - 3:dest_id_num])

    dept_time_list = []
    dest_time_list = []
    stops_list = []
    for time_temp in flight_time_pat.findall(content):
        dept_time_list.append(dept_time_pat.findall(time_temp)[0])
        dest_time_list.append(dest_time_pat.findall(time_temp)[0])
        stops_list.append(flight_num_pat.findall(time_temp)[0])

    # Price text follows the last '>' and ends three characters before the
    # end of the match; the decimal comma becomes a dot.
    price_list = [price_temp[price_temp.rfind('>') + 1:-3].replace(',', '.')
                  for price_temp in price_pat.findall(content)]

    seat_type_list = ['经济舱', '超经济舱', '公务舱']
    seat_type = [seat_type_list[i % 3] for i in range(len(price_list))]

    # Repeat each itinerary three times so index i lines up with price i.
    flight_no_l, dept_id_l, dest_id_l = [], [], []
    dept_time_l, dest_time_l, stops_l = [], [], []
    for j in range(len(stops_list)):
        for _ in range(3):
            flight_no_l.append(flight_num_list[j])
            dept_id_l.append(dept_id_list[j])
            dest_id_l.append(dest_id_list[j])
            dept_time_l.append(dept_time_list[j])
            dest_time_l.append(dest_time_list[j])
            stops_l.append(stops_list[j])

    def to_time_struct(stamp):
        # Split the timestamp on 'T', '-', ':' and ',' and pad with zeros
        # before handing it to date_handle, whose result goes to time.mktime.
        fields = str(stamp).replace('T', ',').replace('-', ',') \
            .replace(':', ',').split(',') + [0, 0, 0]
        return date_handle(fields)

    for i in range(len(price_list)):
        flight = Flight()
        flight.flight_no = flight_no_l[i]
        flight.plane_no = 'NULL'
        flight.airline = 'vueling'
        flight.dept_id = dept_id_l[i]
        flight.dest_id = dest_id_l[i]
        flight.dept_time = dept_time_l[i]
        flight.dest_time = dest_time_l[i]
        dept_time_t = to_time_struct(dept_time_l[i])
        dest_time_t = to_time_struct(dest_time_l[i])
        flight.dur = int(time.mktime(dest_time_t)) - int(time.mktime(dept_time_t))
        flight.price = price_list[i]
        flight.dept_day = flight.dept_time[:10]
        flight.currency = 'EUR'
        flight.seat_type = seat_type[i]
        flight.source = 'vueling:vueling'
        flight.stop = stops_l[i]

        if flight.flight_no == flight_no and flight.dept_time == req_dept_time:
            return flight.price

    # Flights were found but none matched: report -1 rather than falling
    # off the end of the function and returning None.
    return -1
def vuelingparser(content, flight_no, req_dept_time):
    '''Return the price of one specific flight found in a Vueling page.

    The page is scanned for flight numbers, stations, times and prices.
    Each itinerary is assumed to carry three fares (economy, super economy,
    business, in that order), so itinerary data is repeated three times to
    line up with the flat price list.

    Args:
        content: Raw page text to scan.
        flight_no: Flight number to look for; two legs are joined with ``_``.
        req_dept_time: Departure time to look for, compared verbatim with
            the value captured by ``dept_time_pat``.

    Returns:
        The price string of the first fare whose flight number and
        departure time both match, or -1 when there is no such fare.

    Raises:
        IndexError: if the extracted lists are inconsistent in length.
    '''
    flight_num_info_temp = flight_no_pat.findall(content)
    if not flight_num_info_temp:
        return -1

    # Flight number per itinerary: the text after '|' and, for connecting
    # flights, the text after the last '~^'.
    flight_num_list = []
    for flight_num_info in flight_num_info_temp:
        first_leg_at = flight_num_info.find('|')
        second_leg_at = flight_num_info.rfind('~^')
        flight_num = flight_num_info[first_leg_at + 1:first_leg_at + 8].replace(
            '~', '')
        if second_leg_at > 0:
            flight_num += '_' + \
                flight_num_info[second_leg_at + 2:second_leg_at + 9].replace(
                    '~', '')
        flight_num_list.append(flight_num)

    # Station codes are the three characters before '):' and the last ')'.
    dept_id_list = []
    dest_id_list = []
    for station_temp_a in station_temp_pat.findall(content):
        station_info = station_temp_a.replace('\n', '').replace(' ', '')
        dept_id_num = station_info.find('):')
        dest_id_num = station_info.rfind(')')
        dept_id_list.append(station_info[dept_id_num - 3:dept_id_num])
        dest_id_list.append(station_info[dest_id_num - 3:dest_id_num])

    dept_time_list = []
    dest_time_list = []
    stops_list = []
    for time_temp in flight_time_pat.findall(content):
        dept_time_list.append(dept_time_pat.findall(time_temp)[0])
        dest_time_list.append(dest_time_pat.findall(time_temp)[0])
        stops_list.append(flight_num_pat.findall(time_temp)[0])

    # Price text follows the last '>' and ends three characters before the
    # end of the match; the decimal comma becomes a dot.
    price_list = [
        price_temp[price_temp.rfind('>') + 1:-3].replace(',', '.')
        for price_temp in price_pat.findall(content)
    ]

    seat_type_list = ['经济舱', '超经济舱', '公务舱']
    seat_type = [seat_type_list[i % 3] for i in range(len(price_list))]

    # Repeat each itinerary three times so index i lines up with price i.
    flight_no_l, dept_id_l, dest_id_l = [], [], []
    dept_time_l, dest_time_l, stops_l = [], [], []
    for j in range(len(stops_list)):
        for _ in range(3):
            flight_no_l.append(flight_num_list[j])
            dept_id_l.append(dept_id_list[j])
            dest_id_l.append(dest_id_list[j])
            dept_time_l.append(dept_time_list[j])
            dest_time_l.append(dest_time_list[j])
            stops_l.append(stops_list[j])

    def to_time_struct(stamp):
        # Split the timestamp on 'T', '-', ':' and ',' and pad with zeros
        # before handing it to date_handle, whose result goes to time.mktime.
        fields = str(stamp).replace('T', ',').replace('-', ',').replace(
            ':', ',').split(',') + [0, 0, 0]
        return date_handle(fields)

    for i in range(len(price_list)):
        flight = Flight()
        flight.flight_no = flight_no_l[i]
        flight.plane_no = 'NULL'
        flight.airline = 'vueling'
        flight.dept_id = dept_id_l[i]
        flight.dest_id = dest_id_l[i]
        flight.dept_time = dept_time_l[i]
        flight.dest_time = dest_time_l[i]
        dept_time_t = to_time_struct(dept_time_l[i])
        dest_time_t = to_time_struct(dest_time_l[i])
        flight.dur = int(time.mktime(dest_time_t)) - int(
            time.mktime(dept_time_t))
        flight.price = price_list[i]
        flight.dept_day = flight.dept_time[:10]
        flight.currency = 'EUR'
        flight.seat_type = seat_type[i]
        flight.source = 'vueling:vueling'
        flight.stop = stops_l[i]

        if flight.flight_no == flight_no and flight.dept_time == req_dept_time:
            return flight.price

    # Flights were found but none matched: report -1 rather than falling
    # off the end of the function and returning None.
    return -1
def parsePage(content,dept_year, flight_no, orig_dept_time):
    '''Look up the price of one specific itinerary in a result page.

    Every itinerary matched by ``each_flight_content_pat`` is parsed into a
    Flight; the first one whose joined flight number equals ``flight_no``
    and whose departure timestamp equals ``orig_dept_time`` supplies the
    result.

    Args:
        content: Raw page text to scan.
        dept_year: Year as a string; it is prefixed to the month and day
            captured from the page to build ``YYYY-MM-DD`` dates.
        flight_no: Flight number to look for, segments joined with ``_``.
        orig_dept_time: Departure time to look for, formatted as
            ``YYYY-MM-DDTHH:MM:SS``.

    Returns:
        The matching itinerary's price as an int, or -1 when nothing matches.
    '''
    def to_epoch(stamp):
        # Seconds since the epoch for a 'YYYY-MM-DDTHH:MM:SS' local time.
        parsed = datetime.datetime.strptime(stamp, '%Y-%m-%dT%H:%M:%S')
        return int(time.mktime(parsed.timetuple()))

    result = -1

    for each_flight_text in each_flight_content_pat.findall(content):
        flight = Flight()
        try:
            t_price = all_price_pat.findall(each_flight_text)[0]
            each_flight_text_temp = each_flight_content_temp_pat.findall(each_flight_text)[0]
            each_part_flight = each_part_flight_pat.findall(each_flight_text_temp)
            if not each_part_flight:
                continue

            # Origin comes from the first segment, destination from the last.
            flight.dept_id = airport_pat.findall(each_part_flight[0])[0][1:-1]
            flight.dest_id = airport_pat.findall(each_part_flight[-1])[-1][1:-1]

            dept_time_temp = dept_time_temp_pat.findall(each_part_flight[0])[0]
            dest_time_temp = dest_time_temp_pat.findall(each_part_flight[-1])[-1]
            month = dept_time_temp[0].strip()
            flight.dept_day = dept_year + '-' + month + '-' + dept_time_temp[1].strip()
            flight.dept_time = flight.dept_day + 'T' + dept_time_temp[2].strip() + ':00'
            # NOTE(review): the arrival date reuses the departure year and
            # month, so an arrival in the following month would be mis-dated
            # - confirm against the page format.
            flight.dest_time = dept_year + '-' + month + '-' + \
                dest_time_temp[0].strip() + 'T' + dest_time_temp[1].strip()[-5:] + ':00'

            # NOTE(review): the extra 3600 s looks like a fixed one-hour
            # offset between origin and destination - confirm.
            flight.dur = to_epoch(flight.dest_time) - to_epoch(flight.dept_time) + 3600
            flight.stop = len(each_part_flight) - 1

            # When the price_pat match has more than one element its first
            # element is the price; otherwise fall back to the all_price_pat
            # value.
            price_match = price_pat.findall(each_flight_text)[0]
            if len(price_match) > 1:
                flight.price = int(price_match[0])
            else:
                flight.price = int(t_price)

            try:
                flight.tax = int(t_price) - flight.price
            except (TypeError, ValueError):
                flight.tax = -1.0
                logger.info('Can not parse tax info!')

            # Per-segment values are joined with '_'.
            flight.flight_no = '_'.join(
                [flight_no_pat.findall(segment)[0][:8].replace(' ', '')
                 for segment in each_part_flight])
            flight.plane_no = '_'.join(
                [plane_no_pat.findall(segment)[0].replace(' ', '')
                 for segment in each_part_flight])
            flight.airline = '_'.join(
                [airline_pat.findall(segment)[0].replace(' ', '')
                 for segment in each_part_flight])

            flight.return_rule = return_rule_pat.findall(each_flight_text)[0] \
                .replace('<p>', '').replace('\n', '') \
                .replace('。', '').replace('</p>', '。').strip().replace(' ', '')
            flight.currency = 'CNY'
            flight.source = 'feifan::feifan'
            flight.seat_type = '经济舱'

            if flight.flight_no == flight_no and flight.dept_time == orig_dept_time:
                result = flight.price
                break
        except Exception:
            # Best effort: an itinerary that cannot be parsed is skipped.
            continue

    return result