def parse(self, response):
    """Yield one timetable item per flight row of a Pulkovo (LED) board page.

    Rows come from the ``tablo tabloBigNew bigTableZebra`` table, excluding
    rows 1 and 8 and then keeping every other remaining row.  The second
    cell holds the remote endpoint as ``City (Airport)``; the local endpoint
    is always St. Petersburg / Pulkovo.  The terminal is the last character
    of the request URL.

    Rows whose second cell carries no usable text are skipped instead of
    raising ``IndexError`` and losing the rest of the page.
    """
    hxs = HtmlXPathSelector(response)
    # flight_type: 0 - arrival; 1 - departure
    flight_type = 0 if response.request.url in self.start_urls[:2] else 1
    # Field-name suffixes for the remote (scraped) and local (fixed) ends.
    if flight_type:
        remote, local = 'arrival', 'departure'
    else:
        remote, local = 'departure', 'arrival'

    # Loop-invariant: decode only when the URL is a byte string, so a
    # text-type URL does not break on ``.decode``.
    terminal = response.request.url[-1:]
    if isinstance(terminal, bytes):
        terminal = terminal.decode('utf-8')

    rows = hxs.select(
        '//table[@class="tablo tabloBigNew bigTableZebra"]'
        '/tr[position() != 1 and position() != 8]')
    for flight in rows[::2]:
        cell_text = flight.select('td[2]//text()').extract()
        parts = re.findall(r'[^\(\)]+', cell_text[0], re.U) if cell_text else []
        if not parts:
            # No city/airport text: not a usable flight row.
            continue
        if len(parts) == 2:
            city, airport = parts
        else:
            # No "(Airport)" suffix, or an unexpected shape: keep city only.
            city, airport = parts[0], u''

        loader = TimetableLoader(item=TimetableItem(), selector=flight)
        loader.add_xpath('flight', 'td[1]//text()')
        loader.add_xpath('datetime_scheduled', 'td[3]//text()')
        loader.add_xpath('datetime_actual', 'td[4]//text()')
        loader.add_xpath('flight_status', 'td[5]//text()')
        loader.add_xpath('airline', 'td[6]//text()')
        loader.add_value('city_of_%s' % remote, city)
        loader.add_value('airport_of_%s' % remote, airport)
        loader.add_value('city_of_%s' % local, u'Санкт-Петербург')
        loader.add_value('airport_of_%s' % local, u'Пулково')
        loader.add_value('terminal', terminal)
        loader.add_value('airport', u'LED')
        yield loader.load_item()
def parse(self, response):
    """Yield one timetable item per row of a Vnukovo (VKO) timetable page.

    The first nine cells of each ``TimeTable`` body row map positionally to
    item fields.  The tenth cell is the check-in desk for departures and a
    comment for arrivals; the other of those two fields is set to ``u''``.
    The remote endpoint cell, formatted as ``City (Airport)``, is split into
    city and airport, while the local endpoint is always Moscow / Vnukovo.

    Only the remote endpoint is parsed: the local one is overwritten with
    constants anyway, so a missing or empty local cell no longer raises.
    """
    hxs = HtmlXPathSelector(response)
    # flight_type: 0 - arrival; 1 - departure
    flight_type = 0 if response.request.url == self.start_urls[0] else 1
    cell_fields = ('flight', 'airline', 'airport_of_departure',
                   'airport_of_arrival', 'flight_status',
                   'datetime_scheduled', 'datetime_estimated',
                   'datetime_actual', 'terminal')
    if flight_type:
        tenth_cell_field, blank_field = 'checkin_desk', 'comment'
        remote, local = 'arrival', 'departure'
    else:
        tenth_cell_field, blank_field = 'comment', 'checkin_desk'
        remote, local = 'departure', 'arrival'

    for flight in hxs.select('//table[@id="TimeTable"]/tbody/tr'):
        loader = TimetableLoader(item=TimetableItem(), selector=flight)
        for idx, field in enumerate(cell_fields, start=1):
            loader.add_xpath(field, 'td[%s]//text()' % idx)
        loader.add_xpath(tenth_cell_field, 'td[10]//text()')
        loader.add_value('airport', u'VKO')
        item = loader.load_item()
        item[blank_field] = u''
        item['flight_type'] = flight_type

        # The loader leaves the raw "City (Airport)" text in the airport
        # field; it is absent from the item when the cell was empty.
        remote_text = item.get('airport_of_%s' % remote) or u''
        parts = re.findall(r'[^\(\)]+', remote_text, re.U)
        if len(parts) == 2:
            city, airport = parts
        elif parts:
            city, airport = parts[0], u''
        else:
            city, airport = u'', u''

        item['city_of_%s' % remote] = city
        item['airport_of_%s' % remote] = airport
        item['city_of_%s' % local] = u'Москва'
        item['airport_of_%s' % local] = u'Внуково'
        yield item
def parse(self, response):
    """Scrape a Vnukovo (VKO) timetable page into timetable items.

    Each ``TimeTable`` body row yields one item.  Cells 1-9 are loaded
    positionally; cell 10 goes to ``checkin_desk`` for departures and to
    ``comment`` for arrivals, with the other field set to ``u""``.  The
    remote endpoint text (``City (Airport)``) is split into separate city
    and airport values, and the local endpoint is fixed to Moscow / Vnukovo.

    The local endpoint cell is never parsed, because its value is replaced
    by constants; this avoids a ``KeyError``/``IndexError`` when that cell
    is missing or empty.
    """
    hxs = HtmlXPathSelector(response)
    # flight_type: 0 - arrival; 1 - departure
    flight_type = 0 if response.request.url == self.start_urls[0] else 1
    positional_fields = (
        "flight",
        "airline",
        "airport_of_departure",
        "airport_of_arrival",
        "flight_status",
        "datetime_scheduled",
        "datetime_estimated",
        "datetime_actual",
        "terminal",
    )
    # Departures: cell 10 is the check-in desk and the remote end is the
    # arrival airport.  Arrivals: cell 10 is a comment and the remote end
    # is the departure airport.
    if flight_type:
        extra_field, emptied_field = "checkin_desk", "comment"
        far_end, near_end = "arrival", "departure"
    else:
        extra_field, emptied_field = "comment", "checkin_desk"
        far_end, near_end = "departure", "arrival"

    for flight in hxs.select('//table[@id="TimeTable"]/tbody/tr'):
        loader = TimetableLoader(item=TimetableItem(), selector=flight)
        for idx, field in enumerate(positional_fields, start=1):
            loader.add_xpath(field, "td[%s]//text()" % idx)
        loader.add_xpath(extra_field, "td[10]//text()")
        loader.add_value("airport", u"VKO")

        item = loader.load_item()
        item[emptied_field] = u""
        item["flight_type"] = flight_type

        # An empty cell leaves the field out of the loaded item entirely.
        raw_far_end = item.get("airport_of_%s" % far_end) or u""
        pieces = re.findall(r"[^\(\)]+", raw_far_end, re.U)
        if len(pieces) == 2:
            city, airport = pieces
        else:
            # Zero pieces (empty cell) or an unexpected shape: keep at most
            # the first piece as the city and leave the airport blank.
            city, airport = (pieces[0] if pieces else u""), u""

        item["city_of_%s" % far_end] = city
        item["airport_of_%s" % far_end] = airport
        item["city_of_%s" % near_end] = u"Москва"
        item["airport_of_%s" % near_end] = u"Внуково"
        yield item
def parse_main_contents(self, flight, response, flight_type):
    """Build a Domodedovo (DME) item from a board row and request its details.

    Args:
        flight: selector for one table row; cells 1, 3, 4 and 6 supply the
            flight number, scheduled time, actual time and status.
        response: unused; kept so existing callers keep working.
        flight_type: value stored in the item's ``flight_type`` field.

    Yields:
        A ``Request`` for the row's details page, carrying the partially
        filled item in ``meta['item']`` for ``parse_url_contents``.  Nothing
        is yielded when the row has no details id.
    """
    # The details id is the second word-token of the row's onclick handler.
    onclick = flight.select('@onclick').extract()
    tokens = re.findall(r'\w+', onclick[0]) if onclick else []
    if len(tokens) < 2:
        # Without an id there is no details page to follow; skip the row
        # rather than raise IndexError and abort the caller's iteration.
        return

    loader = TimetableLoader(item=TimetableItem(), selector=flight)
    loader.add_xpath('flight', 'td[1]//text()')
    loader.add_xpath('datetime_scheduled', 'td[3]//text()')
    loader.add_xpath('datetime_actual', 'td[4]//text()')
    loader.add_xpath('flight_status', 'td[6]//text()')
    loader.add_value('airport', u'DME')
    loader.add_value('flight_type', flight_type)
    loader.add_value('terminal', u'')
    item = loader.load_item()

    url = 'http://www.domodedovo.ru/ru/main/airindicator/detailsnew2.asp?id=%s' % tokens[1]
    # Pass the bound method directly: a lambda wrapper adds nothing and
    # cannot be serialized by Scrapy's persistent request queues.
    yield Request(url, callback=self.parse_url_contents, meta={'item': item})
def parse_main_contents(self, flight, response, flight_type):
    """Turn one Domodedovo (DME) board row into a details-page request.

    The row's cells 1, 3, 4 and 6 fill the flight number, scheduled time,
    actual time and status; ``airport`` is set to ``DME``, ``terminal`` to
    an empty string and ``flight_type`` to the given value.

    Args:
        flight: selector for the table row.
        response: not used by this method; retained for compatibility.
        flight_type: stored as-is in the item.

    Yields:
        A single ``Request`` whose ``meta['item']`` holds the item built so
        far and whose callback is ``parse_url_contents``.  Rows without a
        details id in their ``onclick`` attribute yield nothing.
    """
    handler = flight.select('@onclick').extract()
    if not handler:
        # No onclick attribute, so no details page to request.
        return
    words = re.findall(r'\w+', handler[0])
    if len(words) < 2:
        # The id is expected as the second word-token of the handler.
        return
    details = words[1]

    loader = TimetableLoader(item=TimetableItem(), selector=flight)
    for field, xpath in (('flight', 'td[1]//text()'),
                         ('datetime_scheduled', 'td[3]//text()'),
                         ('datetime_actual', 'td[4]//text()'),
                         ('flight_status', 'td[6]//text()')):
        loader.add_xpath(field, xpath)
    loader.add_value('airport', u'DME')
    loader.add_value('flight_type', flight_type)
    loader.add_value('terminal', u'')
    item = loader.load_item()

    url = 'http://www.domodedovo.ru/ru/main/airindicator/detailsnew2.asp?id=%s' % details
    # A bound spider method replaces the former lambda wrapper, which
    # Scrapy cannot serialize when requests are persisted to disk.
    request = Request(url, callback=self.parse_url_contents)
    request.meta['item'] = item
    yield request
def parse(self, response):
    """Scrape a Pulkovo (LED) flight board page into timetable items.

    The board is the ``tablo tabloBigNew bigTableZebra`` table.  Rows 1 and
    8 are excluded by the XPath and only every other remaining row is used.
    Cell 2 holds the remote endpoint as ``City (Airport)``; the local
    endpoint is always St. Petersburg / Pulkovo.  The terminal is taken from
    the last character of the request URL.

    A row with no text in cell 2 is skipped; previously it raised
    ``IndexError`` and dropped every later row on the page.
    """
    hxs = HtmlXPathSelector(response)
    # flight_type: 0 - arrival; 1 - departure
    flight_type = 0 if response.request.url in self.start_urls[:2] else 1

    # Same for every row, so compute it once.  Only byte strings need
    # decoding; a text-type URL is used as-is.
    terminal = response.request.url[-1:]
    if isinstance(terminal, bytes):
        terminal = terminal.decode('utf-8')

    flights = hxs.select(
        '//table[@class="tablo tabloBigNew bigTableZebra"]/tr[position() != 1 and position() != 8]'
    )
    for flight in flights[::2]:
        texts = flight.select('td[2]//text()').extract()
        if not texts:
            continue
        pieces = re.findall(r'[^\(\)]+', texts[0], re.U)
        if not pieces:
            # Cell contained nothing but parentheses/empty text.
            continue
        if len(pieces) == 2:
            city, airport = pieces
        else:
            city, airport = pieces[0], u''

        loader = TimetableLoader(item=TimetableItem(), selector=flight)
        loader.add_xpath('flight', 'td[1]//text()')
        loader.add_xpath('datetime_scheduled', 'td[3]//text()')
        loader.add_xpath('datetime_actual', 'td[4]//text()')
        loader.add_xpath('flight_status', 'td[5]//text()')
        loader.add_xpath('airline', 'td[6]//text()')
        if flight_type:
            # Departure: the scraped city is where the flight lands.
            loader.add_value('city_of_arrival', city)
            loader.add_value('airport_of_arrival', airport)
            loader.add_value('city_of_departure', u'Санкт-Петербург')
            loader.add_value('airport_of_departure', u'Пулково')
        else:
            # Arrival: the scraped city is where the flight came from.
            loader.add_value('city_of_departure', city)
            loader.add_value('airport_of_departure', airport)
            loader.add_value('city_of_arrival', u'Санкт-Петербург')
            loader.add_value('airport_of_arrival', u'Пулково')
        loader.add_value('terminal', terminal)
        loader.add_value('airport', u'LED')
        yield loader.load_item()
def parse_main_contents(self, flight, response):
    """Build a Sheremetyevo (SVO) item from a board row and request details.

    The row's ``class`` attribute decides the flight type: rows whose class
    list contains ``sA`` are arrivals (0), all others departures (1).  Cell
    4 holds the remote city; the local end is always Moscow / Sheremetyevo.
    Month and day of every parsed datetime are replaced with today's.

    Args:
        flight: selector for one table row.
        response: unused; kept so existing callers keep working.

    Yields:
        A ``Request`` for the flight's details page (link in cell 2) with
        the partially filled item in ``meta['item']``, handled by
        ``parse_url_contents``.  Rows without a class attribute or without
        a details link yield nothing.
    """
    row_classes = flight.select('@class').extract()
    detail_links = flight.select('td[2]//a/@href').extract()
    if not row_classes or not detail_links:
        # Cannot classify or follow this row; skip it instead of raising
        # IndexError and aborting the caller's iteration.
        return

    # flight_type: 0 - arrival; 1 - departure
    flight_type = 0 if 'sA' in row_classes[0].split() else 1
    # A departure leaves Moscow, so the scraped city is its arrival end;
    # an arrival lands in Moscow, so the scraped city is its departure end.
    # (The previous code had these two ends swapped.)
    if flight_type:
        remote, local = 'arrival', 'departure'
    else:
        remote, local = 'departure', 'arrival'

    loader = TimetableLoader(item=TimetableItem(), selector=flight)
    loader.add_xpath('flight', 'td[2]//text()')
    loader.add_xpath('airline', 'td[3]//@alt')
    loader.add_xpath('city_of_%s' % remote, 'td[4]//text()')
    loader.add_xpath('flight_status', 'td[5]//text()')
    loader.add_xpath('datetime_scheduled', 'td[7]//text()')
    loader.add_xpath('datetime_estimated', 'td[8]//text()')
    loader.add_xpath('datetime_actual', 'td[9]//text()')
    loader.add_xpath('terminal', 'td[10]//text()')
    loader.add_value('airport', u'SVO')
    loader.add_value('city_of_%s' % local, u'Москва')
    loader.add_value('airport_of_%s' % local, u'Шереметьево')
    item = loader.load_item()

    # Stamp today's month and day onto every datetime that was parsed.
    # A field is absent from the item when its cell was empty.
    today = datetime.now()
    for field in ('datetime_scheduled', 'datetime_estimated',
                  'datetime_actual'):
        if item.get(field):
            item[field] = item[field].replace(month=today.month,
                                              day=today.day)
    item['flight_type'] = flight_type

    url = 'http://svo.aero%s' % detail_links[0]
    # Bound method instead of a lambda wrapper, which Scrapy cannot
    # serialize for persistent request queues.
    request = Request(url, callback=self.parse_url_contents)
    request.meta['item'] = item
    yield request