def __parse_time(self, item):
    """Build a normalized working-hours string from one table row.

    Opening values are read from ``td:eq(2)``, closing values from
    ``td:eq(3)``, break values from ``td:eq(4)`` and days off from
    ``td:eq(5)``.  The assembled text is passed through ``normalize_time``.
    """
    def column_texts(index):
        # Prefer the per-paragraph values; fall back to the whole cell.
        selector = 'td:eq(%d)' % index
        nodes = item(selector + ' p') or item(selector)
        return [normalize_text(PQ(node).text()) for node in nodes]

    starts = column_texts(2)
    ends = column_texts(3)
    breaks = column_texts(4)
    weekends = normalize_text(item('td:eq(5)').text())

    no_break = u'без обеда'
    chunks = []
    if len(starts) == 1:
        # A single opening value is shared by every closing value.
        _, opening = self.__split_day_time(starts[0])
        first_break = breaks[0]
        if first_break == no_break:
            lunch = None
        else:
            _, lunch = self.__split_day_time(first_break)
        for closing_raw in ends:
            end_day, closing = self.__split_day_time(closing_raw)
            if end_day:
                chunks.append(u'%s: %s-%s, ' % (end_day, opening, closing))
            else:
                chunks.append(u'%s-%s, ' % (opening, closing))
        if lunch:
            chunks.append(u'обед: %s, ' % lunch)
    else:
        # Several opening values: pair each with closings/breaks of its day.
        for opening_raw in starts:
            start_day, opening = self.__split_day_time(opening_raw)
            for closing_raw in ends:
                end_day, closing = self.__split_day_time(closing_raw)
                if end_day and start_day != end_day:
                    continue
                chunks.append(u'%s: %s-%s, ' % (start_day, opening, closing))
            for break_raw in breaks:
                if break_raw == no_break:
                    continue
                break_day, lunch = self.__split_day_time(break_raw)
                if lunch and start_day == break_day:
                    chunks.append(u'обед: %s, ' % lunch)

    if weekends != u'без выходных':
        chunks.append(u'%s: выходной' % weekends)
    return normalize_time(u''.join(chunks))
def __parse_base_atm_terminal(self, row, point_type, coordinates, deposit=False):
    """Create a point of ``point_type`` from an indexable ``row``.

    ``row[1]`` is the number used for the name, ``row[2]``/``row[3]`` form
    the address, ``row[4]`` is the place, ``row[5]`` the currency text and
    ``row[6]`` the working time.  When ``deposit`` is true the ``deposit``
    flag is also filled in.  Coordinates are looked up by address in
    ``coordinates``.
    """
    point = Point()
    point.prov = self.uid
    point.type = point_type
    point.name = normalize_text(u'№' + str(int(row[1])))

    # Rows that already mention a district keep their locality text as is.
    city = row[2] if u'р-н' in row[2] else u'г. %s' % row[2]
    point.address = normalize_address(u'%s, %s' % (city, row[3]))
    point.place = normalize_text(row[4])

    cashless_only = u'только безнал.платежи' in row[5]
    point.currency = [] if cashless_only else map(strip, row[5].split(','))
    if deposit:
        point.deposit = not cashless_only

    point.time = normalize_time(row[6])
    point.check_information = CHECK_OFFICIAL
    point.lat, point.lng = self.__get_point_coordinate(point.address, coordinates)
    if not (point.lat and point.lng):
        warning_not_official_coordinates(point)
    else:
        point.check_coordinates = CHECK_OFFICIAL
    return point
def __parse_terminal(self, item):
    """Create a terminal point from a table row and try to locate it.

    Coordinates are taken from the first entry of ``__get_coordinates()``
    whose type id is ``'2'`` and whose description tokens all occur in the
    point address; only addresses containing u'Минск' are matched.
    """
    point = Point()
    point.prov = self.uid
    point.type = TYPE_TERMINAL

    city = normalize_text(item('td:eq(0)').text())
    street = normalize_text(item('td:eq(2)').text())
    full_address = u'г. %s, %s' % (city.title(), street)
    point.address, point.place = split_address_place(full_address)
    # The place split out of the address is replaced by the dedicated cell.
    point.place = normalize_text(item('td:eq(1)').text())
    point.time = normalize_time(item('td:eq(3)').text())
    point.check_information = CHECK_OFFICIAL

    located = False
    for lat, lng, type_id, description in self.__get_coordinates():
        if (u'Минск' in point.address and type_id == '2' and
                all(token in point.address for token in description.split())):
            point.lat = lat
            point.lng = lng
            point.check_coordinates = CHECK_OFFICIAL
            located = True
            break
    if not located:
        warning_not_official_coordinates(point)
    return point
def __parse_base_office_exchange(self, item, point_type, name_keywords):
    """Create an office/exchange point from a table row.

    Returns ``None`` when the point name does not start with
    ``name_keywords``.  Coordinates come from the first entry of
    ``__get_coordinates()`` with type id ``'1'`` whose description tokens
    each occur in the address or the name; only addresses containing
    u'Минск' are matched.
    """
    point = Point()
    point.prov = self.uid
    point.type = point_type
    point.name = normalize_text(item('th:eq(0) a:eq(0)').text())
    if not point.name.startswith(name_keywords):
        return None

    city = normalize_text(item('td:eq(1)').text())
    street = normalize_text(item('td:eq(2)').text())
    full_address = u'г. %s, %s' % (city, street)
    point.address, point.place = split_address_place(full_address)
    point.check_information = CHECK_OFFICIAL

    located = False
    for lat, lng, type_id, description in self.__get_coordinates():
        if (u'Минск' in point.address and type_id == '1' and
                all(token in point.address or token in point.name
                    for token in description.split())):
            point.lat = lat
            point.lng = lng
            point.check_coordinates = CHECK_OFFICIAL
            located = True
            break
    if not located:
        warning_not_official_coordinates(point)
    return point
def __get_offices(self, url, city_name=''):
    """Download ``url`` and return the office points listed in its table.

    Rows whose first matched ``td`` has ``colspan == '3'`` are skipped.
    A row with an empty time cell reuses the time of the previous row.
    """
    points = []
    page = PQ(get_url(url).decode('utf8'))
    last_time = None
    for row in page('#oo__content_value table tr:gt(0)'):
        item = PQ(row)
        if item('td').attr('colspan') == '3':
            continue

        point = Point()
        point.prov = self.uid
        point.type = TYPE_OFFICE
        point.name = normalize_text(item('td:eq(0)').text())
        point.address = normalize_address(city_name + item('td:eq(1) p:eq(0)').text())

        # The place is taken from the third paragraph, else from the second.
        place = item('td:eq(1) p:eq(2)').text() or item('td:eq(1) p:eq(1)').text()
        if place:
            point.place = normalize_text(place)

        last_time = item('td:eq(2)').text() or last_time
        point.time = normalize_time(last_time)
        point.check_information = CHECK_OFFICIAL

        if point.address not in self.__addresses:
            warning_not_official_coordinates(point)
        else:
            point.lat, point.lng = self.__addresses[point.address]
            point.check_coordinates = CHECK_OFFICIAL
        points.append(point)
    return points
def __parse_terminal(self, item):
    """Create a terminal point from a row: city, address, place, time."""
    def cell(index):
        return item('td:eq(%d)' % index).text()

    point = Point()
    point.prov = self.uid
    point.type = TYPE_TERMINAL

    city_name = normalize_text(cell(0)).title()
    point.address = normalize_address(u'г. %s, %s' % (city_name, cell(1)))
    point.place = normalize_text(cell(2))
    point.time = normalize_time(cell(3))
    point.check_information = CHECK_OFFICIAL
    # No coordinates are available for these rows.
    warning_not_official_coordinates(point)
    return point
def __parse_atm(self, item):
    """Create an ATM point from a row: city, address, place, currency, time."""
    def cell(index):
        return item('td:eq(%d)' % index).text()

    point = Point()
    point.prov = self.uid
    point.type = TYPE_ATM

    city_name = normalize_text(cell(0)).title()
    point.address = normalize_address(u'г. %s, %s' % (city_name, cell(1)))
    point.place = normalize_text(cell(2))
    # The currency cell is a comma-separated list.
    point.currency = map(strip, cell(3).split(','))
    point.time = normalize_time(cell(4))
    point.check_information = CHECK_OFFICIAL
    # No coordinates are available for these rows.
    warning_not_official_coordinates(point)
    return point
def __parse_terminal(self, item):
    """Create a terminal point from a row: name, address, time, deposit flag.

    The ``deposit`` attribute is true when the fourth cell, lower-cased,
    equals u'есть'.
    """
    point = Point()
    point.prov = self.uid
    point.type = TYPE_TERMINAL
    point.name = normalize_text(item('td:eq(0)').text())

    raw_address = item('td:eq(1)').text()
    point.address, point.place = split_address_place(raw_address)
    # The place split out of the address is replaced by the point name.
    point.place = point.name

    point.time = normalize_time(item('td:eq(2)').text())
    deposit_text = normalize_text(item('td:eq(3)').text()).lower()
    point.deposit = deposit_text == u'есть'
    point.check_information = CHECK_OFFICIAL
    warning_not_official_coordinates(point)
    return point
def __get_coordinates(self):
    """Return ``(lat, lng, address, place)`` tuples parsed from the markers XML.

    The list is stored on the instance and only downloaded while it is
    still empty.
    """
    if self.__coordinates:
        return self.__coordinates

    root = ET.fromstring(get_url(self.__markers_url))
    for marker in root.iter('marker'):
        attrs = marker.attrib
        lat = normalize_text(attrs['lat'])
        lng = normalize_text(attrs['lng'])
        address = normalize_address(attrs['address'])
        # Apply the configured textual replacements to the address in order.
        for old, new in self.__address_replaces:
            address = address.replace(old, new)
        place = normalize_text(attrs['place'])
        self.__coordinates.append((lat, lng, address, place))
    return self.__coordinates
def __parse_base(self, item, city, point_type):
    """Create a point of ``point_type`` from a map side-panel element.

    Name, address and (optional) ``lat,lng`` text are read from the
    ``.b-map-side`` block.  The ``.b-map-side-more`` block is split on
    ``<br>``: a line starting with u'Телефон:' supplies the phones, every
    other non-empty line is collected into the working time.

    :param item: PyQuery-like callable wrapping the element.
    :param city: city name inserted into the address.
    :param point_type: value stored in ``point.type``.
    :return: the populated point.
    """
    phone_prefix = u'Телефон:'

    point = Point()
    point.prov = self.uid
    point.type = point_type
    point.name = normalize_text(item('.b-map-side>h5').text())
    point.address, point.place = split_address_place(
        u'г. %s, %s' % (city, item('.b-map-side>p span:eq(0)').text()))

    coordinates = item('.b-map-side>p span:eq(1)').text()
    if coordinates:
        point.lat, point.lng = map(strip, coordinates.split(','))

    text_html = replace_br(item('.b-map-side-more').html(), ';;;')
    time_items = []
    for sub_item in map(normalize_text, PQ(text_html).text().split(';;;')):
        if not sub_item:
            continue
        if sub_item.startswith(phone_prefix):
            # Slice off the whole prefix; indexing with the prefix length
            # would yield only a single character of the line.
            point.phones = normalize_phones(sub_item[len(phone_prefix):].split(','))
            continue
        time_items.append(sub_item)
    point.time = normalize_time(', '.join(time_items))

    point.check_information = CHECK_OFFICIAL
    if point.lat and point.lng:
        point.check_coordinates = CHECK_OFFICIAL
    else:
        warning_not_official_coordinates(point)
    return point
def __parse_exchange(self, item):
    """Create an exchange point from a row, or return ``None``.

    Only rows whose second cell starts with one of the configured exchange
    keywords are parsed.
    """
    label = normalize_text(item('td:eq(1)').text())
    if label.startswith(self.__exchange_keywords):
        point = self.__parse_base_office_exchange(item)
        point.type = TYPE_EXCHANGE
        warning_not_official_coordinates(point)
        return point
    return None
def __parse_office(self, item, city_name):
    """Create an office point and fill its time from the u'Кассы' row.

    Returns ``None`` when the content table has no row whose first cell is
    u'Кассы'.
    """
    point = self.__parse_base(item, city_name, TYPE_OFFICE)
    for row in item('.content_table table tbody tr'):
        sub_item = PQ(row)
        if normalize_text(sub_item('td:eq(0)').text()) != u'Кассы':
            continue

        hours_html = replace_br(sub_item('td:eq(1)').html(), ',')
        parts = [normalize_text(PQ(hours_html).text())]
        break_time = normalize_text(sub_item('td:eq(2)').text())
        if break_time:
            parts.append(u'перерыв: ' + break_time)
        day_off = normalize_text(sub_item('td:eq(3)').text())
        if day_off:
            parts.append(u'выходной: ' + day_off)
        point.time = normalize_time(', '.join(parts))
        # Only the first matching row is used.
        return point
    return None
def __parse_base(self, item, city_name, point_type):
    """Create a point of ``point_type`` from an office block.

    The phone element and then the first cell are removed from ``item``
    while being read.  Coordinates are cut out of the first script line
    starting with ``BX_GMapAddPlacemark(``.
    """
    def quoted_value(line, token):
        # Text between the end of ``token`` and the next single quote.
        start = line.find(token) + len(token)
        return line[start:line.find("'", start)]

    first_cell = '.content_table table tbody tr:eq(0) td:eq(0)'

    point = Point()
    point.prov = self.uid
    point.type = point_type

    phone_text = item(first_cell + ' .office_phone').remove().text()
    point.phones = [normalize_phone(phone_text)]

    # After <br> replacement the cell reads "name,address".
    name_address_html = replace_br(item(first_cell).remove().html(), ',')
    name, address = PQ(name_address_html).text().split(',', 1)
    point.name = normalize_text(name)
    point.address, point.place = self.__get_address(city_name, address)
    point.check_information = CHECK_OFFICIAL

    script_text = item('.ya_map script:eq(1)').text()
    located = False
    for raw_line in script_text.splitlines():
        line = strip(raw_line)
        if not line.startswith('BX_GMapAddPlacemark('):
            continue
        point.lat = quoted_value(line, "'LAT':'")
        point.lng = quoted_value(line, "'LON':'")
        point.check_coordinates = CHECK_OFFICIAL
        located = True
        break
    if not located:
        warning_not_official_coordinates(point)
    return point
def __parse_base_office_exchange(self, item, map_points, point_type, start_names):
    """Create an office/exchange point and match it against ``map_points``.

    Returns ``None`` when the name does not start with ``start_names``.
    ``map_points`` yields ``(lng, lat, name, address, place)`` tuples; the
    first one whose address contains the point address (and whose name and
    place are compatible) supplies the coordinates.
    """
    def compatible(needle, haystack):
        # A value missing on either side does not veto the match.
        if needle and haystack:
            return needle in haystack
        return True

    point = Point()
    point.prov = self.uid
    point.type = point_type
    point.name = normalize_text(item('.name').text())
    if not point.name.startswith(start_names):
        return None

    point.address, point.place = split_address_place(item('.addres strong').text())
    schedule_row = item('.item_block tr:last')
    point.phones = normalize_phones(schedule_row('td:eq(0)').text().split(','))

    day_columns = ((u'пн-чт: ', 2), (u'пт: ', 3), (u'сб: ', 4), (u'вс: ', 5))
    schedule = [label + schedule_row('td:eq(%d)' % column).text()
                for label, column in day_columns]
    point.time = normalize_time(', '.join(schedule))
    point.check_information = CHECK_OFFICIAL

    located = False
    for lng, lat, name, address, place in map_points:
        address_matches = point.address and address and point.address in address
        if (compatible(point.name, name) and address_matches and
                compatible(point.place, place)):
            point.lat = lat
            point.lng = lng
            point.check_coordinates = CHECK_OFFICIAL
            located = True
            break
    if not located:
        warning_not_official_coordinates(point)
    return point
def get_offices(self):
    """Return office points from the offices XML and the regional page.

    First every ``item`` element of the XML is parsed with
    ``__parse_office``.  Then the regional offices page is walked: an
    ``h2`` starts a new point (its text becomes the name) and the next
    accepted element supplies the address, place and phones.

    :return: list of points.
    """
    points = []
    items_tree = ET.fromstring(get_url(self.__offices_xml_url))
    for item in items_tree.iter('item'):
        point = self.__parse_office(item)
        if point:
            points.append(point)

    page = PQ(get_url(self.__regional_offices_page_url))
    phone_prefix = u'т.ф.:'
    point = None
    for item in map(PQ, page('#content_internal span:eq(0)').children()):
        tag = item[0].tag
        if tag not in self.__regional_offices_tags:
            continue
        if tag == 'h2':
            point = Point()
            point.prov = self.uid
            point.type = TYPE_OFFICE
            point.name = trim_spaces_and_commas(normalize_text(item.text()))
            point.check_information = CHECK_OFFICIAL
            continue
        if not point:
            # Details without a preceding heading cannot be attributed.
            continue

        item_html = replace_br(item.html(), ';;;')
        sub_items = PQ(item_html).text().split(';;;')
        point.address, point.place = split_address_place(sub_items[0])
        for sub_item in map(normalize_text, sub_items[1:]):
            if sub_item.startswith(phone_prefix):
                # Store on ``phones`` (not ``phone``), the attribute used
                # by the other parsers for normalized phone lists.
                point.phones = normalize_phones(sub_item[len(phone_prefix):].split(','))
        warning_not_official_coordinates(point)
        points.append(point)
        # Each heading describes exactly one point.
        point = None
    return points
def __get_cities_ids(self):
    """Return ``(city_id, city_name)`` pairs from the cities page.

    The list is stored on the instance and only downloaded while empty.
    """
    if self.__cities_ids:
        return self.__cities_ids

    page = PQ(get_url(self.__cities_url))
    for node in page('.b-cities-list a *'):
        item = PQ(node)
        # Element ids look like "city_<id>"; keep only the id part.
        city_id = item.attr('id').replace('city_', '')
        city_name = normalize_text(item.text())
        self.__cities_ids.append((city_id, city_name))
    return self.__cities_ids
def __get_cities(self):
    """Return ``(url, city_name)`` pairs from the city chooser links.

    The list is stored on the instance and only downloaded while empty.
    """
    if self.__cities:
        return self.__cities

    page = PQ(get_url(self.__cities_url))
    for node in page('#chooseCity .oblast a'):
        link = PQ(node)
        city_url = self.__cities_url + link.attr('href')
        city_name = normalize_text(link.text())
        self.__cities.append((city_url, city_name))
    return self.__cities
def __parse_base_office_exchange(self, item):
    """Create a point from a row: name, address, time and phones.

    The point type is not set here.
    """
    def cell(index):
        return item('td:eq(%d)' % index).text()

    point = Point()
    point.prov = self.uid
    point.name = normalize_text(cell(1))
    point.address, point.place = split_address_place(cell(2))
    point.time = normalize_time(cell(3))
    # The phones cell is a comma-separated list.
    point.phones = normalize_phones(cell(4).split(','))
    point.check_information = CHECK_OFFICIAL
    return point
def __get_offices_urls(self):
    """Return ``(url, link_text)`` pairs for the left-menu links.

    Links whose normalized text equals u'Банкоматы' are left out.
    """
    page = PQ(get_url(self.__cities_url))
    links = page('#menuLeft>ul>li>span>a') + page('#menuLeft>ul>li>a')

    urls = []
    for node in links:
        link = PQ(node)
        label = link.text()
        if normalize_text(label) == u'Банкоматы':
            continue
        urls.append((self.site + link.attr('href'), label))
    return urls
def __parse_exchange(self, item):
    """Create an exchange point from text shaped like "name — address"."""
    point = Point()
    point.prov = self.uid
    point.type = TYPE_EXCHANGE

    pieces = item.text().split(u'—')
    raw_name = pieces[0]
    raw_address = pieces[1]
    point.name = normalize_text(raw_name)
    point.address, point.place = split_address_place(raw_address)
    point.check_information = CHECK_OFFICIAL
    warning_not_official_coordinates(point)
    return point
def __parse_office(self, item, city):
    """Create an office point from a row with embedded coordinates.

    Name and address come from the ``.pointShowMaps`` spans, time and
    phones from the first two cells (``<br>`` replaced by ', '), and
    coordinates from ``.item_coords``.
    """
    def br_joined_text(selector):
        return PQ(replace_br(item(selector).html(), ', ')).text()

    point = Point()
    point.prov = self.uid
    point.type = TYPE_OFFICE
    point.name = normalize_text(item('th .pointShowMaps span:eq(0)').text())

    street = item('th .pointShowMaps span:eq(1)').text()
    point.address, point.place = split_address_place(u'г. %s, %s' % (city, street))
    point.time = normalize_time(br_joined_text('td:eq(0)'))
    point.phones = normalize_phones(br_joined_text('td:eq(1)').split(','))
    point.lat = normalize_text(item('th .item_coords .coord1').text())
    point.lng = normalize_text(item('th .item_coords .coord2').text())
    point.check_information = CHECK_OFFICIAL

    if not (point.lat and point.lng):
        warning_not_official_coordinates(point)
    else:
        point.check_coordinates = CHECK_OFFICIAL
    return point
def __get_cities_info(self, url):
    """Return ``(city_id, city_name)`` pairs from the select box at ``url``.

    Results are stored per URL on the instance and reused on later calls.
    """
    if url in self.__cities:
        return self.__cities[url]

    page = PQ(get_url(url))
    found = []
    for node in page('.content .selectForDropDownMenu select option'):
        option = PQ(node)
        city_id = option.attr('value')
        city_name = normalize_text(option.text())
        found.append((city_id, city_name))
    self.__cities[url] = found
    return self.__cities[url]
def __parse_base(self, item):
    """Create a point from a list row and download its details page.

    Returns a ``(point, more)`` pair where ``more`` is the parsed page
    linked from the first cell.  Time and phones are reset to empty values.
    """
    point = Point()
    point.prov = self.uid
    point.name = normalize_text(item("td:eq(0)").text())
    point.address, point.place = self.__parse_address(item)

    details_url = self.site + item("td:eq(0) a").attr("href")
    details_page = PQ(get_url(details_url))

    point.time = None
    point.phones = []
    return point, details_page
def get_terminals(self):
    """Download the terminals page and return the parsed terminal points.

    Rows whose first cell has ``colspan == '4'`` and rows without text are
    skipped.
    """
    page = PQ(get_url(self.__parse_data_terminals_url))
    points = []
    for row in page('.tbl tr:gt(0)'):
        item = PQ(row)
        if item('td:eq(0)').attr('colspan') == '4':
            continue
        if not normalize_text(item.text()):
            continue
        terminal = self.__parse_terminal(item)
        if terminal:
            points.append(terminal)
    return points
def __parse_office(self, item):
    """Create an office point, filling time and phones from its details page.

    Sections of the details page are mapped through
    ``__office_sections_types``; "time" sections set the working time and
    "phone" sections set the phones.
    """
    point, more = self.__parse_base(item)
    point.type = TYPE_OFFICE
    point.check_information = CHECK_OFFICIAL

    for node in more(".content .section"):
        section = PQ(node)
        section_name = normalize_text(section(".name").text())
        if section_name not in self.__office_sections_types:
            continue
        section_value = section(".text")
        section_kind = self.__office_sections_types[section_name]

        if section_kind == "time":
            time_html = replace_br(section_value.html(), ",")
            # Join the text of every table cell of the section.
            time_text = ", ".join(PQ(cell).text() for cell in PQ(time_html)("td"))
            point.time = normalize_time(time_text)
        elif section_kind == "phone":
            phones_html = replace_br(section_value.html(), ",")
            phones_text = normalize_text(PQ(phones_html).text())
            point.phones = filter_empty(map(normalize_phone, phones_text.split(",")))

    warning_not_official_coordinates(point)
    return point
def __parse_atm(self, item):
    """Create an ATM point from a row: name, address and time."""
    point = Point()
    point.prov = self.uid
    point.type = TYPE_ATM
    point.name = normalize_text(item('td:eq(0)').text())

    raw_address = item('td:eq(1)').text()
    point.address, point.place = split_address_place(raw_address)
    # The place split out of the address is replaced by the point name.
    point.place = point.name

    point.time = normalize_time(item('td:eq(2)').text())
    point.check_information = CHECK_OFFICIAL
    warning_not_official_coordinates(point)
    return point
def get_exchanges(self):
    """Download the exchanges page and return the parsed exchange points.

    Addresses starting with u'Минск' or u'Витебск' get the u'г. ' prefix
    before normalization.  Coordinates are looked up by normalized address.
    """
    # Address prefixes that need the city marker prepended.
    add_city_literal = (u'Минск', u'Витебск')

    page = PQ(get_url(self.__parse_list_exchange_url).decode('utf8'))
    points = []
    for row in page('#oo__content_value table tr:gt(0)'):
        item = PQ(row)
        point = Point()
        point.prov = self.uid
        point.type = TYPE_EXCHANGE

        address = normalize_text(item('td:eq(0)').text())
        if address.startswith(add_city_literal):
            address = u'г. ' + address
        point.address = normalize_address(address)
        point.place = normalize_text(item('td:eq(1)').text())
        point.time = normalize_time(item('td:eq(2)').text())
        point.check_information = CHECK_OFFICIAL

        if point.address not in self.__addresses:
            warning_not_official_coordinates(point)
        else:
            point.lat, point.lng = self.__addresses[point.address]
            point.check_coordinates = CHECK_OFFICIAL
        points.append(point)
    return points
def __parse_base_office_exchange(self, item, point_type):
    """Create a point of ``point_type`` from a branch block.

    The address is the first paragraph with the u'Почтовый адрес:' label
    (plus one following character) cut off.  In the second paragraph,
    values after u'телефон' are collected as phones until u'факс' appears.
    """
    point = Point()
    point.prov = self.uid
    point.type = point_type
    point.name = normalize_text(item('h2').text())

    label_length = len(u'Почтовый адрес:') + 1
    address_text = item('.itemFilialIn>p:eq(0)').text()
    point.address = normalize_address(address_text[label_length:])

    contacts_text = item('.itemFilialIn>p:eq(1)').text() or ''
    collecting_phones = False
    for raw_token in self.__phone_splitter.split(contacts_text):
        token = normalize_text(raw_token).lower()
        if token == u'телефон':
            collecting_phones = True
        elif token == u'факс':
            collecting_phones = False
        elif collecting_phones:
            point.phones.append(normalize_phone(token))

    work_time = ', '.join(PQ(node).text() for node in item('.workTime p'))
    point.time = normalize_time(work_time)
    point.check_information = CHECK_OFFICIAL
    warning_not_official_coordinates(point)
    return point
def __parse_office_exchange(self, item, city, coordinates, point_type, point_keywords):
    """Create an office/exchange point from a row, or return ``None``.

    ``None`` is returned when the name does not start with
    ``point_keywords``.  The address cell is split at its first ``<br>``
    into address and place.  ``ul``/``li`` elements are removed from the
    time cell before its text is read.
    """
    point = Point()
    point.prov = self.uid
    point.type = point_type
    point.name = normalize_text(item('th:eq(0)').text())
    if not point.name.startswith(point_keywords):
        return None

    separator = ';;;'
    address_html = replace_br(item('td:eq(0)').html(), separator)
    address_parts = PQ(address_html).text().split(separator, 1)
    point.address = normalize_address(u'%s, %s' % (city, address_parts[0]))
    if len(address_parts) > 1:
        point.place = normalize_text(address_parts[1])

    item('td:eq(1) ul, td:eq(1) li').remove()
    point.time = normalize_time(item('td:eq(1)').text())

    # Prefer one phone per paragraph; fall back to the whole cell.
    phone_nodes = item('td:eq(2) p') or item('td:eq(2)')
    point.phones = normalize_phones([PQ(node).text() for node in phone_nodes])
    point.check_information = CHECK_OFFICIAL

    point.lat, point.lng = self.__get_point_coordinate(point, coordinates)
    if not (point.lat and point.lng):
        warning_not_official_coordinates(point)
    else:
        point.check_coordinates = CHECK_OFFICIAL
    return point
def __get_offices_or_exchanges(self, parse_item, coordinates):
    """Walk the offices/exchanges page and collect the parsed points.

    An ``h2`` element sets the current city; for any other element each
    table row after the first is passed to
    ``parse_item(item, city, coordinates)`` and truthy results are kept.
    """
    page = PQ(get_url(self.__parse_data_offices_exchanges_url))
    points = []
    current_city = None
    for node in page('.b-ugc>*:gt(0)'):
        block = PQ(node)
        if block[0].tag == 'h2':
            current_city = normalize_text(block.text())
        else:
            for row in block('tr:gt(0)'):
                parsed = parse_item(PQ(row), current_city, coordinates)
                if parsed:
                    points.append(parsed)
    return points