def from_string(cls, url):
    """
    Build an instance from an OLX-style listing URL.

    Splits the URL path into transaction / category / city / street
    segments and parses the ``ps``-delimited query parameters into a
    filters dict.

    :param url: full listing URL
    :type url: str
    :return: new instance created from the parsed URL parts
    """
    parsed_url = urlparse(url)
    # Drop the leading empty segment and the trailing page/file segment.
    path_parts = parsed_url.path.split('/')[1:-1]
    transaction, category, city, street = None, 'nieruchomosci', None, None
    # NOTE: the original looped with enumerate() but never used the index.
    for path_part in path_parts:
        if path_part == 'nieruchomosci':
            pass
        elif path_part in POSSIBLE_TRANSACTIONS:
            transaction = path_part
        elif path_part in POSSIBLE_CATEGORIES:
            category = path_part
        elif not city:
            city = path_part
        else:
            street = path_part
    filters = {}
    # Query params arrive as "...ps%5Bkey%5D=value&..."; strip the
    # URL-encoded brackets and trailing ampersand before splitting on '='.
    query_params = parsed_url.query.split('ps')[1:]
    for query_param in query_params:
        query_param, value = replace_all(query_param, {
            '%5B': '[',
            '%5D': ']',
            '&': ''
        }).split('=')
        filters[query_param] = value
    return cls(category, city, street, transaction, filters)
def get_city_for_offer(item, *args, **kwargs):
    """
    Parse city information

    :param item:
    :param args:
    :param kwargs:
    :return: name of city
    :rtype: str
    """
    # The offer text is split on blank lines; the city is the fifth segment.
    segments = item.text.split('\n\n')
    city = replace_all(segments[4], {' ': ''})
    return city
def get_voivodeship_for_offer(item, *args, **kwargs):
    """
    Parse voivodeship information

    :param item:
    :param args:
    :param kwargs:
    :return: name of voivodeship
    :rtype: str
    """
    # The offer text is split on blank lines; the voivodeship is the
    # fourth segment, with all spaces removed.
    segments = item.text.split('\n\n')
    voivodeship = replace_all(segments[3], {' ': ''})
    return voivodeship
def encode_text_to_html(text):
    """
    Change text to lower cases, gets rid of polish characters replacing them
    with simplified version, replaces spaces with dashes

    :param text: text to encode
    :type text: str
    :return: encoded text which can be used in url
    :rtype: str
    """
    # Copy the mapping before extending it: the original code aliased the
    # module-level POLISH_CHARACTERS_MAPPING constant and mutated it with
    # update() on every call, permanently polluting the shared dict.
    replace_dict = dict(POLISH_CHARACTERS_MAPPING)
    replace_dict[' '] = '-'
    return replace_all(text.lower(), replace_dict)
def get_floor_for_offer(item, *args, **kwargs):
    """
    Parse floor information

    :param item:
    :param args:
    :param kwargs:
    :return: number of floor
    :rtype: int
    """
    if not item:
        return None
    # The floor value lives in the first <td> of the enclosing table row,
    # formatted like "3/5" (floor/total) or "parter" (ground floor).
    raw = item.find_parent('tr').find('td').text
    cleaned = replace_all(raw, {'\n': '', ' ': ''})
    level = cleaned.split('/')[0]
    if level == 'parter':
        # "parter" is the Polish ground floor -> 0
        return 0
    return int(level)
def get_offer_apartment_details(html_parser):
    """
    This method returns detailed information about the apartment.

    :param html_parser: a BeautifulSoup object
    :rtype: dict
    :return: A dictionary full of details.
    """
    raw_data = html_parser.find(class_="oferta")
    details_dict = {}
    # Sanitize scraped values: drop non-breaking spaces and the
    # "Negocjuj cenę" (negotiate price) button text; join lines with ", ".
    replace_dict = {"\xa0": "", "Negocjuj cenę": "", "\n": ", "}
    # Walk the sibling <div> sections. When find_next_sibling returns None,
    # the next attribute access raises AttributeError, which ends the loop.
    while True:
        try:
            if raw_data.find_all("li"):
                # List-style section: each <li> holds a label <span> and a
                # value <div>.
                item_list = raw_data.find_all("li")
                for detail in item_list:
                    details_dict[detail.span.contents[0]] = replace_all(
                        detail.div.text.strip("\n"), replace_dict)
            else:
                if raw_data.h4.contents[0] == "Opis dodatkowy":
                    # Skip the free-text "additional description" section.
                    raw_data = raw_data.find_next_sibling("div")
                    continue
                # Paragraph-style section: keyed by the section's <h4>;
                # subsequent paragraphs are concatenated onto the entry.
                item_list = raw_data.find_all("p")
                for detail in item_list:
                    if raw_data.h4.text not in details_dict:
                        details_dict[raw_data.h4.contents[0]] = replace_all(
                            detail.text.strip("\n"), replace_dict)
                    else:
                        details_dict[raw_data.h4.contents[0]] += replace_all(
                            detail.text.strip("\n"), replace_dict)
            raw_data = raw_data.find_next_sibling("div")
        except AttributeError:
            break
    # Normalize the "Wolne od" (available from) date to a timestamp.
    available_from_date = details_dict.get("Wolne od")
    if available_from_date:
        details_dict["Wolne od"] = parse_date_to_timestamp(available_from_date)
    return details_dict
def city_name(city):
    """
    Creates valid OLX url city name

    OLX city name can't include polish characters, upper case letters.
    It also should replace white spaces with dashes.

    :param city: City name not in OLX url format
    :type city: str
    :return: Valid OLX url city name
    :rtype: str

    :Example:

    >> city_name("Ruda Śląska")
    "ruda-slaska"
    """
    # The historical `sys.version_info < (3, 3)` branch that re-encoded the
    # result to UTF-8 bytes was dead code on every supported interpreter
    # (Python 3.3 shipped in 2012) and has been removed.
    return replace_all(city.lower(), POLISH_CHARACTERS_MAPPING).replace(" ", "-")
def test_replace_all(text, dic, expected_value):
    """Verify that utils.replace_all applies every substitution in ``dic``."""
    actual = utils.replace_all(text, dic)
    assert actual == expected_value
def get_offer_information(url, context=None):
    """
    Scrape detailed information about an OtoDom offer.

    :param url: a string containing a link to the offer
    :param context: a dictionary(string, string) taken straight from the
                    :meth:`scrape.category.get_category`
    :returns: A dictionary containing the scraped offer details
    """
    # getting response
    response = get_response_for_url(url)
    content = response.content
    html_parser = BeautifulSoup(content, "html.parser")
    # getting meta values
    if context:
        cookie = get_cookie_from(response)
        try:
            csrf_token = get_csrf_token(content)
            offer_id = context['offer_id']
        except AttributeError:
            # token element missing from the page; fall back to blanks
            csrf_token = ''
            offer_id = ''
        # getting offer details
        try:
            phone_numbers = get_offer_phone_numbers(offer_id, cookie, csrf_token)
        except KeyError:
            # offer was not present any more
            phone_numbers = []
        # Strip non-breaking spaces, spaces, dashes and the +48 country
        # prefix, then flatten dot-separated multi-number strings.
        phone_number_replace_dict = {u'\xa0': "", " ": "", "-": "", "+48": ""}
        phone_numbers = sum([
            replace_all(num, phone_number_replace_dict).split(".")
            for num in phone_numbers
        ], [])
    else:
        cookie = ""
        csrf_token = ""
        # NOTE(review): "" here vs a list in the context branch — callers
        # get inconsistent types for 'phone_numbers'; confirm before relying
        # on it being iterable-of-strings.
        phone_numbers = ""
        context = {}
    ninja_pv = get_offer_ninja_pv(content)
    result = {
        'title': get_offer_title(html_parser),
        'address': get_offer_address(html_parser),
        'poster_name': get_offer_poster_name(html_parser),
        'poster_type': ninja_pv.get("poster_type"),
        'price': ninja_pv.get("ad_price"),
        'currency': ninja_pv.get("price_currency"),
        'city': ninja_pv.get("city_name"),
        'district': ninja_pv.get("district_name", ""),
        'voivodeship': ninja_pv.get("region_name"),
        'geographical_coordinates': get_offer_geographical_coordinates(html_parser),
        'phone_numbers': phone_numbers,
        'description': get_offer_description(html_parser),
        'offer_details': get_offer_details(html_parser),
        'photo_links': get_offer_photos_links(html_parser),
        'video_link': get_offer_video_link(html_parser),
        'facebook_description': get_offer_facebook_description(html_parser),
        'meta': {
            'cookie': cookie,
            'csrf_token': csrf_token,
            'context': context
        }
    }
    # Merge flat-specific attributes only when at least one was found.
    flat_data = get_flat_data(html_parser, ninja_pv)
    if any(flat_data.values()):
        result.update(flat_data)
    return result