def read_ad_details(ad_id): ad = EstateAd.objects.get(pk=ad_id) # Now download ad if ad.link is None: return print "-- Detail: " + ad.link ad_html = get_site(ad.link) tree = pq(ad_html) gallery_links = tree.find('#galerija a') for link in gallery_links: try: image = AdPicture(picture_url=link.attrib["href"]) image.ad = ad image.save() except KeyError: # Missing href continue ad.description = tree.find('.web-opis').text() try: ad.administrative_unit = tree.find('.more_info').text().split( ' | ')[3].lstrip('Upravna enota:').strip() except IndexError: pass if ad.administrative_unit is None: ad.administrative_unit = "" try: ad.county = tree.find('.more_info').text().split(' | ')[4].lstrip( u'Ob\u010dina:').strip() except IndexError: pass if ad.county is None: ad.county = "" ad.raw_detail_html = ad_html ad.save()
def read_ad_details(ad_id): ad = EstateAd.objects.get(pk=ad_id) # Now download ad if ad.link is None: return print "-- Detail: " + ad.link ad_html = get_site(ad.link) tree = etree.fromstring(ad_html, etree.HTMLParser()) gallery_links = tree.xpath('//div[@id="galerija"]/a') for link in gallery_links: try: image = AdPicture(picture_url=link.attrib["href"]) image.ad = ad image.save() except KeyError: # Missing href continue ad.description = "\n".join(tree.xpath('//div[@class="web-opis"]//text()')) try: ad.administrative_unit = tree.xpath('//div[@class="main-data"]/table/tr')[3].getchildren()[1].text except IndexError: pass if ad.administrative_unit is None: ad.administrative_unit = "" try: ad.county = tree.xpath('//div[@class="main-data"]/table/tr')[4].getchildren()[1].text except IndexError: pass if ad.county is None: ad.county = "" ad.raw_detail_html = ad_html ad.save()
def read_ad_details(ad_id): ad = EstateAd.objects.get(pk=ad_id) # Now download ad if ad.link is None: return print "-- Detail: " + ad.link ad_html = get_site(ad.link) tree = pq(ad_html) gallery_links = tree.find('#galerija a') for link in gallery_links: try: image = AdPicture(picture_url=link.attrib["href"]) image.ad = ad image.save() except KeyError: # Missing href continue ad.description = tree.find('.web-opis').text() try: ad.administrative_unit = tree.find('.more_info').text().split(' | ')[3].lstrip('Upravna enota:').strip() except IndexError: pass if ad.administrative_unit is None: ad.administrative_unit = "" try: ad.county = tree.find('.more_info').text().split(' | ')[4].lstrip(u'Ob\u010dina:').strip() except IndexError: pass if ad.county is None: ad.county = "" ad.raw_detail_html = ad_html ad.save()
def handle(self, *args, **options): parse_queue = Queue() for region_num, region_name in REGIONS: print " == " + region_name + " == " parse_queue.put(TOP_SITE_URL + "&r=" + str(region_num)) while not parse_queue.empty(): url = parse_queue.get() print "Parsing " + url site_html = get_site(url) tree = pq(site_html) raw_ads = tree('.oglas_container') for raw_ad in raw_ads: doc = pq(raw_ad) ad_id = raw_ad.attrib["id"] if EstateAd.objects.filter(ad_id=ad_id).exists(): continue ad = EstateAd() ad.region = region_num ad.publish_date = timezone.now() # We're parsing last 24 hours so set publish date to now ad.ad_id = ad_id ad.title = doc.find('h2 a .title').text() ad.link = BASE_URL + doc.find('h2 a')[0].attrib["href"] data = doc.find('.main-data span') raw_data = {} for data_span in data: name = data_span.attrib["class"] value = data_span.text raw_data[name] = value raw_data['posr'] = doc.find('.posr').remove('.new-label').text() raw_attributes = doc.find('.atributi span') for raw_attribute in raw_attributes: name = raw_attribute.text[:raw_attribute.text.find(':')].lower() value = raw_attribute.find("strong").text raw_data[name] = value ad.raw_data = json.dumps(raw_data) ad.type, ad.building_type = self.parse_type(raw_data) ad.size_m2 = self.parse_size(raw_data) ad.price, ad.price_m2 = self.parse_price(raw_data, ad.size_m2) ad.year_built = self.parse_year(raw_data) ad.floor = raw_data.get("nadstropje", "") ad.short_description = doc.find('.kratek')[0].text ad.author_name = doc.find('.povezave div')[0].attrib["title"] ad.raw_html = etree.tostring(raw_ad) try: ad.save() except IntegrityError as e: print e print "Ad with id %s already exists in database!" % (ad.ad_id, ) _tasks.read_ad_details(ad.pk) # Grab next page link try: next_page_link = BASE_URL + tree.find('#pagination .next')[0].attrib["href"] parse_queue.put(next_page_link) except: pass
def handle(self, *args, **options): parse_queue = Queue() for region_num, region_name in REGIONS: print " == " + region_name + " == " parse_queue.put(TOP_SITE_URL + "&r=" + str(region_num)) while not parse_queue.empty(): url = parse_queue.get() print "Parsing " + url site_html = get_site(url) tree = etree.fromstring(site_html, etree.HTMLParser()) raw_ads = tree.xpath('//body/div/div/div[@id="content"]//div[@class="oglas_container"]') for raw_ad in raw_ads: ad_id = raw_ad.attrib["id"] if EstateAd.objects.filter(ad_id=ad_id).exists(): continue ad = EstateAd() ad.region = region_num ad.publish_date = timezone.now() # We're parsing last 24 hours so set publish date to now ad.ad_id = ad_id ad.title = raw_ad.xpath('div[@class="teksti_container"]/h2/a')[0].text ad.link = BASE_URL + raw_ad.xpath('div[@class="teksti_container"]/h2/a')[0].attrib["href"] data = raw_ad.xpath('div[@class="teksti_container"]/div[@class="main-data"]/span') raw_data = {} for data_span in data: name = data_span.attrib["class"] value = data_span.text raw_data[name] = value raw_attributes = raw_ad.xpath('div[@class="teksti_container"]/div[@class="atributi"]/span') for raw_attribute in raw_attributes: name = raw_attribute.text[:raw_attribute.text.find(':')].lower() value = raw_attribute.find("strong").text raw_data[name] = value ad.raw_data = json.dumps(raw_data) ad.type, ad.building_type = self.parse_type(raw_data) ad.size_m2 = self.parse_size(raw_data) ad.price, ad.price_m2 = self.parse_price(raw_data, ad.size_m2) ad.year_built = self.parse_year(raw_data) ad.floor = raw_data.get("nadstropje", "") ad.short_description = raw_ad.xpath('div[@class="teksti_container"]/div[@class="kratek"]')[0].text ad.author_name = raw_ad.xpath('div[@class="teksti_container"]/div[@class="povezave"]/div')[0].attrib["title"] ad.raw_html = etree.tostring(raw_ad) try: ad.save() except IntegrityError as e: print e print "Ad with id %s already exists in database!" 
% (ad.ad_id, ) _tasks.read_ad_details(ad.pk) # Grab next page link try: next_page_link = BASE_URL + tree.xpath('//div[@id="pagination" and @class="fr"]/ul/li/a[@class="next"]')[0].attrib["href"] parse_queue.put(next_page_link) except: pass
def handle(self, *args, **options):
    """Crawl the listing pages for every region, create an EstateAd per
    new ad container and queue its detail page for scraping.

    Pagination works through ``parse_queue``: each parsed page pushes
    its "next" link back onto the queue until no next page exists.
    """
    parse_queue = Queue()
    for region_num, region_name in REGIONS:
        print(" == " + region_name + " == ")
        parse_queue.put(TOP_SITE_URL + "&r=" + str(region_num))
        while not parse_queue.empty():
            url = parse_queue.get()
            print("Parsing " + url)
            site_html = get_site(url)
            tree = pq(site_html)
            raw_ads = tree('.oglas_container')
            for raw_ad in raw_ads:
                doc = pq(raw_ad)
                ad_id = raw_ad.attrib["id"]
                # Skip ads we have already stored.
                if EstateAd.objects.filter(ad_id=ad_id).exists():
                    continue
                ad = EstateAd()
                ad.region = region_num
                # We're parsing last 24 hours so set publish date to now
                ad.publish_date = timezone.now()
                ad.ad_id = ad_id
                ad.title = doc.find('h2 a .title').text()
                ad.link = BASE_URL + doc.find('h2 a')[0].attrib["href"]

                # Collect the classed <span> fields into a raw dict.
                data = doc.find('.main-data span')
                raw_data = {}
                for data_span in data:
                    name = data_span.attrib["class"]
                    value = data_span.text
                    raw_data[name] = value
                raw_data['posr'] = doc.find('.posr').remove('.new-label').text()

                # Attribute spans look like "Label: <strong>value</strong>";
                # spans without a <strong> child are skipped (lxml's find()
                # returns None for them).
                raw_attributes = doc.find('.atributi span')
                for raw_attribute in raw_attributes:
                    name = raw_attribute.text[:raw_attribute.text.find(':')].lower()
                    strong_container = raw_attribute.find("strong")
                    if strong_container is not None:
                        # Reuse the node we already looked up instead of
                        # calling find("strong") a second time.
                        raw_data[name] = strong_container.text

                ad.raw_data = json.dumps(raw_data)
                ad.type, ad.building_type = self.parse_type(raw_data)
                ad.size_m2 = self.parse_size(raw_data)
                ad.price, ad.price_m2 = self.parse_price(raw_data, ad.size_m2)
                ad.year_built = self.parse_year(raw_data)
                ad.floor = raw_data.get("nadstropje", "")
                ad.short_description = doc.find('.kratek')[0].text
                ad.author_name = doc.find('.povezave div')[0].attrib["title"]
                ad.raw_html = etree.tostring(raw_ad)
                try:
                    ad.save()
                except IntegrityError as e:
                    print(e)
                    print("Ad with id %s already exists in database!" % (ad.ad_id, ))
                _tasks.read_ad_details(ad.pk)

            # Grab next page link
            try:
                next_page_link = BASE_URL + tree.find('#pagination .next')[0].attrib["href"]
                parse_queue.put(next_page_link)
            except (IndexError, KeyError):
                # No "next" link on the last page; stop paginating.
                # (Narrowed from a bare except that hid real errors.)
                pass