def test_create_ads(self):
    tag1 = Tag.objects.create(name="HTML 5", slug="html5")
    tag2 = Tag.objects.create(name="jQuery", slug="jquery")
    tag3 = Tag.objects.create(name="Python", slug="python")
    self.assertEqual(0, Ad.objects.count())
    ad1 = Ad()
    ad1.company_name = "ACME International"
    ad1.company_url = "http://acme.com/"
    ad1.title = "Ninja Top Developer"
    ad1.description = """
        We are growing, we need more developers.
        Apply if you are cool and sexy.
    """
    ad1.apply_email = "*****@*****.**"
    ad1.full_clean()
    ad1.save()
    self.assertEqual(1, Ad.objects.count())
    # the many-to-many relation is writable from both sides
    ad1.tags.add(tag1)
    self.assertEqual(1, tag1.ads.count())
    tag2.ads.add(ad1)
    self.assertEqual(2, ad1.tags.count())
    self.assertEqual(0, tag3.ads.count())
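# The test above only runs against models shaped roughly like the following.
# This is a minimal sketch inferred from the test itself: the field names and
# related_name="ads" come from the assertions, while the field types and
# max_length values are assumptions, not the project's actual models.py.
from django.db import models

class Tag(models.Model):
    name = models.CharField(max_length=50)
    slug = models.SlugField(unique=True)

class Ad(models.Model):
    company_name = models.CharField(max_length=100)
    company_url = models.URLField()
    title = models.CharField(max_length=100)
    description = models.TextField()
    apply_email = models.EmailField()
    # related_name="ads" is what makes tag1.ads.count() work in the test
    tags = models.ManyToManyField(Tag, related_name="ads")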
def setUp(self):
    newspaper1 = Newspaper(name='Test Newspaper 1')
    newspaper1.save()
    ad = Ad()
    ad.name = 'Name 1'
    ad.content = 'Content 1'
    ad.save()
def post(self, request, pk):
    print("Ad PK", pk)
    t = get_object_or_404(Ad, id=pk)
    ad = Ad(user=request.user, ad=t)
    try:
        ad.save()
    except IntegrityError:
        # a duplicate key means this row already exists, so ignore it
        pass
    return HttpResponse()
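# The try/except above uses IntegrityError to detect an existing row. Below is
# a sketch of an equivalent view using get_or_create, which folds the lookup
# and the insert into one call; this is an alternative pattern, not the
# original code, and it assumes the same (user, ad) unique constraint.
def post(self, request, pk):
    t = get_object_or_404(Ad, id=pk)
    # created is False when the row was already there
    ad, created = Ad.objects.get_or_create(user=request.user, ad=t)
    return HttpResponse()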
def setUp(self):
    ad1 = Ad()
    ad1.name = 'Test Ad 1'
    ad1.content = 'Content 1'
    ad1.save()
    ad2 = Ad()
    ad2.name = 'Test Ad 2'
    ad2.content = 'Content 2'
    ad2.save()
def setUp(self):
    newspaper1 = Newspaper(name='Test Newspaper 1')
    newspaper1.save()
    newspaper2 = Newspaper(name='Test Newspaper 2')
    newspaper2.save()
    ad = Ad()
    ad.name = 'Name 1'
    ad.content = 'Content 1'
    ad.save()
    # add() writes the m2m rows immediately; the extra save() is redundant but harmless
    ad.newspapers.add(newspaper1, newspaper2)
    ad.save()
def setUp(self):
    newspaper1 = Newspaper()
    newspaper1.name = 'Test Newspaper 1'
    newspaper1.save()
    newspaper2 = Newspaper()
    newspaper2.name = 'Test Newspaper 2'
    newspaper2.save()
    newspaper3 = Newspaper()
    newspaper3.name = 'Test Newspaper 3'
    newspaper3.save()
    ad1 = Ad()
    ad1.id = 1
    ad1.name = 'Test Ad 1'
    ad1.content = 'Content 1'
    ad1.save()
    ad2 = Ad()
    ad2.id = 2
    ad2.name = 'Test Ad 2'
    ad2.content = 'Content 2'
    ad2.save()
    ad2.newspapers.add(newspaper1, newspaper2)
    ad2.save()
    ad3 = Ad()
    ad3.id = 3  # was "ad2.id = 3": the original reassigned the wrong ad's pk
    ad3.name = 'Test Ad 3'
    ad3.content = 'Content 3'
    ad3.save()
    self.browser = webdriver.Firefox()
    self.browser.implicitly_wait(3)
def setUp(self):
    newspaper1 = Newspaper(name='Test Newspaper 1')
    newspaper1.save()
    newspaper2 = Newspaper(name='Test Newspaper 2')
    newspaper2.save()
    newspaper3 = Newspaper(name='Test Newspaper 3')
    newspaper3.save()
    first_ad = Ad()
    first_ad.name = 'Name 1'
    first_ad.content = 'Content 1'
    first_ad.save()
    first_ad.newspapers.add(newspaper1, newspaper2)
    first_ad.save()
    second_ad = Ad()
    second_ad.name = 'Name 2'
    second_ad.content = 'Content 2'
    second_ad.save()
    second_ad.newspapers.add(newspaper2, newspaper3)
    second_ad.save()  # was "first_ad.save()": copy-paste slip on the second ad
def setUp(self):
    self.browser = webdriver.Firefox()
    self.browser.implicitly_wait(3)
    ad1 = Ad()
    ad1.name = 'Test Ad 1'
    ad1.content = 'Content 1'
    ad1.save()
    ad2 = Ad()
    ad2.name = 'Test Ad 2'
    ad2.content = 'Content 2'
    ad2.save()
    ad3 = Ad()
    ad3.name = 'Test Ad 3'
    ad3.content = 'Content 3'
    ad3.save()
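# The setUp fixtures above all assume an Ad/Newspaper pair of models with a
# many-to-many between them. A minimal sketch inferred from the fixtures;
# the field types and max_length values are assumptions:
from django.db import models

class Newspaper(models.Model):
    name = models.CharField(max_length=100)

class Ad(models.Model):
    name = models.CharField(max_length=100)
    content = models.TextField()
    newspapers = models.ManyToManyField(Newspaper, blank=True)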
def extract_data(ad_urls, uuid):
    result = []
    # maps the label shown in the ad's attribute table to our field name
    attribute_dict = {
        'Date Listed': 'date',
        'Price': 'price',
        'Address': 'address',
        'Bathrooms (#)': 'bathrooms',
        'Furnished': 'furnished',
        'Pet Friendly': 'pet_friendly',
    }
    for ad_url in ad_urls:
        try:
            items = dict()  # field values used to create the db object
            items['url'] = ad_url
            # get ad data
            ad_soup = bs(opener.open(ad_url).read())
            map_link = ''
            # title
            title = ad_soup.find('h1', id='preview-local-title').getText()
            items['title'] = title.replace('google_ad_section_start', '').replace('google_ad_section_end', '')
            # table data
            for tr in ad_soup.find('table', id='attributeTable').findAll('tr'):
                for td in tr.findAll('td'):
                    key = td.getText()
                    if key not in attribute_dict:
                        continue
                    value = td.findNext('td').getText()
                    field = attribute_dict[key]
                    if field == 'date':
                        items['pub_date'] = datetime.datetime.strptime(value, "%d-%b-%y")
                    if field == 'bathrooms':
                        items['bathrooms'] = float(re.match(r'(\d+\.?\d?) bathroom.*', value).group(1))
                    if field == 'price':
                        if value.find('contact') > 0:
                            items['rent'] = 0  # "Please contact" instead of a number
                        else:
                            items['rent'] = int(float(value[1:].replace(',', '')))
                    if field == 'address':
                        items['address'] = value.replace('View map', '')
                    if field in ('furnished', 'pet_friendly'):
                        items[field] = value != 'No'
            # map coordinates: parse lat/lng out of the static-map image url
            map_url = 'http://montreal.kijiji.ca' + ad_soup.find('a', attrs={'class': 'viewmap-link'}).get('href')
            map_soup = bs(opener.open(map_url).read())
            for noscript in map_soup.findAll('noscript'):
                if noscript.find('img'):
                    map_link = noscript.find('img').get('src')
            coords = urllib2.urlparse.parse_qs(urllib2.urlparse.urlparse(map_link).query)
            lat_lng = coords['center'][0].split(',')
            items['lat'] = float(lat_lng[0])
            items['lng'] = float(lat_lng[1])
            print items
            ad = Ad(**items)
            ad.save()
            result.append(ad)
        except Exception as e:
            print "[FAILED]", ad_url
            print e  # skip to the next one
        # report progress for this uuid, whether the ad succeeded or not
        cache_obj = cache.get(uuid)
        cache.set(uuid, {'total': cache_obj['total'], 'found': cache_obj['found'] + 1})
    return result
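# extract_data() reports progress by rewriting a {'total', 'found'} dict in
# the cache under the task's uuid, so a polling endpoint only needs to read
# that key back. A minimal sketch, assuming Django's cache framework; the
# view name and the zeroed fallback dict are hypothetical:
import json
from django.core.cache import cache
from django.http import HttpResponse

def scrape_progress(request, uuid):
    progress = cache.get(uuid) or {'total': 0, 'found': 0}
    return HttpResponse(json.dumps(progress), content_type='application/json')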
def import_ads(file_id):
    imported_file = ImportFile.objects.get(id=file_id)
    clear_prev_ads(imported_file)
    with PersistentDict(imported_file.settings.path, 'w', format='json') as params:
        # with closing(shelve.open(file_path)) as params:
        ids = []
        count = len(params['lines'])
        i = 0
        for line in params['lines']:
            ad = {}
            for field in params['matching']:
                if params['matching'][field] != 'Constant' and 'Constant' not in params['matching'][field]:
                    # the field is mapped to one or more column positions
                    poss = params['matching'][field] if type(params['matching'][field]) is list else [params['matching'][field]]
                    value = ""
                    for pos in poss:
                        pos = int(pos)
                        if len(line) < pos or not line[pos].strip():
                            continue
                        value += (", " + line[pos]) if value else line[pos]
                    if not value:
                        continue
                    if field in CompareFieldsForm.compared_fields:
                        curr_comp_field = CompareFieldsForm.compared_fields[field]
                        compared_obj = curr_comp_field['model'].objects.get(pk=value)
                        if field == 'category':
                            if compared_obj.sub_category is not None:
                                ad['category'] = compared_obj.sub_category.category
                                ad['sub_category'] = compared_obj.sub_category
                            else:
                                ad['category'] = compared_obj.category
                        elif field == 'district':
                            if compared_obj.district:
                                ad['district'] = compared_obj.district
                                ad['town'] = compared_obj.district.town
                            else:
                                ad['town'] = compared_obj.town
                        else:
                            ad[field] = compared_obj.value
                    elif field == 'pub_date':
                        pub_date = dateutil.parser.parse(value)
                        if str(pub_date.time()) == "00:00:00":
                            # midnight means the source had no time part; use the current time
                            pub_date = datetime.combine(pub_date.date(), datetime.now().time())
                        ad[field] = ad['order_date'] = pub_date
                    else:
                        ad[field] = Ad.filter_field(value, field)
                elif 'constants' in params and field in params['constants']:
                    # constant value configured for this field
                    value = Ad.filter_field(str(params['constants'][field]), field)
                    if field in CompareFieldsForm.compared_fields:
                        if field == 'category':
                            cat_values = value.split('-')
                            if len(cat_values) > 1:
                                sub_cat = SubCategory.objects.get(pk=cat_values[1])
                                ad['category'] = sub_cat.category
                                ad['sub_category'] = sub_cat
                            else:
                                ad['category'] = Category.objects.get(pk=cat_values[0])
                        elif 'foreign' in CompareFieldsForm.compared_fields[field]:
                            ad[field] = CompareFieldsForm.compared_fields[field]['foreign'].objects.get(pk=value)
                        else:
                            for key, val in CompareFieldsForm.compared_fields[field]['choices']:
                                if str(key) == value:
                                    ad[field] = val
                    else:
                        ad[field] = value
            # only create the ad when every required field was filled
            empty = set(MatchFieldsForm.required) - set(ad.keys())
            if not empty:
                ad_obj = Ad(**ad)
                ad_obj.imported = imported_file
                ad_obj.save()
                find_similar.delay(ad_obj.id)
                if ad_obj.url:
                    parsed_uri = urlparse(ad_obj.url)
                    if parsed_uri.netloc in Crawler.DOMAINS:
                        import_attachments.delay(ad_obj.id)
                ids.append(ad_obj.id)
            i += 1
            celery.current_task.update_state(state='PROGRESS', meta={'current': i, 'total': count})
        params['ids'] = ids
    return ids
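# import_ads() reads everything from the persisted import settings. The shape
# below is reconstructed from the lookups in the function, not a documented
# format; all concrete values are made up for illustration:
params = {
    'lines': [
        # one list of cells per imported row
        ['Flat in the centre', '2014-05-01', '17', 'Sunny, quiet'],
    ],
    'matching': {
        # model field -> column position(s); several positions are joined with ", "
        'title': '0',
        'pub_date': '1',
        'category': '2',
        'desc': ['0', '3'],
        'price': 'Constant',  # constants are resolved via params['constants']
    },
    'constants': {
        'price': 100,
    },
}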
def get_ads(cls):
    errors = 5  # tolerated number of bad responses before giving up
    # remote field name -> our field name
    match = {
        'cena': 'price',
        'datap': 'pub_date',
        'dom_etag': 'floor_max',
        'etag': 'floor',
        'kol_komn': 'rooms_count',
        'pl1': 'area_living',
        'plk': 'area_kitchen',
        'plosh': 'area',
        'pl_land': 'area_land',
        'textob': 'desc',
    }
    now = datetime.now().strftime("%Y-%m-%d")
    yesterday = (date.today() - timedelta(1)).strftime("%Y-%m-%d")
    data = {
        're_base_name': 'kvart',
        're_base_section': 'green',
        'rem': 0,
        'subq_start': 0,
        'subq_lines': 200,
        'order_by[data]': 'desc',
        'order_by[datap]': 'desc',
        'data_start': yesterday,
        'data_end': now,
    }
    statistic = {}
    log = ""
    try:
        for table in ['arenda', 'kvart', 'komn', 'domm', 'negil', 'client']:  # all tables
            data['re_base_name'] = table
            print('Load from ' + table)
            statistic[table] = {}
            for private_type in ['green']:  # ['green', 'red'] would cover both bases
                statistic[table][private_type] = 0
                print('table type ' + private_type)
                data['re_base_section'] = private_type
                current = 0
                total = 200
                while current < total:  # page through the results, 200 at a time
                    print('select 200 starts from ' + str(current) + ', total is ' + str(total))
                    data['subq_start'] = current
                    str_data = urllib.urlencode(data)
                    content = cls.get('http://estate-in-kharkov.com/ps/re_base/ajax/real-estate-database.php', str_data)
                    current += 200
                    if 're_base_query_count' in content:
                        total = int(content['re_base_query_count'])
                    if 'items' not in content and errors:
                        print("response has no items, errors left %d" % errors)
                        errors -= 1
                        print(content)
                        time.sleep((6 - errors) * 10)  # back off harder after each failure
                        current -= 200  # retry the same page
                        continue
                    assert 'items' in content, 'unexpected content: ' + str(content)
                    if content['items'] is False:
                        break
                    for item in content['items'].values():
                        aid = item['kod']
                        try:
                            existed = Ad.objects.get(gkey=aid)
                            print(str(existed) + ' already imported!')
                            continue
                        except Ad.DoesNotExist:
                            pass
                        ad_item = {'gkey': aid}
                        for field in item:
                            if item[field] and field in match:
                                ad_item[match[field]] = item[field]
                        ad = cls.parse_categories(ad_item, item, table)
                        if 'sub_category' in ad:
                            print(ad['sub_category'])
                        if 'rooms_count' in ad:
                            print('rooms: ' + str(ad['rooms_count']))
                        if item['textob']:
                            ad['title'] = truncatesmart(item['textob'], 45)
                            if not ad['title']:
                                ad['title'] = item['textob'][:45]
                        ad['offering'] = table != 'client'
                        ad['private'] = private_type == 'green'
                        ad['phone'] = ""
                        for i in range(1, 5):
                            name = 'tel' + str(i)
                            if name in item and item[name].strip():
                                if ad['phone']:  # was "if i > 1", which could emit a leading separator
                                    ad['phone'] += ', '
                                ad['phone'] += item[name].strip()
                        if item['metro']:
                            try:
                                metro = Metro.objects.get(pk=item['metro'])
                                ad['desc'] += ' ' + metro.name
                            except Metro.DoesNotExist:
                                pass
                        if item['ulica']:
                            try:
                                ad['address'] = Street.objects.get(pk=item['ulica'])
                            except Street.DoesNotExist:
                                pass
                        if item['raj']:
                            try:
                                ad['district'] = District.objects.get(pk=item['raj'])
                            except District.DoesNotExist:
                                pass
                        if item['nasp']:
                            item['nasp'] = 1 if item['nasp'] == '293' else item['nasp']
                            try:
                                ad['town'] = Town.objects.get(pk=item['nasp'])
                            except Town.DoesNotExist:
                                continue
                        if ad['pub_date']:
                            pub_date = dateutil.parser.parse(ad['pub_date'])
                            if str(pub_date.time()) == "00:00:00":
                                pub_date = datetime.combine(pub_date.date(), datetime.now().time())
                            pub_date_utc = pub_date.replace(tzinfo=timezone.get_current_timezone())
                            ad['pub_date'] = ad['order_date'] = pub_date_utc
                        # was "if item['fotosite'].find('&have_images')": find() returns -1
                        # (truthy) when the marker is missing, so test it explicitly
                        if item['fotosite'] and item['fotosite'].find('&have_images') != -1:
                            ad['url'] = item['fotosite'].replace('&have_images', '')
                        ad_obj = Ad(**ad)
                        ad_obj.save()
                        statistic[table][private_type] += 1
                        find_similar.delay(ad_obj.id)
                        if ad_obj.url:
                            parsed_uri = urlparse(ad_obj.url)
                            if parsed_uri.netloc in Crawler.DOMAINS:
                                import_attachments.delay(ad_obj.id)
                        print(ad_obj)
                        time.sleep(1)  # be polite to the remote server
    except:
        log += "Error: " + traceback.format_exc() + "\n"
        raise
    finally:
        for name, stats in statistic.items():
            log += name + ': '
            if 'green' in stats:
                log += str(stats['green'])
            log += " / "
            if 'red' in stats:
                log += str(stats['red'])
            log += "\n"
        send_log(log)