def handle(self, *args, **options):
    """Log ad counts per category/sub-category and optionally trim old ads.

    Only ads with a non-null ``gkey`` and ``private=False`` are counted.
    When the ``delete`` option is truthy, the oldest ads (ordered by
    ``pub_date``) are deleted in whole batches of 200: first for every
    sub-category holding more than 1000 such ads, then for every category
    holding more than 2000. The number of batches is computed once from
    the initial count, so up to 199 ads above the limit may remain.

    The collected report is passed to ``send_log``; if anything was
    deleted, the ``rebuild_index`` management command is run afterwards.
    """
    from django.utils import translation

    translation.activate('en')  # with en-us everything crashes
    print(args)
    print(options)

    remove = options.get('delete', False)
    batch_size = 200

    def trim_oldest(filters, count, limit):
        """Delete the oldest matching ads batch by batch.

        Returns True when at least one batch was scheduled for deletion.
        """
        # Floor division keeps the batch count an int on Python 3 as well;
        # plain `/` yields a float there and range() would raise TypeError.
        batches = (count - limit) // batch_size
        if not (batches > 0 and remove):
            return False
        for _ in range(batches):
            # Collect the ids first and delete by pk: Django does not allow
            # calling delete() on a sliced queryset.
            oldest_ids = list(
                Ad.objects.filter(**filters)
                .order_by('pub_date')
                .values_list('id', flat=True)[:batch_size]
            )
            Ad.objects.filter(pk__in=oldest_ids).delete()
        return True

    rebuild = False
    log = ['------------------']
    for cat in Category.objects.all():
        for sub_cat in cat.subcategory_set.all():
            kws = {'sub_category': sub_cat, 'gkey__isnull': False, 'private': False}
            label = cat.name + ' - ' + sub_cat.name + ': '
            count = Ad.objects.filter(**kws).count()
            log.append(label + str(count))
            if trim_oldest(kws, count, 1000):
                rebuild = True
                log.append('after remove')
                log.append(label + str(Ad.objects.filter(**kws).count()))

        # Category-wide pass runs after the sub-categories were trimmed.
        kws = {'category': cat, 'gkey__isnull': False, 'private': False}
        label = cat.name + ": "
        all_count = Ad.objects.filter(**kws).count()
        log.append(label + str(all_count))
        log.append('------------------')
        if trim_oldest(kws, all_count, 2000):
            rebuild = True
            log.append(label + str(Ad.objects.filter(**kws).count()))

    send_log(''.join(log))
    if rebuild:
        call_command('rebuild_index', interactive=False)
def get_ads(cls):
    """Import ads dated yesterday..today from the estate-in-kharkov.com base.

    For every remote table the 'green' section is requested page by page
    (200 rows per request) through ``cls.get``. Rows whose ``kod`` already
    exists as ``Ad.gkey`` are skipped; every other row is mapped to ``Ad``
    fields, saved, and handed to ``find_similar.delay`` (and to
    ``import_attachments.delay`` when the ad URL's host is listed in
    ``Crawler.DOMAINS``).

    A response without an ``'items'`` key is retried with a growing pause,
    at most 5 times in total across all tables; after that an
    ``AssertionError`` is raised. Any exception is appended to the log and
    re-raised. The per-table counters are always sent through ``send_log``.

    NOTE(review): ``cls.get`` is assumed to return the decoded response as
    a dict - confirm against its definition.
    """
    page_size = 200
    errors = 5  # retries left for responses that lack 'items'
    # Remote field name -> Ad field name.
    match = {'cena': 'price',
             'datap': 'pub_date',
             'dom_etag': 'floor_max',
             'etag': 'floor',
             'kol_komn': 'rooms_count',
             'pl1': 'area_living',
             'plk': 'area_kitchen',
             'plosh': 'area',
             'pl_land': 'area_land',
             'textob': 'desc'}
    now = datetime.now().strftime("%Y-%m-%d")
    yesterday = (date.today() - timedelta(1)).strftime("%Y-%m-%d")
    data = {
        're_base_name': 'kvart',
        're_base_section': 'green',
        'rem': 0,
        'subq_start': 0,
        'subq_lines': 200,
        'order_by[data]': 'desc',
        'order_by[datap]': 'desc',
        'data_start': yesterday,
        'data_end': now,
    }
    statistic = {}
    log = ""
    try:
        for table in ['arenda', 'kvart', 'komn', 'domm', 'negil', 'client']:  # all tables
            data['re_base_name'] = table
            print('Load from ' + table)
            statistic[table] = {}
            # Only the 'green' base is imported; 'red' is currently disabled.
            for private_type in ['green']:
                statistic[table][private_type] = 0
                print('table type ' + private_type)
                data['re_base_section'] = private_type
                current = 0
                total = 200
                while current < total:  # all pages
                    print('select 200 starts from ' + str(current) + ', total is ' + str(total))
                    data['subq_start'] = current
                    str_data = urllib.urlencode(data)
                    content = cls.get('http://estate-in-kharkov.com/ps/re_base/ajax/real-estate-database.php', str_data)
                    if 're_base_query_count' in content:
                        total = int(content['re_base_query_count'])
                    if 'items' not in content and errors:
                        # Retry the same page; the pause grows by 10s with
                        # every retry already spent.
                        print("hasn't content , errors left %d" % errors)
                        errors -= 1
                        print(content)
                        time.sleep((6 - errors) * 10)
                        continue
                    current += page_size
                    if 'items' not in content:
                        # Explicit raise instead of `assert` so the check
                        # survives `python -O`; exception type is unchanged.
                        raise AssertionError('unexpected content: ' + str(content))
                    if content['items'] is False:
                        break

                    for item in content['items'].values():
                        aid = item['kod']
                        try:
                            existed = Ad.objects.get(gkey=aid)
                            print(str(existed) + ' already imported!')
                            continue
                        except Ad.DoesNotExist:
                            pass

                        # Copy only non-empty, known fields.
                        ad_item = {'gkey': aid}
                        for field in item:
                            if item[field] and field in match:
                                ad_item[match[field]] = item[field]
                        ad = cls.parse_categories(ad_item, item, table)
                        if 'sub_category' in ad:
                            print(ad['sub_category'])
                        if 'rooms_count' in ad:
                            print('rooms: ' + str(ad['rooms_count']))

                        if item['textob']:
                            ad['title'] = truncatesmart(item['textob'], 45)
                            if not ad['title']:
                                ad['title'] = item['textob'][:45]
                        ad['offering'] = table != 'client'
                        ad['private'] = private_type == 'green'

                        # Join only the phones that are present, so a
                        # missing tel1 no longer yields a leading ', '.
                        phones = []
                        for i in range(1, 5):
                            phone = (item.get('tel' + str(i)) or '').strip()
                            if phone:
                                phones.append(phone)
                        ad['phone'] = ', '.join(phones)

                        if item['metro']:
                            try:
                                metro = Metro.objects.get(pk=item['metro'])
                            except Metro.DoesNotExist:
                                pass
                            else:
                                # 'desc' is absent when the remote text was
                                # empty, so do not assume the key exists.
                                desc = ad.get('desc')
                                ad['desc'] = desc + ' ' + metro.name if desc else metro.name
                        if item['ulica']:
                            try:
                                ad['address'] = Street.objects.get(pk=item['ulica'])
                            except Street.DoesNotExist:
                                pass
                        if item['raj']:
                            try:
                                ad['district'] = District.objects.get(pk=item['raj'])
                            except District.DoesNotExist:
                                pass
                        if item['nasp']:
                            # Remote town id '293' is stored locally as pk 1.
                            item['nasp'] = 1 if item['nasp'] == '293' else item['nasp']
                            try:
                                ad['town'] = Town.objects.get(pk=item['nasp'])
                            except Town.DoesNotExist:
                                continue  # ads from unknown towns are skipped

                        # 'pub_date' is absent when the remote date was empty.
                        if ad.get('pub_date'):
                            pub_date = dateutil.parser.parse(ad['pub_date'])
                            if str(pub_date.time()) == "00:00:00":
                                # Date-only value: use the current time of day.
                                pub_date = datetime.combine(pub_date.date(), datetime.now().time())
                            # NOTE(review): replace(tzinfo=...) with a pytz zone
                            # can attach an LMT offset; consider
                            # timezone.make_aware() - confirm before changing.
                            pub_date_utc = pub_date.replace(tzinfo=timezone.get_current_timezone())
                            ad['pub_date'] = ad['order_date'] = pub_date_utc

                        fotosite = item['fotosite']
                        # NOTE(review): the original condition was
                        # `fotosite.find('&have_images')`, which is falsy only
                        # when the marker sits at index 0 (find() returns -1,
                        # i.e. truthy, when it is absent). That behaviour is
                        # kept here explicitly; if the intent was "marker is
                        # present", use `'&have_images' in fotosite` instead.
                        if fotosite and not fotosite.startswith('&have_images'):
                            ad['url'] = fotosite.replace('&have_images', '')

                        ad_obj = Ad(**ad)
                        ad_obj.save()
                        statistic[table][private_type] += 1
                        find_similar.delay(ad_obj.id)
                        if ad_obj.url:
                            parsed_uri = urlparse(ad_obj.url)
                            if parsed_uri.netloc in Crawler.DOMAINS:
                                import_attachments.delay(ad_obj.id)
                        print(ad_obj)
                    # Pause between page requests.
                    time.sleep(1)
    except BaseException:
        # Record the traceback in the report, then let the error propagate.
        log += "Error: " + traceback.format_exc() + "\n"
        raise
    finally:
        for name, stats in statistic.items():
            log += name + ': '
            if 'green' in stats:
                log += str(stats['green'])
            log += " / "
            if 'red' in stats:
                log += str(stats['red'])
            log += "\n"
        send_log(log)