def handle(self, *args, **options):
    """Import the adverts produced by ``ads()`` into the ``Advert`` collection.

    With ``FULL_IMPORT`` enabled (the hard-coded default) all ``Advert``
    documents and the mongoengine counters are removed first, and new
    adverts are bulk-inserted in batches of ``BATCH_SIZE``.  With it
    disabled every advert is saved individually and, once ``DUPS_LIMIT``
    already-known adverts have been seen, the value ``"break_page_search"``
    is sent into the generator.

    An advert is skipped when it is an empty dict, has ``images_len == 0``,
    has a missing or empty ``price``, or when its ``url`` is already known.
    For every stored advert each entry of ``prep_obj["images"]`` is passed
    to ``dl_image``.

    ``args`` and ``options`` are accepted for interface compatibility and
    are not used.
    """
    FULL_IMPORT = True
    DUPS_LIMIT = 10
    BATCH_SIZE = 100
    # Cyrillic source key that is renamed to 'kitchen_area'.
    KITCHEN_KEY = u'\u041f\u043b\u043e\u0449 \u043a\u0443\u0445\u043d\u0438'

    count = 0
    dups = 0
    page_dups = 0
    control = None
    obj_buffer = []
    # URLs of adverts waiting in obj_buffer.  The database lookup cannot see
    # them until the batch is flushed, so they are tracked here to keep a
    # repeated URL from being inserted twice within one batch.
    buffered_urls = set()

    if FULL_IMPORT:
        Advert.objects.all().delete()
        Advert._get_db().mongoengine.counters.remove({})

    # NOTE: print(...) with a single argument produces the same output under
    # the Python 2 print statement and the Python 3 print function.
    it = ads()
    try:
        while True:
            # Only the generator call may legitimately end the loop; a
            # StopIteration escaping from the processing code below is an
            # error and must not be mistaken for "no more adverts".
            try:
                obj = it.send(control)
            except StopIteration:
                break
            control = None

            if obj == {}:
                print("EMPTY")
                continue

            if KITCHEN_KEY in obj:
                obj['kitchen_area'] = obj.pop(KITCHEN_KEY)

            if obj["images_len"] == 0:
                continue

            if obj.get("price", "") == "":
                continue

            url = obj["url"]
            if url in buffered_urls or Advert.objects(url=url):
                dups += 1
                page_dups += 1
                if page_dups >= DUPS_LIMIT and not FULL_IMPORT:
                    control = "break_page_search"
                    page_dups = 0
            else:
                try:
                    prep_obj = _prepare(obj)
                    print(obj['url'])
                    advert = Advert(**prep_obj)
                    if FULL_IMPORT:
                        obj_buffer.append(advert)
                        buffered_urls.add(url)
                        if len(obj_buffer) >= BATCH_SIZE:
                            Advert.objects.insert(obj_buffer)
                            obj_buffer = []
                            buffered_urls.clear()
                    else:
                        advert.save(
                            write_concern={'w': 0, 'j': False, 'wtimeout': 0})

                    if prep_obj['images_len'] > 0:
                        for element in prep_obj["images"]:
                            for jit in prep_obj["images"][element]:
                                dl_image(jit)
                except UnicodeEncodeError:
                    # Still best-effort (the advert is skipped), but leave a
                    # trace instead of dropping it silently.  %r keeps the
                    # message ASCII-safe on Python 2.
                    print("SKIPPED (UnicodeEncodeError): %r" % (url,))

            if count % 100 == 0:
                print("count: %s\tdups: %s" % (count, dups))
            count += 1
    finally:
        # Close the generator even when processing aborts with an error.
        it.close()

    # Flush the last, partially filled batch.
    if obj_buffer:
        Advert.objects.insert(obj_buffer)
    print("")
def handle(self, *args, **options):
    """Load the adverts yielded by ``ads()`` into the ``Advert`` collection.

    ``FULL_IMPORT`` (hard-coded to ``True``) selects the mode:

    * enabled: every ``Advert`` document and the mongoengine counters are
      removed up front, and new adverts are bulk-inserted once
      ``FLUSH_THRESHOLD`` of them have been collected;
    * disabled: each advert is saved on its own, and after ``DUPS_LIMIT``
      already-known adverts the value ``"break_page_search"`` is sent into
      the generator.

    Empty dicts, adverts with ``images_len == 0``, adverts without a
    non-empty ``price`` and adverts whose ``url`` is already known are not
    stored.  The images of each stored advert are handed to ``dl_image``.

    ``args`` and ``options`` are part of the command interface and are not
    read here.
    """
    FULL_IMPORT = True
    DUPS_LIMIT = 10
    FLUSH_THRESHOLD = 100
    REPORT_EVERY = 100
    # Cyrillic source key that is stored under 'kitchen_area' instead.
    KITCHEN_AREA_SRC = u'\u041f\u043b\u043e\u0449 \u043a\u0443\u0445\u043d\u0438'

    pending = []
    # URLs of the adverts in `pending`; they are not in the database yet, so
    # the database lookup alone would let a repeated URL through.
    pending_urls = set()

    def flush_pending():
        """Bulk-insert the buffered adverts (if any) and empty the buffer."""
        if not pending:
            return
        Advert.objects.insert(pending)
        del pending[:]
        pending_urls.clear()

    def is_known(url):
        """Return True when `url` is buffered or already stored."""
        return url in pending_urls or bool(Advert.objects(url=url))

    if FULL_IMPORT:
        Advert.objects.all().delete()
        Advert._get_db().mongoengine.counters.remove({})

    processed = 0
    duplicates = 0
    page_duplicates = 0
    control = None

    # NOTE: single-argument print(...) behaves identically as a Python 2
    # print statement and as the Python 3 print function.
    source = ads()
    try:
        while True:
            # Keep the StopIteration handler around the generator call only,
            # so an unrelated StopIteration raised while processing an advert
            # is not silently treated as the end of the import.
            try:
                obj = source.send(control)
            except StopIteration:
                break
            control = None

            if obj == {}:
                print("EMPTY")
                continue

            if KITCHEN_AREA_SRC in obj:
                obj['kitchen_area'] = obj.pop(KITCHEN_AREA_SRC)

            has_images = obj["images_len"] != 0
            has_price = obj.get("price", "") != ""
            if not (has_images and has_price):
                continue

            url = obj["url"]
            if is_known(url):
                duplicates += 1
                page_duplicates += 1
                if page_duplicates >= DUPS_LIMIT and not FULL_IMPORT:
                    control = "break_page_search"
                    page_duplicates = 0
            else:
                try:
                    prep_obj = _prepare(obj)
                    print(obj['url'])
                    advert = Advert(**prep_obj)
                    if FULL_IMPORT:
                        pending.append(advert)
                        pending_urls.add(url)
                        if len(pending) >= FLUSH_THRESHOLD:
                            flush_pending()
                    else:
                        advert.save(write_concern={
                            'w': 0,
                            'j': False,
                            'wtimeout': 0,
                        })

                    if prep_obj['images_len'] > 0:
                        for element in prep_obj["images"]:
                            for jit in prep_obj["images"][element]:
                                dl_image(jit)
                except UnicodeEncodeError:
                    # The advert is still skipped, but no longer without a
                    # trace.  %r keeps the message ASCII-safe on Python 2.
                    print("SKIPPED (UnicodeEncodeError): %r" % (url,))

            if processed % REPORT_EVERY == 0:
                print("count: %s\tdups: %s" % (processed, duplicates))
            processed += 1
    finally:
        # Release the generator on every exit path, not only on exhaustion.
        source.close()

    # Store whatever is left in the last, partially filled batch.
    flush_pending()
    print("")
def handle(self, *args, **options):
    """Rebuild the ``Advert`` collection from ``analytics.insert()``.

    All ``Advert`` documents and the mongoengine counters are removed, then
    every dict yielded by ``analytics.insert()`` is converted and collected,
    and the collected documents are bulk-inserted in one call.  Finally
    ``transaction.commit()`` is called.

    Conversion of one dict (done in place):

    * a non-blank ``u"Адрес"`` entry is dropped;
    * every key present in ``DICT2`` is renamed to ``DICT2[key]``;
    * ``region`` is stripped of surrounding whitespace.

    ``args`` and ``options`` are accepted for interface compatibility and
    are not used.
    """
    import sys

    # NOTE(review): machine-specific path; kept as is, but only added once so
    # repeated calls do not grow sys.path.
    src_dir = "/home/bkmz/Dev/realty_parser/src"
    if src_dir not in sys.path:
        sys.path.append(src_dir)
    from analytics import insert as insert_irr

    ADDRESS_KEY = u"Адрес"

    mongo_objects = []

    # NOTE: single-argument print(...) gives the same output under the
    # Python 2 print statement and the Python 3 print function.
    print("Start Truncating")
    Advert.objects.all().delete()
    Advert._get_db().mongoengine.counters.remove({})
    print("Truncating finished")

    COUNT = 0
    for x in insert_irr():
        if ADDRESS_KEY in x and x[ADDRESS_KEY].strip() != "":
            del x[ADDRESS_KEY]

        # Rename keys in place.  Iterate over a snapshot because the dict is
        # modified inside the loop; pop-then-assign keeps the value even if
        # a key maps to itself.
        for old_key in list(x):
            if old_key in DICT2:
                x[DICT2[old_key]] = x.pop(old_key)

        x['region'] = x['region'].strip()

        mongo_objects.append(Advert(**x))

        print(COUNT)
        COUNT += 1

    # Skip the bulk insert when nothing was produced; an empty bulk insert
    # is rejected by the driver.
    if mongo_objects:
        Advert.objects.insert(mongo_objects)
    transaction.commit()