def handle(self, *args, **options):
    """Import the adverts yielded by the ``ads()`` generator into ``Advert``.

    ``args`` and ``options`` are accepted but not used.

    With ``FULL_IMPORT`` enabled (hard-coded below) the ``Advert`` collection
    and the mongoengine counters are wiped first, and new documents are
    inserted in batches of ``obj_buffer_len``.  Otherwise every document is
    saved individually and, once ``DUPS_LIMIT`` already-stored URLs have been
    seen, ``"break_page_search"`` is sent into the generator.

    Items that are empty, have ``images_len == 0`` or carry no price are
    skipped.  Every image of an accepted advert is passed to ``dl_image``.

    NOTE(review): the original text of this method had lost its indentation;
    the nesting below was reconstructed from the syntax -- please confirm,
    in particular that images are downloaded in both import modes.
    """
    FULL_IMPORT = True
    DUPS_LIMIT = 10
    KITCHEN_KEY = u'\u041f\u043b\u043e\u0449 \u043a\u0443\u0445\u043d\u0438'

    count = 0
    dups = 0
    page_dups = 0
    control = None
    obj_buffer = []
    obj_buffer_len = 100
    # URLs sitting in obj_buffer: they are not in the DB yet, so the query
    # below cannot see them and would let a repeated URL through.
    buffered_urls = set()

    if FULL_IMPORT:
        Advert.objects.all().delete()
        Advert._get_db().mongoengine.counters.remove({})

    it = ads()
    while True:
        try:
            # The first send(None) starts the generator; later sends may
            # carry a control command for it.
            obj = it.send(control)
            control = None

            if obj == {}:
                # print(x) with a single argument behaves exactly like the
                # Python 2 print statement used by this file.
                print("EMPTY")
                continue

            if KITCHEN_KEY in obj:
                # Move the scraped (Russian-labelled) field to its model name.
                obj['kitchen_area'] = obj.pop(KITCHEN_KEY)

            if obj["images_len"] == 0:
                continue
            if "price" not in obj or obj["price"] == "":
                continue

            if obj["url"] in buffered_urls or Advert.objects(url=obj["url"]):
                dups += 1
                page_dups += 1
                if page_dups >= DUPS_LIMIT and not FULL_IMPORT:
                    control = "break_page_search"
                    page_dups = 0
            else:
                try:
                    prep_obj = _prepare(obj)
                    print(obj['url'])
                    _ins = Advert(**prep_obj)
                    if FULL_IMPORT:
                        obj_buffer.append(_ins)
                        buffered_urls.add(obj["url"])
                        if len(obj_buffer) >= obj_buffer_len:
                            Advert.objects.insert(obj_buffer)
                            obj_buffer = []
                            buffered_urls.clear()
                    else:
                        _ins.save(write_concern={
                            'w': 0,
                            'j': False,
                            'wtimeout': 0
                        })
                    if prep_obj['images_len'] > 0:
                        for element in prep_obj["images"]:
                            for jit in prep_obj["images"][element]:
                                dl_image(jit)
                except UnicodeEncodeError as exc:
                    # Previously swallowed silently; keep going but leave a
                    # trace.  %r keeps the message ASCII-safe.
                    print("skipped %r: %r" % (obj.get("url"), exc))

            if count % 100 == 0:
                print("count: %s\tdups: %s" % (count, dups))
            count += 1
        except StopIteration:
            it.close()
            break

    # Flush whatever is left of the last, partially filled batch.
    if obj_buffer:
        Advert.objects.insert(obj_buffer)
    # Equivalent of the bare Python 2 ``print``: emit an empty line.
    print("")
def handle(self, *args, **options):
    """Import the adverts yielded by the ``ads()`` generator into ``Advert``.

    ``args`` and ``options`` are accepted but not used.

    With ``FULL_IMPORT`` enabled (hard-coded below) the ``Advert`` collection
    and the mongoengine counters are wiped first, and new documents are
    inserted in batches of ``obj_buffer_len``.  Otherwise every document is
    saved individually and, once ``DUPS_LIMIT`` already-stored URLs have been
    seen, ``"break_page_search"`` is sent into the generator.

    Items that are empty, have ``images_len == 0`` or carry no price are
    skipped.  Every image of an accepted advert is passed to ``dl_image``.

    NOTE(review): the original text of this method had lost its indentation;
    the nesting below was reconstructed from the syntax -- please confirm,
    in particular that images are downloaded in both import modes.
    """
    FULL_IMPORT = True
    DUPS_LIMIT = 10
    KITCHEN_KEY = u'\u041f\u043b\u043e\u0449 \u043a\u0443\u0445\u043d\u0438'

    count = 0
    dups = 0
    page_dups = 0
    control = None
    obj_buffer = []
    obj_buffer_len = 100
    # URLs sitting in obj_buffer: they are not in the DB yet, so the query
    # below cannot see them and would let a repeated URL through.
    buffered_urls = set()

    if FULL_IMPORT:
        Advert.objects.all().delete()
        Advert._get_db().mongoengine.counters.remove({})

    it = ads()
    while True:
        try:
            # The first send(None) starts the generator; later sends may
            # carry a control command for it.
            obj = it.send(control)
            control = None

            if obj == {}:
                # print(x) with a single argument behaves exactly like the
                # Python 2 print statement used by this file.
                print("EMPTY")
                continue

            if KITCHEN_KEY in obj:
                # Move the scraped (Russian-labelled) field to its model name.
                obj['kitchen_area'] = obj.pop(KITCHEN_KEY)

            if obj["images_len"] == 0:
                continue
            if "price" not in obj or obj["price"] == "":
                continue

            if obj["url"] in buffered_urls or Advert.objects(url=obj["url"]):
                dups += 1
                page_dups += 1
                if page_dups >= DUPS_LIMIT and not FULL_IMPORT:
                    control = "break_page_search"
                    page_dups = 0
            else:
                try:
                    prep_obj = _prepare(obj)
                    print(obj['url'])
                    _ins = Advert(**prep_obj)
                    if FULL_IMPORT:
                        obj_buffer.append(_ins)
                        buffered_urls.add(obj["url"])
                        if len(obj_buffer) >= obj_buffer_len:
                            Advert.objects.insert(obj_buffer)
                            obj_buffer = []
                            buffered_urls.clear()
                    else:
                        _ins.save(write_concern={
                            'w': 0,
                            'j': False,
                            'wtimeout': 0
                        })
                    if prep_obj['images_len'] > 0:
                        for element in prep_obj["images"]:
                            for jit in prep_obj["images"][element]:
                                dl_image(jit)
                except UnicodeEncodeError as exc:
                    # Previously swallowed silently; keep going but leave a
                    # trace.  %r keeps the message ASCII-safe.
                    print("skipped %r: %r" % (obj.get("url"), exc))

            if count % 100 == 0:
                print("count: %s\tdups: %s" % (count, dups))
            count += 1
        except StopIteration:
            it.close()
            break

    # Flush whatever is left of the last, partially filled batch.
    if obj_buffer:
        Advert.objects.insert(obj_buffer)
    # Equivalent of the bare Python 2 ``print``: emit an empty line.
    print("")
def handle(self, *args, **options):
    """Load ``./csv_files/base_kv_utf.csv`` and save one ``Advert`` per row.

    ``args`` and ``options`` are accepted but not used.

    The file is read as ``;``-delimited CSV.  Each column is converted
    according to its name (flags to ``bool``, counters to ``int``, areas and
    price to ``float``, ``image*`` columns to the ``images`` dict, and so on);
    columns without a special rule are stored as decoded text.  Every row
    additionally receives fixed ``currency``/``current_status``/``group``/
    ``cat_tab``/``cat_type`` values before being saved.

    After the import a fixed ``Advert`` query is run and printed.

    NOTE(review): the original text of this method had lost its indentation;
    the nesting below was reconstructed from the syntax -- please confirm,
    in particular that ``images_len`` counts individual images.
    """
    # Columns that have a dedicated conversion rule below.
    special_keys = ('terms_of_sale', 'auction', 'lift', 'kitchen_area',
                    'total_area', 'living_area', 'year_built', 'floor',
                    'number_of_floors', 'price', 'region', 'city',
                    'microregion')
    int_keys = ('year_built', 'floor', 'number_of_floors')
    float_keys = ('kitchen_area', 'total_area', 'living_area', 'price')
    image_sizes = ('medium', 'thumbs', 'original')

    # ``with`` guarantees the file is closed; it used to be left open.
    with open('./csv_files/base_kv_utf.csv') as f:
        r = csv.DictReader(f, delimiter=";")
        for row in r:
            row_obj = {}
            for key in row:
                if key != 'id' and not isinstance(key, unicode):
                    try:
                        # Under Python 2 this raises UnicodeDecodeError for a
                        # non-ASCII byte-string header; such columns are
                        # skipped.
                        key = key.encode('utf-8')
                    except UnicodeDecodeError:
                        continue
                cell = row[key]

                if (key not in special_keys and 'image' not in key
                        and 'description' not in key):
                    row_obj[key] = cell.decode('utf-8')
                elif key == 'auction':
                    row_obj[key] = cell.decode('utf-8') == u'Да'
                elif key == 'microregion':
                    value = cell.decode('utf-8')
                    row_obj[key] = value
                    row_obj['region2'] = value
                elif key == 'terms_of_sale':
                    value = cell.decode('utf-8')
                    if u'аренда' in value:
                        row_obj['action_type'] = 'rent'
                    elif u'обмен' in value:
                        row_obj['action_type'] = 'exchange'
                    else:
                        row_obj['action_type'] = 'sale'
                elif key == 'lift':
                    # Any non-empty value counts as "has a lift".
                    row_obj[key] = cell is not None and cell != ''
                elif key in int_keys:
                    if cell is not None and cell != '':
                        try:
                            row_obj[key] = int(cell)
                        except ValueError:
                            continue
                elif key in float_keys:
                    if cell is not None and cell != '':
                        try:
                            # Accept a decimal comma as well as a point.
                            row_obj[key] = float(cell.replace(',', '.'))
                        except ValueError:
                            continue
                elif 'image' in key:
                    if row_obj.get('images') is None:
                        row_obj['images'] = {
                            'medium': [],
                            'thumbs': [],
                            'original': []
                        }
                        row_obj['images_len'] = 0
                    # A cell is either one comma-separated string or a list
                    # of such strings; handle both the same way.
                    chunks = cell if isinstance(cell, list) else [cell]
                    for chunk in chunks:
                        if not chunk:
                            # Empty (or missing) cell: nothing to add.
                            continue
                        for img in chunk.split(','):
                            for size in image_sizes:
                                row_obj['images'][size].append(img)
                            row_obj['images_len'] += 1
                elif 'description' in key:
                    # All description columns are concatenated.
                    if row_obj.get('description') is None:
                        row_obj['description'] = ''
                    row_obj['description'] += cell
                elif key in ['region', 'city']:
                    # Only 'city' is handled; it also derives 'region'.
                    if key == 'city':
                        value = cell.decode('utf-8')
                        for city in CITIES:
                            if city in value:
                                row_obj['region'] = value
                                break
                        if row_obj.get('region') is None:
                            row_obj['region'] = (
                                row['region'].decode('utf-8') + u' область')
                        row_obj[key] = u'г. ' + value

            row_obj['currency'] = 'usd'
            row_obj['current_status'] = 'vip_normal'
            row_obj['group'] = 'living'
            row_obj['cat_tab'] = 'flat'
            row_obj['cat_type'] = 'flat'
            adv = Advert(**row_obj)
            adv.save(write_concern={'w': 0, 'j': False, 'wtimeout': 0})

    a = Advert.objects(price__exists=True, images_len__gt=1,
                       region__icontains=u'Гомель',
                       action_type__contains='sale',
                       group='living', cat_type__in=['flat'])
    # adv_im is collected but not used further in this method.
    adv_im = []
    for adv in a:
        images = adv.get('images')
        if images is not None and len(images) > 1:
            adv_im.append(adv)
    # print(x) with a single argument behaves exactly like the Python 2
    # print statement used by this file.
    print(a)
def handle(self, *args, **options):
    """Load ``./csv_files/base_kv_utf.csv`` and save one ``Advert`` per row.

    ``args`` and ``options`` are accepted but not used.

    The file is read as ``;``-delimited CSV.  Each column is converted
    according to its name (flags to ``bool``, counters to ``int``, areas and
    price to ``float``, ``image*`` columns to the ``images`` dict, and so on);
    columns without a special rule are stored as decoded text.  Every row
    additionally receives fixed ``currency``/``current_status``/``group``/
    ``cat_tab``/``cat_type`` values before being saved.

    After the import a fixed ``Advert`` query is run and printed.

    NOTE(review): the original text of this method had lost its indentation;
    the nesting below was reconstructed from the syntax -- please confirm,
    in particular that ``images_len`` counts individual images.
    """
    # Columns that have a dedicated conversion rule below.
    special_keys = ('terms_of_sale', 'auction', 'lift', 'kitchen_area',
                    'total_area', 'living_area', 'year_built', 'floor',
                    'number_of_floors', 'price', 'region', 'city',
                    'microregion')
    int_keys = ('year_built', 'floor', 'number_of_floors')
    float_keys = ('kitchen_area', 'total_area', 'living_area', 'price')
    image_sizes = ('medium', 'thumbs', 'original')

    # ``with`` guarantees the file is closed; it used to be left open.
    with open('./csv_files/base_kv_utf.csv') as f:
        r = csv.DictReader(f, delimiter=";")
        for row in r:
            row_obj = {}
            for key in row:
                if key != 'id' and not isinstance(key, unicode):
                    try:
                        # Under Python 2 this raises UnicodeDecodeError for a
                        # non-ASCII byte-string header; such columns are
                        # skipped.
                        key = key.encode('utf-8')
                    except UnicodeDecodeError:
                        continue
                cell = row[key]

                if (key not in special_keys and 'image' not in key
                        and 'description' not in key):
                    row_obj[key] = cell.decode('utf-8')
                elif key == 'auction':
                    row_obj[key] = cell.decode('utf-8') == u'Да'
                elif key == 'microregion':
                    value = cell.decode('utf-8')
                    row_obj[key] = value
                    row_obj['region2'] = value
                elif key == 'terms_of_sale':
                    value = cell.decode('utf-8')
                    if u'аренда' in value:
                        row_obj['action_type'] = 'rent'
                    elif u'обмен' in value:
                        row_obj['action_type'] = 'exchange'
                    else:
                        row_obj['action_type'] = 'sale'
                elif key == 'lift':
                    # Any non-empty value counts as "has a lift".
                    row_obj[key] = cell is not None and cell != ''
                elif key in int_keys:
                    if cell is not None and cell != '':
                        try:
                            row_obj[key] = int(cell)
                        except ValueError:
                            continue
                elif key in float_keys:
                    if cell is not None and cell != '':
                        try:
                            # Accept a decimal comma as well as a point.
                            row_obj[key] = float(cell.replace(',', '.'))
                        except ValueError:
                            continue
                elif 'image' in key:
                    if row_obj.get('images') is None:
                        row_obj['images'] = {
                            'medium': [],
                            'thumbs': [],
                            'original': []
                        }
                        row_obj['images_len'] = 0
                    # A cell is either one comma-separated string or a list
                    # of such strings; handle both the same way.
                    chunks = cell if isinstance(cell, list) else [cell]
                    for chunk in chunks:
                        if not chunk:
                            # Empty (or missing) cell: nothing to add.
                            continue
                        for img in chunk.split(','):
                            for size in image_sizes:
                                row_obj['images'][size].append(img)
                            row_obj['images_len'] += 1
                elif 'description' in key:
                    # All description columns are concatenated.
                    if row_obj.get('description') is None:
                        row_obj['description'] = ''
                    row_obj['description'] += cell
                elif key in ['region', 'city']:
                    # Only 'city' is handled; it also derives 'region'.
                    if key == 'city':
                        value = cell.decode('utf-8')
                        for city in CITIES:
                            if city in value:
                                row_obj['region'] = value
                                break
                        if row_obj.get('region') is None:
                            row_obj['region'] = (
                                row['region'].decode('utf-8') + u' область')
                        row_obj[key] = u'г. ' + value

            row_obj['currency'] = 'usd'
            row_obj['current_status'] = 'vip_normal'
            row_obj['group'] = 'living'
            row_obj['cat_tab'] = 'flat'
            row_obj['cat_type'] = 'flat'
            adv = Advert(**row_obj)
            adv.save(write_concern={'w': 0, 'j': False, 'wtimeout': 0})

    a = Advert.objects(price__exists=True, images_len__gt=1,
                       region__icontains=u'Гомель',
                       action_type__contains='sale',
                       group='living', cat_type__in=['flat'])
    # adv_im is collected but not used further in this method.
    adv_im = []
    for adv in a:
        images = adv.get('images')
        if images is not None and len(images) > 1:
            adv_im.append(adv)
    # print(x) with a single argument behaves exactly like the Python 2
    # print statement used by this file.
    print(a)