def handle(self, *args, **options):
    print('Clearing DB')
    Category.objects.all().delete()
    Product.objects.all().delete()

    print('Start importing from excel %s' % DATA_DIR)
    wb = load_workbook(DATA_DIR + '/price.xlsx')
    # wb[wb.sheetnames[0]] replaces the deprecated get_sheet_by_name()
    sheet = wb[wb.sheetnames[0]]
    cat = None
    # max_row is inclusive, so iterate up to max_row + 1
    for cnt in range(1, sheet.max_row + 1):
        item = sheet.cell(row=cnt, column=3).value
        row_id = sheet.cell(row=cnt, column=2).value  # avoid shadowing builtin id()
        if row_id is None:  # rows without an id hold category names
            print('Create a new category')
            cat = Category()
            cat.name = item
            cat.save()
        else:
            print('Create a new good')
            if cat:
                p = Product()
                p.name = item
                p.category = cat
                p.save()
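Each snippet in this section is the handle() method of a Django management command. For context, a minimal sketch of the module such a method lives in; the file path, the app name 'shop', and the DATA_DIR definition are assumptions for illustration, not taken from the source:

# shop/management/commands/import_excel.py  (hypothetical path)
import os

from django.conf import settings
from django.core.management.base import BaseCommand
from openpyxl import load_workbook

from shop.models import Category, Product  # app name is an assumption

DATA_DIR = os.path.join(settings.BASE_DIR, 'data')  # illustrative


class Command(BaseCommand):
    help = 'Import categories and products from price.xlsx'

    def handle(self, *args, **options):
        ...  # body as in the snippet above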
def handle(self, *args, **options):
    print('Clearing DB ...')
    # delete existing records and downloaded images
    Category.objects.all().delete()
    SubCategory.objects.all().delete()
    try:
        shutil.rmtree('%s/media' % BASE_DIR)
    except FileNotFoundError:
        pass

    # parse the main page
    base_url = 'https://west-info.biz/katalog-predpriyatij/'
    print(f'Start import from {base_url}')
    res = requests.get(base_url)
    soup = bs(res.text, 'html.parser')  # bs = BeautifulSoup alias
    # locate the menu block we need
    content = soup.find('ul', {'class': 'submenu'})
    for item in content.findAll('li', {'class': 'submenu_item'}):
        c = Category()
        c.name = item.find('a').text
        c.save()
        print(f'Import {c.name}')
        # second-level links inside the same menu item become subcategories
        subcategories = item.findAll('a', {'class': 'sub2menu_link'})
        for k in subcategories:
            sub = SubCategory()
            sub.name = k.text
            sub.category = c
            sub.save()
            print(f'Import {sub.name}')
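The commands above and below assume Category, SubCategory, and Product models. A minimal sketch of what they might look like; field names come from the snippets, while field types, sizes, and upload paths are assumptions:

from django.db import models


class Category(models.Model):
    name = models.CharField(max_length=255)
    # the gastronoma.net importers save a picture per category
    image = models.ImageField(upload_to='category', blank=True)


class SubCategory(models.Model):
    name = models.CharField(max_length=255)
    category = models.ForeignKey(Category, on_delete=models.CASCADE)


class Product(models.Model):
    name = models.CharField(max_length=255)
    category = models.ForeignKey(Category, on_delete=models.CASCADE)
    sub_category = models.ForeignKey(
        SubCategory, null=True, blank=True, on_delete=models.CASCADE)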
def handle(self, *args, **options):
    print('Clearing DB')
    # delete existing records and downloaded images
    Category.objects.all().delete()
    Product.objects.all().delete()
    try:
        shutil.rmtree('%s/media/product' % BASE_DIR)
    except FileNotFoundError:
        pass

    # fetch and parse the main page
    URL = 'https://tainabox.com.ua'
    print('Start importing from %s' % URL)
    rez = requests.get(URL)
    # rez = requests.get(URL, verify=False)
    soup = BeautifulSoup(rez.text, 'html.parser')
    # locate the header div and the menu links inside it
    content = soup.find('div', {'id': 'header-custom-middle-block'})
    for menu in content.findAll('ul', {'class': 'header__mid-menu'}):
        for category in menu.findAll('span', {'class': 'hlink'}):
            time.sleep(random.randint(1, 5))  # pause to avoid hammering the site
            c = Category()
            c.name = category.text
            c.save()
            # the real URL is stored base64-encoded in the data-href attribute
            link = base64.b64decode(category.attrs['data-href']).decode('utf-8')
            get_products(link, c)
        for category in menu.findAll('a'):
            print(category.text)
            c = Category()
            c.name = category.text
            c.save()
            get_products(category['href'], c)
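The get_products() helper this command calls is not shown in the source. A plausible sketch of what it might do, assuming each product sits in a card with a link holding the name; the CSS selector and the card structure are guesses for illustration:

def get_products(link, category):
    """Hypothetical helper: scrape one category page and save its products."""
    rez = requests.get(link)
    soup = BeautifulSoup(rez.text, 'html.parser')
    # 'product-card' is a placeholder class, not taken from the source
    for card in soup.findAll('div', {'class': 'product-card'}):
        p = Product()
        p.name = card.find('a').text.strip()
        p.category = category
        p.save()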
def handle(self, *args, **options):
    print('Clearing DB')
    # delete existing records and downloaded images
    Category.objects.all().delete()
    SubCategory.objects.all().delete()
    Product.objects.all().delete()
    try:
        shutil.rmtree('%s/media' % BASE_DIR)
    except FileNotFoundError:
        pass

    # fetch and parse the main page
    URL = 'https://gastronoma.net'
    print('Start importing from %s' % URL)
    rez = requests.get(URL, verify=False)
    soup = BeautifulSoup(rez.text, 'html.parser')
    # locate the div we need and the images inside it
    content = soup.find('div', {'class': 'body_20'})
    for img in content.findAll('img'):
        c = Category()
        c.name = img.get('alt')
        img_url = 'https://gastronoma.net/%s' % img.get('src')
        img_response = requests.get(img_url, stream=True, verify=False)
        # save the image to a temporary file
        with open('tmp.png', 'wb') as out_file:
            shutil.copyfileobj(img_response.raw, out_file)
        # read the temporary file back and attach it to the model field
        with open('%s/tmp.png' % BASE_DIR, 'rb') as img_file:
            c.image.save('cat.png', File(img_file), save=True)
        c.save()
        # collect the subcategories listed next to each image
        for subcat in img.find_parent('tr').find('div').findAll('a'):
            sc = SubCategory()
            sc.category = c
            sc.name = subcat.text
            sc.save()
            get_products(c, sc, subcat.get('href'))
def handle(self, *args, **options):
    print('Clearing DB')
    # delete existing records and downloaded images
    Category.objects.all().delete()
    SubCategory.objects.all().delete()
    Product.objects.all().delete()
    try:
        # clear the directory of previously downloaded images
        shutil.rmtree('%s/media/category' % BASE_DIR)
    except FileNotFoundError:
        pass

    # fetch and parse the main page
    URL = 'https://gastronoma.net'
    print('Start importing from %s' % URL)
    # suppress the warning urllib3 prints when certificate
    # verification is skipped (verify=False below)
    requests.packages.urllib3.disable_warnings(category=InsecureRequestWarning)
    rez = requests.get(URL, verify=False)
    soup = BeautifulSoup(rez.text, 'html.parser')
    # locate the div we need and the images inside it
    content = soup.find('div', {'class': 'body_20'})
    for img in content.findAll('img'):
        c = Category()
        c.name = img.get('alt')
        img_url = 'https://gastronoma.net/%s' % img.get('src')
        img_response = requests.get(img_url, stream=True, verify=False)
        # save the image to a temporary file ('w' - write, 'b' - binary)
        with open('tmp.png', 'wb') as out_file:
            shutil.copyfileobj(img_response.raw, out_file)
        # read the temporary file back and attach it to the model field
        with open('%s/tmp.png' % BASE_DIR, 'rb') as img_file:
            c.image.save('cat.png', File(img_file), save=True)
        c.save()
        # collect the subcategories listed next to each image
        for subcat in img.find_parent('tr').find('div').findAll('a'):
            sc = SubCategory()
            sc.name = subcat.text
            sc.category = c
            sc.save()
            get_products(c, sc, subcat.get('href'))
        print('Saving... %s' % c.name)
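Both gastronoma.net variants call get_products(c, sc, href), which the source does not show either. A minimal sketch under the same assumptions as the handlers; the selector is a placeholder, the real markup may differ:

def get_products(category, sub_category, href):
    """Hypothetical helper: scrape one subcategory page into Product rows."""
    url = 'https://gastronoma.net/%s' % href
    rez = requests.get(url, verify=False)
    soup = BeautifulSoup(rez.text, 'html.parser')
    # 'item-name' is a placeholder class, not taken from the source
    for link in soup.findAll('a', {'class': 'item-name'}):
        p = Product()
        p.name = link.text.strip()
        p.category = category
        p.sub_category = sub_category
        p.save()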
def handle(self, *args, **options):
    print('Clear DB')
    Category.objects.all().delete()
    Product.objects.all().delete()

    print('Start import from excel %s' % DATA_DIR)
    # os.path.join keeps the path portable across OSes
    wb = load_workbook(os.path.join(DATA_DIR, 'price.xlsx'))
    # wb[wb.sheetnames[0]] replaces the deprecated get_sheet_by_name()
    worksheet = wb[wb.sheetnames[0]]
    category = None
    for cnt in range(1, worksheet.max_row + 1):
        item = worksheet.cell(row=cnt, column=5).value
        cat = worksheet.cell(row=cnt, column=1).value
        if item is None:  # rows without an item hold category names
            category = Category()
            category.name = cat
            category.save()
            print('Create category')
        else:
            if category:
                product = Product()
                product.name = item
                product.category = category
                product.save()
                print('Create item')
def handle(self, *args, **options):
    print('Clearing DB ...')
    # delete existing records and downloaded images
    Category.objects.all().delete()
    SubCategory.objects.all().delete()
    Company.objects.all().delete()
    try:
        shutil.rmtree('%s/media' % BASE_DIR)
    except FileNotFoundError:
        pass

    # parse the main page
    base_url = 'https://west-info.biz/katalog-predpriyatij/'
    print(f'Start import from {base_url}')
    res = requests.get(base_url)
    soup = BeautifulSoup(res.text, 'html.parser')
    # locate the menu items we need
    categories = soup.findAll('li', {'class': 'submenu_item'})
    for it in categories[:5]:  # limit the import to the first five categories
        c = Category()
        c.name = it.find('a').text
        c.save()
        print(f'Import {c.name}')
        subcategories = it.findAll('a', {'class': 'sub2menu_link'})
        for kat in subcategories:
            sub = SubCategory()
            sub.name = kat.text
            sub.category = c
            sub.save()
            print(f'Import {sub.name}')
            # follow the subcategory link and scrape the companies on it
            new_url = f"https://west-info.biz/katalog-predpriyatij{kat['href']}"
            catalog = requests.get(new_url)
            new_soup = BeautifulSoup(catalog.text, 'html.parser')
            div = new_soup.findAll('div', {'class': 'teaser-item'})
            for item in div:
                firm_name = item.find('h2', {'class': 'pos-title'})
                firm_description = item.find('p')
                firm_city = item.find('div', {'class': 'element element-text'})
                firm_adress = item.findAll('div', {'class': 'element element-text'})
                firm_phones = item.find('div', {'class': 'element element-text last'})
                for link in item.findAll('img'):
                    print(link['src'])
                phones_list = []
                if firm_phones:
                    # normalise separators, then split into individual numbers
                    phones_list = firm_phones.text.replace(' ', '').replace(
                        ',', ' ').replace(';', ' ').split()
                com = Company()
                if firm_name:
                    com.name = firm_name.text
                if firm_description:
                    com.description = firm_description.text
                if firm_city:
                    com.city = firm_city.text
                if firm_adress and len(firm_adress) >= 2:
                    com.adress = firm_adress[1].text
                else:
                    com.adress = '-'
                for z in phones_list:
                    com.phone = z  # only the last number ends up in the field
                com.category = c
                com.sub_category = sub
                com.save()
                print(f'{com.name} save...')
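This last variant also assumes a Company model. A minimal sketch with the fields the snippet assigns; the 'adress' spelling follows the code, while field types and sizes are assumptions:

class Company(models.Model):
    name = models.CharField(max_length=255)
    description = models.TextField(blank=True)
    city = models.CharField(max_length=255, blank=True)
    adress = models.CharField(max_length=255, blank=True)  # spelling as in the snippet
    phone = models.CharField(max_length=255, blank=True)
    category = models.ForeignKey(Category, on_delete=models.CASCADE)
    sub_category = models.ForeignKey(SubCategory, on_delete=models.CASCADE)

Once each handle() is wired into a Command class under management/commands/, the importer runs as, e.g., python manage.py import_excel (command names match the module file names and are illustrative here).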