def parsing(self, html, obj, win_size_w, win_size_h):
    """Collect wallpaper URLs matching the window size and download one at random.

    Scans the listing page `html` for screen links, visits each, and keeps
    every download URL whose WxH (parsed from the URL path segment "WxH")
    equals win_size_w x win_size_h.  One matching URL is chosen at random,
    the full-size image is saved as img/<obj>.jpg, and the updated download
    counter is returned.

    Fixes: removed dead `global i_count` / `global downl_list` statements
    (the code uses `self.i_count` / `self.downl_list`), the unused
    enumerate index, the shadowing reuse of `soup`, the redundant
    `else: continue`, and the duplicated href split.
    """
    listing = BS(html, features="html.parser")
    links = listing.find('div', id='dle-content').find_all('a', class_='screen-link')
    for link in links:
        detail = BS(get_html(link.get('href'), self.E_SITE), features="html.parser")
        for size_a in detail.find('div', class_='llink').find_all('a'):
            # Size is encoded in the URL as ".../<W>x<H>/...".
            width_s, height_s = size_a.get('href').split('/')[-2].split('x')
            if win_size_w == int(width_s) and win_size_h == int(height_s):
                self.downl_list.append(size_a.get('href'))
    random_el = random.choice(self.downl_list)
    soup2 = BS(get_html(random_el, self.E_SITE), features="html.parser")
    image_url = soup2.find(id='img').get('src')
    name = f'img/{obj}.jpg'
    save_file(image_url, name)
    print(f'|{name.split("/")[-1]:^{33}}|{"Загружен":^{10}}|')
    self.i_count += 1
    return self.i_count
def ParsPagesUrl(MainUrl):
    """Return the URL of every page of every first-level character section.

    For each link in the 'first-level' block of MainUrl, reads the section's
    pagination (defaulting to one page when the block is absent) and builds
    one URL per page by replacing the trailing character of the section URL
    with the page number.

    Fixes: narrowed the bare `except:` to the exceptions the lookup can
    actually raise, and removed the dead duplicate list initialisations.
    """
    soup = BeautifulSoup(get_html(MainUrl), 'lxml')
    ListCharPageUrlWithPage = []
    ListCharPageTag = soup.find('div', class_='first-level').find_all('a')
    ListCharPageUrl = [MainUrl + tag.get('href') for tag in ListCharPageTag]
    for CharPageUrl in ListCharPageUrl:
        soup = BeautifulSoup(get_html(CharPageUrl), 'lxml')
        try:
            Pagination = int(
                soup.find('ul', class_='pagination').find_all('a')[-1].text)
        except (AttributeError, IndexError, ValueError):
            # No pagination block on the page -> single page.
            Pagination = 1
        for num in range(1, Pagination + 1):
            ListCharPageUrlWithPage.append(CharPageUrl[0:-1] + str(num))
    return ListCharPageUrlWithPage
def main(ob):
    """Search soft.sibnet.ru and parse every page of results.

    `ob` is a (query, os) pair.  NOTE: the unpacked `os` name shadows the
    stdlib module within this function; kept because the tuple layout is
    the caller-facing contract.

    Fix: the page count returned by get_pages() was computed and then
    discarded in favour of a hard-coded `range(1, 2)` (a single page,
    clearly leftover debug scaffolding); every page is now parsed.
    """
    obj, os = ob
    url = f'http://soft.sibnet.ru/search/?text={obj.strip()}&os={os.strip()}&&pg=1'
    pages = get_pages(get_html(url, 'cp1251'))
    for i in range(1, pages + 1):
        base_url = f'http://soft.sibnet.ru/search/?text={obj.strip()}&os={os.strip()}&&pg={i}'
        parsing(get_html(base_url, 'cp1251'), obj)
def get_image(price):
    """Download every product image listed in the CSV file `price`.

    For each product row, scrapes its gallery links, creates a per-product
    directory (suffixed with a counter when the name collides with one
    already created) and writes each image plus an info.csv next to them.

    Fixes: the image-download loop reused `i` as its index, clobbering the
    directory de-duplication counter; the bare `except:` is narrowed to
    AttributeError (missing gallery element).
    """
    price_argo = read_csv(price)
    list_image_dir = []
    suffix = 1  # counter used to de-duplicate directory names
    for stuff in price_argo:
        soup = BeautifulSoup(get_html(stuff['url_stuff']), 'lxml')
        try:
            image_list = soup.find('div', id='product-gallery').find_all('a')
        except AttributeError:
            # Some product pages use the alternate gallery markup.
            image_list = soup.find('div', class_='image-border').find_all('a')
        image_list_url = []
        for image in image_list:
            url_image = image.get('href')
            if url_image in image_list_url:
                # Gallery repeats itself after the first full cycle.
                break
            image_list_url.append(url_image)
        dir_for_image = os.path.join(
            r'D:\tmp\python\python_parsing\parsing_ARGO', 'image',
            stuff['category_name'],
            stuff['name_stuff']).replace(' ', '_').replace('"', '')
        while dir_for_image in list_image_dir:
            dir_for_image = dir_for_image + '_' + str(suffix)
            suffix += 1
        os.makedirs(dir_for_image)
        list_image_dir.append(dir_for_image)
        for idx, image in enumerate(image_list_url):
            if image == '':
                continue
            with open(os.path.join(dir_for_image, '{}.jpg'.format(idx)), 'wb') as image_file:
                image_file.write(requests.get(image).content)
        write_csv(stuff, dir_for_image + '\info.csv')
def GetFilmCatalog(ListData, num):
    """For every device model, find its protective-film catalog URL.

    Visits each model page, scans the category blocks for the one named
    'Защитные пленки и стекла' and records the resulting URL (empty when
    the model has no such catalog) into MobiParsBk_mp.csv.
    """
    NameFile = r'D:\tmp\my_prod\Python\python\ParsingMobi711\MobiParsBk_mp.csv'
    for Data in ListData:
        soup = BeautifulSoup(get_html('https://mobi711.ru' + Data['Url']), 'lxml')
        try:
            DivUrlFilmCatalog = soup.find_all('div', class_='category no-description')
        except AttributeError:
            DivUrlFilmCatalog = []
        UrlFilmCatalog = ''
        if DivUrlFilmCatalog:
            Name = ''
            for Div in DivUrlFilmCatalog:
                link = Div.find('div', class_='text').find('a')
                UrlFilmCatalog = 'https://mobi711.ru' + link.get('href')
                Name = link.text
                if Name == 'Защитные пленки и стекла':
                    break
                # Not the film catalog: reset and keep scanning.
                UrlFilmCatalog = ''
                Name = ''
        write_csv({
            'Brand': Data['Brand'],
            'Model': Data['Model'],
            'UrlModel': Data['Url'],
            'UrlFilmCatalog': UrlFilmCatalog,
        }, NameFile)
def get_stuff_info(list_stuff):
    """Scrape size/colour/cloth/price for each product and append to argo.csv.

    `list_stuff` rows must carry category_name, name_stuff and url_stuff.
    Missing attributes are recorded as empty strings.

    Fixes: narrowed the bare `except:` to AttributeError (missing price
    element) and removed the redundant duplicate `price = ''` initialiser.
    """
    for stuff in list_stuff:
        soup = BeautifulSoup(get_html(stuff['url_stuff']), 'lxml')
        text_info = soup.find('div', id='test').find_all('li')
        size, color, cloth = '', '', ''
        try:
            price = soup.find('div', class_='retail-price price-prod').text
        except AttributeError:
            # Product page without a retail-price block.
            price = ''
        for text_ in text_info:
            cur_line_name = text_.find('span', class_='opts-lab').text
            cur_line_val = text_.find('span', class_='opts-val').text
            if cur_line_name == 'Размер:':
                size = cur_line_val
            elif cur_line_name == 'Цвет:':
                color = cur_line_val
            elif cur_line_name == 'Ткань:':
                cloth = cur_line_val
        data_stuff = {
            'category_name': stuff['category_name'],
            'name_stuff': stuff['name_stuff'],
            'url_stuff': stuff['url_stuff'],
            'size': size,
            'color': color,
            'cloth': cloth,
            'price': price
        }
        print(data_stuff)
        write_csv(data_stuff,
                  r'D:\tmp\my_prod\Python\python\parsing_ARGO\argo.csv')
def get_list_catalog_url(url_in):
    """Return the absolute URL of every catalog tile found on `url_in`."""
    soup = BeautifulSoup(get_html(url_in), 'lxml')
    tiles = soup.find_all(
        'div', class_='col-md-4 col-sm-6 col-xs-12 redisign-category item')
    # Prefix each tile's relative href with the page URL.
    return [url_in + tile.find('a').get('href') for tile in tiles]
def tests_get_html(self):
    """Smoke-test connectivity: the site must be online and serve the
    login form."""
    # Test 1 get html
    page = get_html()
    self.assertIn('<form name="login"', page)
def get_list_stuff_on_pages(list_catalog_url):
    """Collect every product (category, name, url) across all catalog pages.

    Reads the result counter on each catalog's first page to learn its page
    count (defaulting to 1 when the '(всего' marker is absent), then
    scrapes every page of the catalog.

    Fixes: `max_page` was left unbound (NameError) whenever the marker was
    missing; the per-page scraping code was duplicated and is now a helper.
    """

    def _collect(soup, category_name, out):
        # Append each product dict from one catalog page to `out`.
        divs = soup.find_all(
            'div', class_='product-list-item xs-100 sm-100 md-100 lg-100 xl-100')
        for stuff in divs:
            link = stuff.find('h4').find('a')
            out.append({
                'category_name': category_name,
                'name_stuff': link.text,
                'url_stuff': link.get('href')
            })

    list_stuff = []
    for catalog_url in list_catalog_url:
        soup = BeautifulSoup(get_html(catalog_url), 'lxml')
        text_about_page = soup.find(
            'div', class_='col-lg-6 col-xs-12 text-right results').text.split(' ')
        max_page = 1  # fallback: the original crashed when the marker was missing
        for i, text in enumerate(text_about_page):
            if text == '(всего':
                max_page = int(text_about_page[i + 1])
                break
        category_name = soup.find('h1').text
        _collect(soup, category_name, list_stuff)
        for page in range(1, max_page):
            url_page = catalog_url + '?page={}'.format(page + 1)
            soup = BeautifulSoup(get_html(url_page), 'lxml')
            _collect(soup, category_name, list_stuff)
    return list_stuff
def tests_get_token_from_html(self):
    """The login page must contain an extractable, non-null token.

    Fix: removed the dead `expected_value` / placeholder `actual_value`
    locals that were never used by the assertion.
    """
    # Test 2 get token from html
    html = get_html(uri=self.uri)
    self.assertIsNotNone(get_token(html))
def get_catalog_url(url_in):
    """Return [{'catalog_name', 'catalog_url'}, ...] from the navbar of `url_in`."""
    soup = BeautifulSoup(get_html(url_in), 'lxml')
    items = soup.find('ul', class_='nav navbar-nav').find_all('li')
    catalog = []
    for item in items:
        anchor = item.find('a')
        catalog.append({
            'catalog_name': anchor.text,
            'catalog_url': anchor.get('href'),
        })
    return catalog
def get_max_page(url_page='', html=''):
    """Return the page count parsed from a page's pagination block.

    Exactly one of `url_page` (fetched via get_html) or `html` (raw markup)
    should be supplied.

    Fixes: `html_new` was unbound (NameError) when both arguments were
    empty — now raises an explicit ValueError; `max_page` was unbound when
    the '(всего' marker was absent — now defaults to 1.
    """
    if url_page != '':
        html_new = get_html(url_page)
    elif html != '':
        html_new = html
    else:
        raise ValueError('get_max_page needs either url_page or html')
    soup = BeautifulSoup(html_new, 'lxml')
    pagination = soup.find('div', class_='pagination_wrap row').text.split(' ')
    max_page = 1  # fallback when the marker is missing
    for i, page in enumerate(pagination):
        if page == '(всего':
            max_page = int(pagination[i + 1])
            break
    return max_page
def ParsWordFromPage(url_in):
    """Append every word from the page's left-aligned table cells to DictWord.

    Words are written '; '-separated to python\\FindWord\\DictWord.

    Fixes: the dictionary file was re-opened once per word inside the loop
    (now opened once for the whole page) and the redundant explicit
    close() inside the with-block is removed.
    """
    soup = BeautifulSoup(get_html(url_in), 'lxml')
    ListWordTag = soup.find('table', class_='table').find_all(
        'td', class_='text-left')
    with open('python\\FindWord\\DictWord', 'a', encoding='utf-8') as Dict_:
        for WordTag in ListWordTag:
            Dict_.write(WordTag.find('a').text + '; ')
def get_catalog(utl_in):
    """Return [{'catalog_name', 'catalog_url'}, ...] from the aside menu.

    Fixes three fatal typos that made the original unrunnable: the body
    referenced undefined `url_in` instead of the `utl_in` parameter,
    `sopu` instead of `soup`, and `get('htef')` instead of `get('href')`
    (which would always have produced None URLs).
    """
    catalog = []
    html = get_html(utl_in)
    soup = BeautifulSoup(html, 'lxml')
    catalog_div = soup.find('div', class_='column-aside', id='aside').find(
        'ul', class_='menu').find_all('li', class_='menu__item')
    for catalog_tmp in catalog_div:
        link = catalog_tmp.find('a')
        catalog.append({
            'catalog_name': link.text,
            'catalog_url': link.get('href'),
        })
    return catalog
def tests_file_type(self):
    """After logging in, the test image must be served with the expected
    Content-Type."""
    html = get_html(uri=self.uri)
    self.login_data['tcurl'] = get_token(html)
    opener = my_opener(login_data=self.login_data)
    response = opener.open(self.test_image_uri)
    content_type = response.info().get('Content-Type')
    self.assertIn(self.file_type, content_type)
def main(self, obj):
    """Download one random 1920x1080 wallpaper tagged `obj`, printing a
    small progress table and the elapsed time."""
    start_time = datetime.now()
    win_width, win_height = 1920, 1080
    # The tag is URL-quoted once and reused for both page URLs.
    tag = quote(obj.strip(), encoding=self.E_SITE)
    url = f'https://www.nastol.com.ua/tags/{tag}/page/1/'
    pages, img_count = self.get_pages(get_html(url, self.E_SITE))
    page = random.randint(1, pages)
    base_url = f'https://www.nastol.com.ua/tags/{tag}/page/{page}/'
    print(f'{img_count}\nСтраница:{page}-{pages}')
    print('-' * 51)
    print(f'|{"Категория - имя файла":{33}}|{"Статус":{10}}|')
    print('-' * 51)
    inc = self.parsing(get_html(base_url, self.E_SITE), obj, win_width, win_height)
    print('-' * 51, end='\n')
    print(f'Скачано:{inc}')
    end_time = datetime.now()
    print(
        f'Затрачено времени:{str(end_time - start_time).split(".")[0]:^{50}}'
    )
    print('-' * 51, end='\n')
def get_stuff_info(list_stuff_on_site):
    """Scrape description, attributes, sizes and gallery images per product.

    Writes each record to the master ck_textil.csv, downloads every gallery
    image into a per-product directory and drops an info.csv beside them.

    Fixes: the `size_span` lookup was duplicated UNGUARDED right after its
    try/except, so the AttributeError it was meant to catch escaped anyway
    (duplicate removed); the original computed ','.join(size) and discarded
    the result while storing the raw list — the join result is now actually
    stored (this looks like the intended behaviour — verify CSV consumers);
    a dead `image_tmp.get('href')` statement is removed.
    """
    main_dir = r'D:\tmp\python\python_parsing\parsing_ck_textil'
    for stuff in list_stuff_on_site:
        html = get_html(stuff['stuff_url'])
        # Debug dump of the last fetched page.
        with open('0.html', 'w', encoding='utf-8') as f:
            f.write(html)
        soup = BeautifulSoup(html, 'lxml')
        try:
            descr = soup.find('div', id='tab-description').text
        except AttributeError:
            descr = ''
        try:
            atr = soup.find(
                'div', id='tab-specification').find('div', class_='attribute').text
        except AttributeError:
            atr = ''
        try:
            size_span = soup.find('div', class_='option row').find(
                'tbody').find_all('span', class_='size-title')
        except AttributeError:
            size_span = []
        size = ','.join(span.text for span in size_span)
        data = {'catalog_name': stuff['catalog_name'],
                'stuff_name': stuff['stuff_name'],
                'stuff_url': stuff['stuff_url'],
                'descr': descr.replace('\n', ' ').replace('\r', ' '),
                'size': size,
                'atr': atr.replace('\n', ' ').replace('\r', ' ')}
        write_csv(
            data,
            r'D:\tmp\python\python_parsing\parsing_ck_textil\ck_textil.csv')
        try:
            image_tag_a_list = soup.find(
                'div', class_='MagicToolboxSelectorsContainer').find_all('a')
        except AttributeError:
            # Single-image pages use the alternate container markup.
            image_tag_a_list = soup.find(
                'div',
                class_='MagicToolboxContainer selectorsBottom minWidth').find_all('a')
        image_dir = os.path.join(
            main_dir, stuff['catalog_name'], stuff['stuff_name']).replace(' ', '_')
        os.makedirs(image_dir)
        for i, image_tmp in enumerate(image_tag_a_list):
            # hrefs are protocol-relative ('//host/...'): prepend scheme.
            image_url = 'https://' + image_tmp.get('href')[2:]
            with open(os.path.join(image_dir, '{}.jpg'.format(i)), 'wb') as file:
                file.write(requests.get(image_url).content)
        write_csv(data, os.path.join(image_dir, 'info.csv'))
def get_stuff_on_page(url_catalog):
    """Walk every page of every catalog item and record each product to CSV.

    Page 1 is the bare catalog URL; subsequent pages append '/page-N'.

    Fix: removed the redundant `f.close()` inside the with-block (the
    context manager already closes the file).
    """
    for item in url_catalog:
        for page in range(item['max_page']):
            if page == 0:
                html = get_html(item['url'])
            else:
                html = get_html(item['url'] + '/page-{}'.format(page + 1))
            soup = BeautifulSoup(html, 'lxml')
            # Debug dump of the last fetched page.
            with open('0.html', 'w', encoding='utf-8') as f:
                f.write(html)
            stuff_div = soup.find(
                'div', class_='catalog-collection cleared').find_all('h3')
            for stuff in stuff_div:
                link = stuff.find('a')
                write_csv({
                    'catalog_name': item['catalog_name'],
                    'stuff_name': link.text,
                    'stuff_url': 'http://formateks.ru' + link.get('href')
                }, 'formarket.csv')
def ListModel(mainUrl, urlBrand):
    """Return [{'nameModel', 'urlModel'}, ...] for every model on a brand page.

    Characters unsafe for file names are stripped from model names.
    """
    soup = BeautifulSoup(get_html(urlBrand), 'lxml')
    models = []
    for span in soup.find_all('span', class_='brandphonename'):
        anchor = span.find('a')
        name = anchor.text
        for bad in ['\"', '\\', '/', '\'', ':', '|']:
            name = name.replace(bad, '')
        models.append({'nameModel': name, 'urlModel': anchor.get('href')})
    return models
def get_stuff_on_page(page_catalog_url):
    """Scrape one catalog page into product dicts; each is also appended to ck.csv."""
    soup = BeautifulSoup(get_html(page_catalog_url), 'lxml')
    catalog_name = soup.find('h1').text
    collected = []
    for thumb in soup.find_all('div', class_='product-thumb transition'):
        anchor = thumb.find('div', class_='caption').find('a')
        record = {'catalog_name': catalog_name,
                  'stuff_name': anchor.text.replace('"', ''),
                  'stuff_url': anchor.get('href')}
        collected.append(record)
        write_csv(record, 'ck.csv')
    return collected
def GetImg(urlModel):
    """Return {'urlImg', 'nameImg'} for the device picture on a model page.

    Both values are None when the picture element (or its alt text) is
    missing; filename-unsafe characters are stripped from the name.

    Fixes: narrowed the bare `except:` to AttributeError (the only
    expected failure: a missing element / missing attribute) and removed
    the duplicated `soup.find` call.
    """
    soup = BeautifulSoup(get_html(urlModel), 'lxml')
    try:
        pic = soup.find('img', class_='b-devPic__picNew')
        urlImg = pic.get('src')
        nameImg = pic.get('alt')
        for c in ['\"', '\\', '/', '\'', ':', '|']:
            nameImg = nameImg.replace(c, '')
    except AttributeError:
        urlImg = None
        nameImg = None
    return {'urlImg': urlImg, 'nameImg': nameImg}
def ListBrand(urlIn):
    """Return [{'nameBrand', 'urlBrand'}, ...] for every brand tile on `urlIn`.

    Filename-unsafe characters are stripped from brand names.
    """
    soup = BeautifulSoup(get_html(urlIn), 'lxml')
    tiles = soup.find_all(
        'div', class_='b-listli b-listli_big b-listli_u p-listul__listli')
    result = []
    for tile in tiles:
        anchor = tile.find('a')
        name = anchor.text
        for bad in ['\"', '\\', '/', '\'', ':', '|']:
            name = name.replace(bad, '')
        result.append({'nameBrand': name, 'urlBrand': anchor.get('href')})
    return result
def tests_access_picture_max_size(self):
    """After logging in, the max-size picture must be served with the
    expected Content-Type (i.e. we really have access to it)."""
    # Test 3 are we have access to an picture of max size?
    html = get_html(uri=self.uri)
    self.login_data['tcurl'] = get_token(html)
    opener = my_opener(login_data=self.login_data)
    response = opener.open(self.test_image_uri)
    content_type = response.info().get('Content-Type')
    self.assertIn(self.file_type, content_type)
def GetListBrandAndModels(mainUrl):
    """Record every (brand, model, url) triple found on `mainUrl` to Mobi711.csv.

    The 'Показать еще' pseudo-entry is skipped; brands without a model
    sub-list contribute nothing.
    """
    soup = BeautifulSoup(get_html(mainUrl), 'lxml')
    for Div in soup.find_all('div', class_='category-wrap'):
        BrandName = Div.find('div', class_='text').find('a').text
        try:
            ListModelsA = Div.find('div', class_='sub').find_all('a')
        except AttributeError:
            # No sub-list for this brand; iterate over nothing.
            ListModelsA = ''
        for A in ListModelsA:
            Text = A.text
            if Text != 'Показать еще':
                write_csv({'Brand': BrandName,
                           'Model': Text,
                           'Url': A.get('href')},
                          'Mobi711.csv')
def get_all_stuff(list_stuff):
    """Scrape description and main image per product; save image + info CSVs.

    Products without an image are skipped entirely.

    Fixes: directory de-duplication always appended '_1' (so a third
    duplicate crashed os.makedirs on an existing path) and never recorded
    the suffixed name in list_dir — replaced with an incrementing counter
    that records every created directory; bare excepts narrowed to
    AttributeError; redundant close() calls inside with-blocks removed.
    """
    main_dir = r'D:\tmp\python\python_parsing\parsing_formateks'
    list_dir = []
    for stuff in list_stuff:
        html = get_html(stuff['stuff_url'])
        soup = BeautifulSoup(html, 'lxml')
        # Debug dump of the last fetched page.
        with open('0.html', 'w', encoding='utf-8') as file:
            file.write(html)
        try:
            descr = soup.find(
                'div',
                class_='block-text block-type-catalogitem-text textcontent'
            ).find('p').text
        except AttributeError:
            descr = ''
        try:
            url_image = soup.find('div', class_='block-picture').find('a').get('href')
        except AttributeError:
            # No picture -> nothing to save for this product.
            continue
        catalog_name = stuff['catalog_name']
        stuff_name = stuff['stuff_name'].replace('"', '')
        data = {
            'catalog_name': catalog_name,
            'stuff_name': stuff_name,
            'stuff_url': stuff['stuff_url'],
            'stuff_descr': descr
        }
        base_dir = os.path.join(main_dir, catalog_name,
                                stuff_name).replace(' ', '_').lower()
        new_dir = base_dir
        n = 1
        while new_dir in list_dir:
            new_dir = '{}_{}'.format(base_dir, n)
            n += 1
        list_dir.append(new_dir)
        os.makedirs(new_dir)
        write_csv(data, os.path.join(new_dir, 'info.csv'))
        write_csv(data, os.path.join(main_dir, 'formarket.csv'))
        with open(os.path.join(new_dir, '0.jpg'), 'wb') as file:
            file.write(requests.get(url_image).content)