import os

import requests
from bs4 import BeautifulSoup

# get_html, read_csv and write_csv are project helpers that are not defined
# in this file; a sketch of what they likely look like follows get_image below.


def get_image(price):
    # Download every gallery image for each product listed in the price CSV.
    price_argo = read_csv(price)
    list_image_dir = []
    suffix = 1
    for stuff in price_argo:
        html = get_html(stuff['url_stuff'])
        soup = BeautifulSoup(html, 'lxml')
        # The gallery container differs between page layouts.
        try:
            image_list = soup.find('div', id='product-gallery').find_all('a')
        except AttributeError:
            image_list = soup.find('div', class_='image-border').find_all('a')
        # Collect image URLs, stopping at the first repeat (the gallery
        # markup duplicates its links).
        image_list_url = []
        for image in image_list:
            url_image = image.get('href')
            if url_image in image_list_url:
                break
            image_list_url.append(url_image)
        dir_for_image = os.path.join(
            r'D:\tmp\python\python_parsing\parsing_ARGO', 'image',
            stuff['category_name'],
            stuff['name_stuff']).replace(' ', '_').replace('"', '')
        # Disambiguate products that map to the same directory name. The
        # original reused i both here and as the enumerate index below,
        # clobbering the counter.
        while dir_for_image in list_image_dir:
            dir_for_image = dir_for_image + '_' + str(suffix)
            suffix += 1
        os.makedirs(dir_for_image, exist_ok=True)
        list_image_dir.append(dir_for_image)
        for i, image in enumerate(image_list_url):
            if image == '':
                continue
            with open(os.path.join(dir_for_image, '{}.jpg'.format(i)),
                      'wb') as image_file:
                image_file.write(requests.get(image).content)
        write_csv(stuff, os.path.join(dir_for_image, 'info.csv'))
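# A minimal sketch of the helpers this file assumes, based on how they are
# called: get_html fetches a page, read_csv loads a CSV into a list of dicts,
# and write_csv appends one dict as a CSV row. The real implementations (in
# the project's my_lib module, judging by the import at the bottom of this
# file) may differ.

import csv


def get_html(url):
    # Fetch a page and return its markup as text.
    response = requests.get(url)
    response.raise_for_status()
    return response.text


def read_csv(path):
    # Read a CSV file into a list of dicts keyed by the header row.
    with open(path, encoding='utf-8') as f:
        return list(csv.DictReader(f))


def write_csv(data, path):
    # Append one dict as a CSV row, writing the header on first use.
    is_new_file = not os.path.exists(path)
    with open(path, 'a', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=list(data.keys()))
        if is_new_file:
            writer.writeheader()
        writer.writerow(data)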
def GetFilmCatalog(ListData, num):
    # For every phone model, find the link to its protective-films catalog.
    # The num parameter is unused but kept from the original signature.
    NameFile = r'D:\tmp\my_prod\Python\python\ParsingMobi711\MobiParsBk_mp.csv'
    for Data in ListData:
        html = get_html('https://mobi711.ru' + Data['Url'])
        soup = BeautifulSoup(html, 'lxml')
        # find_all never raises AttributeError; it returns [] when nothing
        # matches, so no try/except is needed here.
        DivUrlFilmCatalog = soup.find_all('div', class_='category no-description')
        if DivUrlFilmCatalog:
            for Div in DivUrlFilmCatalog:
                UrlFilmCatalog = 'https://mobi711.ru' + \
                    Div.find('div', class_='text').find('a').get('href')
                Name = Div.find('div', class_='text').find('a').text
                # 'Защитные пленки и стекла' = 'Protective films and glasses'.
                if Name == 'Защитные пленки и стекла':
                    break
            else:
                UrlFilmCatalog = ''
        else:
            UrlFilmCatalog = ''
        data = {
            'Brand': Data['Brand'],
            'Model': Data['Model'],
            'UrlModel': Data['Url'],
            'UrlFilmCatalog': UrlFilmCatalog
        }
        write_csv(data, NameFile)
def get_stuff_info(list_stuff):
    # Scrape size, colour, fabric and price from each product page.
    for stuff in list_stuff:
        url_stuff = stuff['url_stuff']
        html_stuff = get_html(url_stuff)
        soup = BeautifulSoup(html_stuff, 'lxml')
        text_info = soup.find('div', id='test').find_all('li')
        size, color, cloth = '', '', ''
        try:
            price = soup.find('div', class_='retail-price price-prod').text
        except AttributeError:
            price = ''
        # Each <li> holds a label span and a value span.
        for text_ in text_info:
            cur_line_name = text_.find('span', class_='opts-lab').text
            cur_line_val = text_.find('span', class_='opts-val').text
            if cur_line_name == 'Размер:':   # 'Size:'
                size = cur_line_val
            elif cur_line_name == 'Цвет:':   # 'Colour:'
                color = cur_line_val
            elif cur_line_name == 'Ткань:':  # 'Fabric:'
                cloth = cur_line_val
        data_stuff = {
            'category_name': stuff['category_name'],
            'name_stuff': stuff['name_stuff'],
            'url_stuff': stuff['url_stuff'],
            'size': size,
            'color': color,
            'cloth': cloth,
            'price': price
        }
        print(data_stuff)
        write_csv(data_stuff,
                  r'D:\tmp\my_prod\Python\python\parsing_ARGO\argo.csv')
def get_information(driver_new, name_stuff, stuff_url, size, color, cloth):
    # The original version read all of these names from module-level state;
    # passing them in makes the function self-contained.
    data = {
        'name_stuff': name_stuff,
        'url_stuff': stuff_url,
        'size': size,
        'color': color,
        'cloth': cloth
    }
    # Close the Selenium driver used to render the page.
    driver_new.quit()
    print(data)
    write_csv(data, r'D:\tmp\my_prod\Python\python\parsing_ARGO\argo.csv')
def main():
    # Ask where to look and where to save, then list the directory to a CSV.
    path = input('Enter the folder path: ')
    if path == '':
        path = os.curdir
    # The original called os.path.abspath before the emptiness check, so the
    # default-to-current-directory branch could never fire.
    path = os.path.abspath(path)
    FileName = input('Enter the path and name of the results file: ')
    if FileName == '':
        FileName = 'ListFile.csv'
    for line in os.listdir(path):
        Data = {'File name': line}
        write_csv(Data, FileName)
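# Hypothetical entry point; the original file does not show how main() is
# invoked, but a standard guard is the likely pattern.

if __name__ == '__main__':
    main()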
def GetData(brand, model, img, mainUrl):
    # Record one brand/model row and download its image when one exists.
    dataCsv = {
        'Brand': brand['nameBrand'],
        'Model': model['nameModel'],
        'Img Directory': (brand['nameBrand'] + '\\' + model['nameModel'] +
                          '\\' + img['nameImg'] + '.jpg'
                          if img['nameImg'] is not None else 'None')
    }
    if img['nameImg'] is not None:
        pathImg = os.path.join('tablets', brand['nameBrand'], model['nameModel'])
        # exist_ok avoids the crash the original hit when the directory
        # already existed.
        os.makedirs(pathImg, exist_ok=True)
        with open(os.path.join(pathImg, img['nameImg'] + '.jpg'), 'wb') as imageFile:
            imageFile.write(requests.get(mainUrl + img['urlImg']).content)
    write_csv(dataCsv, 'Tablet.csv')
def get_stuff_info(list_stuff_on_site):
    # Scrape description, attributes, sizes and gallery images per product.
    for stuff in list_stuff_on_site:
        html = get_html(stuff['stuff_url'])
        # Dump the last fetched page for debugging.
        with open('0.html', 'w', encoding='utf-8') as f:
            f.write(html)
        soup = BeautifulSoup(html, 'lxml')
        try:
            descr = soup.find('div', id='tab-description').text
        except AttributeError:
            descr = ''
        try:
            atr = soup.find('div', id='tab-specification').find(
                'div', class_='attribute').text
        except AttributeError:
            atr = ''
        # The original repeated this lookup unguarded after the except,
        # defeating the fallback.
        try:
            size_span = soup.find('div', class_='option row').find(
                'tbody').find_all('span', class_='size-title')
        except AttributeError:
            size_span = []
        size = []
        for size_tmp in size_span:
            size.append(size_tmp.text)
        size = ','.join(size)
        data = {'catalog_name': stuff['catalog_name'],
                'stuff_name': stuff['stuff_name'],
                'stuff_url': stuff['stuff_url'],
                'descr': descr.replace('\n', ' ').replace('\r', ' '),
                'size': size,
                'atr': atr.replace('\n', ' ').replace('\r', ' ')}
        main_dir = r'D:\tmp\python\python_parsing\parsing_ck_textil'
        write_csv(data, os.path.join(main_dir, 'ck_textil.csv'))
        # The gallery container class differs between page layouts.
        try:
            image_tag_a_list = soup.find(
                'div', class_='MagicToolboxSelectorsContainer').find_all('a')
        except AttributeError:
            image_tag_a_list = soup.find(
                'div',
                class_='MagicToolboxContainer selectorsBottom minWidth'
            ).find_all('a')
        image_dir = os.path.join(main_dir, stuff['catalog_name'],
                                 stuff['stuff_name']).replace(' ', '_')
        os.makedirs(image_dir, exist_ok=True)
        for i, image_tmp in enumerate(image_tag_a_list):
            # hrefs are protocol-relative ('//host/...'), so prepend https:.
            image_url = 'https://' + image_tmp.get('href')[2:]
            with open(os.path.join(image_dir, '{}.jpg'.format(i)), 'wb') as file:
                file.write(requests.get(image_url).content)
        write_csv(data, os.path.join(image_dir, 'info.csv'))
def get_stuff_on_page(page_catalog_url):
    # Collect name and URL for every product on one catalog page.
    list_stuff_on_page = []
    html = get_html(page_catalog_url)
    soup = BeautifulSoup(html, 'lxml')
    list_stuff = soup.find_all('div', class_='product-thumb transition')
    catalog_name = soup.find('h1').text
    for stuff in list_stuff:
        url = stuff.find('div', class_='caption').find('a').get('href')
        stuff_name = stuff.find(
            'div', class_='caption').find('a').text.replace('"', '')
        data = {'catalog_name': catalog_name,
                'stuff_name': stuff_name,
                'stuff_url': url}
        list_stuff_on_page.append(data)
        write_csv(data, 'ck.csv')
    return list_stuff_on_page
def GetListBrandAndModels(mainUrl):
    # Walk the brand blocks and record every model name and URL.
    html = get_html(mainUrl)
    soup = BeautifulSoup(html, 'lxml')
    ListBrandDiv = soup.find_all('div', class_='category-wrap')
    for Div in ListBrandDiv:
        BrandName = Div.find('div', class_='text').find('a').text
        try:
            ListModelsA = Div.find('div', class_='sub').find_all('a')
        except AttributeError:
            # No model sub-list for this brand.
            ListModelsA = []
        for A in ListModelsA:
            Text = A.text
            # Skip the 'Показать еще' ('Show more') pager link.
            if Text != 'Показать еще':
                ModelName = Text
                Url = A.get('href')
                data = {'Brand': BrandName, 'Model': ModelName, 'Url': Url}
                write_csv(data, 'Mobi711.csv')
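# A hedged sketch of how the two Mobi711 steps could be chained: the function
# above writes Mobi711.csv, and GetFilmCatalog earlier in this file expects
# rows with 'Brand', 'Model' and 'Url' keys. The start URL argument, the
# read_csv round-trip and this driver function are assumptions, not shown in
# the original code.

def run_mobi711(start_url):
    GetListBrandAndModels(start_url)
    # num is unused by GetFilmCatalog, so any value works here.
    GetFilmCatalog(read_csv('Mobi711.csv'), 0)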
def get_all_stuff(list_stuff):
    # Save description, CSV rows and the main photo for each product.
    list_dir = []
    for stuff in list_stuff:
        html = get_html(stuff['stuff_url'])
        soup = BeautifulSoup(html, 'lxml')
        # Dump the last fetched page for debugging.
        with open('0.html', 'w', encoding='utf-8') as file:
            file.write(html)
        try:
            descr = soup.find(
                'div',
                class_='block-text block-type-catalogitem-text textcontent'
            ).find('p').text
        except AttributeError:
            descr = ''
        try:
            url_image = soup.find('div', class_='block-picture').find('a').get('href')
        except AttributeError:
            # No photo block; skip the product entirely.
            continue
        catalog_name = stuff['catalog_name']
        stuff_name = stuff['stuff_name'].replace('"', '')
        stuff_url = stuff['stuff_url']
        data = {
            'catalog_name': catalog_name,
            'stuff_name': stuff_name,
            'stuff_url': stuff_url,
            'stuff_descr': descr
        }
        main_dir = r'D:\tmp\python\python_parsing\parsing_formateks'
        new_dir = os.path.join(main_dir, catalog_name,
                               stuff_name).replace(' ', '_').lower()
        # Disambiguate duplicate product names. The original only tracked the
        # unsuffixed name, so a third duplicate would have collided.
        if new_dir in list_dir:
            new_dir = new_dir + '_1'
        list_dir.append(new_dir)
        os.makedirs(new_dir, exist_ok=True)
        write_csv(data, os.path.join(new_dir, 'info.csv'))
        write_csv(data, os.path.join(main_dir, 'formarket.csv'))
        with open(os.path.join(new_dir, '0.jpg'), 'wb') as file:
            file.write(requests.get(url_image).content)
def get_stuff_on_page(url_catalog):
    # Walk every page of every catalog and record each product's name and URL.
    for item in url_catalog:
        for page in range(item['max_page']):
            # Page 1 lives at the base URL; later pages use a /page-N suffix.
            if page == 0:
                html = get_html(item['url'])
            else:
                html = get_html(item['url'] + '/page-{}'.format(page + 1))
            soup = BeautifulSoup(html, 'lxml')
            # Dump the last fetched page for debugging.
            with open('0.html', 'w', encoding='utf-8') as f:
                f.write(html)
            stuff_div = soup.find(
                'div', class_='catalog-collection cleared').find_all('h3')
            for stuff in stuff_div:
                stuff_url = 'http://formateks.ru' + stuff.find('a').get('href')
                stuff_name = stuff.find('a').text
                data = {
                    'catalog_name': item['catalog_name'],
                    'stuff_name': stuff_name,
                    'stuff_url': stuff_url
                }
                write_csv(data, 'formarket.csv')
import os

from my_lib import write_csv

file_list = os.listdir(r'D:\Done\Antyspy')
for file in file_list:
    data = {'name': file.replace('.jpg', '')}
    write_csv(data, 'tmp.csv')