import csv
import os
import re

from scrapping_tools import scrapping_tools


def scrap(product_url, writer, index, gender, cat):
    """Scrape a single product page and write one CSV row."""
    temp = []
    try:
        img_url = scrapping_tools.get_img_url(product_url)
    except Exception:
        print("cannot find image url: index-{}, product_url: {}".format(index, product_url))
        img_url = 'null'
    print('get_img_url')
    try:
        product_description = scrapping_tools.get_product_description(product_url)
    except Exception:
        print("cannot find description: index-{}, product_url: {}".format(index, product_url))
        product_description = ['null', 'null', 'null']
    print('get_product_description')
    try:
        comb_url = scrapping_tools.get_comb_url(product_url)
    except Exception:
        print("cannot find combination url: index-{}, product_url: {}".format(index, product_url))
        comb_url = []
    print('get comb url')
    # Fixed columns first, then any "styled with" combination URLs.
    temp.append(index)
    temp.append(product_url)
    temp.append(product_description[0])
    temp.append(gender)
    temp.append(cat)
    temp.append(img_url)
    temp.append(product_description[1])
    temp.append(product_description[2])
    for comb in comb_url:
        temp.append(comb)
    print("appending")
    writer.writerow(temp)
    print('write')
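# The scrapping_tools helpers called above (get_img_url, get_product_description,
# get_comb_url, get_product_url) live in scrapping_tools/scrapping_tools.py and are not
# shown in this section.  The sketch below only illustrates the interface this script
# assumes; the *_sketch names, selectors, and return shapes are placeholders, not the
# repo's actual implementation.
import requests
from bs4 import BeautifulSoup


def get_img_url_sketch(product_url):
    # Assumed contract: return one image URL string, raise on failure.
    soup = BeautifulSoup(requests.get(product_url).text, 'html.parser')
    img = soup.find('img')
    if img is None or not img.get('src'):
        raise ValueError('no image found on ' + product_url)
    return img['src']


def get_product_description_sketch(product_url):
    # Assumed contract: return three text fields, used for three of the CSV columns.
    soup = BeautifulSoup(requests.get(product_url).text, 'html.parser')
    title = soup.title.string.strip() if soup.title and soup.title.string else 'null'
    return [title, 'null', 'null']


def get_comb_url_sketch(product_url):
    # Assumed contract: return a list of "styled with" link snippets for the product;
    # comb_scrapping() below expects each entry to contain a quoted relative path.
    return []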
def scrap_by_gender(args, gender):
    """Walk every category listing for `gender` and write one CSV row per product."""
    csv_path = os.path.join(args.csv_dir, gender + "_clothes_comb.csv")
    with open(csv_path, 'w', newline='') as csvfile:
        writer = csv.writer(csvfile, delimiter=',')
        index = 0
        category_index = 0
        for cat in category[gender]:
            category_index += 1
            product_index = 0
            print(gender + " category: {}/{}".format(category_index, len(category[gender])))
            for i in range(1, 100):
                cat_url = root + '/en-us/' + gender + '/' + cat
                if i != 1:
                    cat_url += "?page={}".format(i)
                try:
                    product_urls = scrapping_tools.get_product_url(cat_url)
                except Exception:
                    print("end of category: page{}".format(i))
                    break
                for product_url in product_urls:
                    print(product_url)
                    product_index += 1
                    index += 1
                    if product_index > 1000:
                        break
                    if product_index % 10 == 0:
                        print(gender + "/{}: page{}, {}/{}".format(
                            cat, i, product_index, len(product_urls) * i))
                    product_url = root + product_url
                    print("get_product_url")
                    # Same per-product scrape as scrap() above.
                    temp = []
                    try:
                        img_url = scrapping_tools.get_img_url(product_url)
                    except Exception:
                        print("cannot find image url: index-{}, product_url: {}".format(index, product_url))
                        img_url = 'null'
                    print('get_img_url')
                    try:
                        product_description = scrapping_tools.get_product_description(product_url)
                    except Exception:
                        print("cannot find description: index-{}, product_url: {}".format(index, product_url))
                        product_description = ['null', 'null', 'null']
                    print('get_product_description')
                    try:
                        comb_url = scrapping_tools.get_comb_url(product_url)
                    except Exception:
                        print("cannot find combination url: index-{}, product_url: {}".format(index, product_url))
                        comb_url = []
                    print('get comb url')
                    temp.append(index)
                    temp.append(product_url)
                    temp.append(product_description[0])
                    temp.append(gender)
                    temp.append(cat)
                    temp.append(img_url)
                    temp.append(product_description[1])
                    temp.append(product_description[2])
                    for comb in comb_url:
                        temp.append(comb)
                    print("appending")
                    writer.writerow(temp)
                    print('write')
                # Propagate the 1000-product cap out of the inner loops.
                if product_index > 1000:
                    break
            if product_index > 1000:
                break
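# A sketch of how scrap_by_gender() is expected to be driven.  Only args.csv_dir is taken
# from the code above; the _main_sketch name and the default path are hypothetical, and the
# module-level `root` (the site base URL, e.g. 'https://www.ssense.com') and `category`
# (a dict mapping 'men'/'women' to lists of category slugs) are assumed to be defined
# elsewhere in this module.
import argparse


def _main_sketch():
    parser = argparse.ArgumentParser(description='Scrape product and combination data into CSV files.')
    parser.add_argument('--csv_dir', default='./csv', help='directory for the output CSV files')
    args = parser.parse_args()
    os.makedirs(args.csv_dir, exist_ok=True)
    for gender in ('men', 'women'):
        scrap_by_gender(args, gender)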
def comb_scrapping(args, gender):
    """Crawl the "styled with" combination links, appending newly found products to the CSV."""
    csv_path = os.path.join(args.csv_dir, gender + '_clothes_comb copy.csv')
    product_url_set, product_url_dict, comb_url_set, max_index = initialize(args, gender)
    index = max_index
    url_queue = []
    for url in comb_url_set:
        url_queue.append(url)
    with open(csv_path, 'a', newline='') as csvfile:
        writer = csv.writer(csvfile, delimiter=',')
        while len(url_queue) > 0:
            product_url = url_queue.pop()
            # Combination entries contain a quoted relative path; normalize to a full URL.
            product_url = re.findall(r"\".*\"", product_url)[0]
            product_url = product_url[1:-1]
            product_url = 'http://www.ssense.com' + product_url
            if product_url in product_url_set:
                continue
            product_url_set.add(product_url)
            # Extract the category slug from the product URL and skip unconsidered categories.
            cat = re.findall(r"-.*?/[0-9]", product_url)[0]
            cat = re.sub(r"-.*-", '', cat)
            cat = re.sub(r"/[0-9]", '', cat)
            unconsidered_cat = True
            for key in category_dict[gender].keys():
                if cat in category_dict[gender][key]:
                    unconsidered_cat = False
            if unconsidered_cat:
                continue
            index = index + 1
            temp = []
            try:
                img_url = scrapping_tools.get_img_url(product_url)
            except Exception:
                print("cannot find image url: index-{}, product_url: {}".format(index, product_url))
                img_url = 'null'
            try:
                product_description = scrapping_tools.get_product_description(product_url)
            except Exception:
                print("cannot find description: index-{}, product_url: {}".format(index, product_url))
                product_description = ['null', 'null', 'null']
            try:
                comb_url = scrapping_tools.get_comb_url(product_url)
            except Exception:
                print("cannot find combination url: index-{}, product_url: {}".format(index, product_url))
                comb_url = []
            temp.append(index)
            temp.append(product_url)
            temp.append(product_description[0])
            temp.append(gender)
            temp.append(cat)
            temp.append(img_url)
            temp.append(product_description[1])
            temp.append(product_description[2])
            for comb in comb_url:
                temp.append(comb)
                # Queue unseen combinations; duplicates are filtered again after normalization.
                if comb not in product_url_set:
                    url_queue.append(comb)
            print(index, len(url_queue))
            writer.writerow(temp)
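# initialize() is called above but not defined in this section.  The sketch below shows one
# way it could rebuild its four return values from a previously written CSV, assuming the
# row layout produced above (eight fixed columns, then the combination URLs); the function
# body, CSV filename, and column positions here are assumptions, not the repo's code.
def initialize_sketch(args, gender):
    csv_path = os.path.join(args.csv_dir, gender + '_clothes_comb.csv')
    product_url_set = set()
    product_url_dict = {}
    comb_url_set = set()
    max_index = 0
    with open(csv_path, newline='') as csvfile:
        for row in csv.reader(csvfile):
            if len(row) < 8:
                continue
            idx, product_url = int(row[0]), row[1]
            max_index = max(max_index, idx)
            product_url_set.add(product_url)
            product_url_dict[product_url] = idx
            comb_url_set.update(row[8:])  # everything after the fixed columns is a combination link
    return product_url_set, product_url_dict, comb_url_set, max_index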
from scrapping_tools import scrapping_tools

if __name__ == '__main__':
    # Smoke test: scrape a single product page and print the parsed fields.
    product_url = "https://www.ssense.com/en-us/women/product/maison-margiela/red-knit-turtleneck-dress/3042348"
    temp = []
    try:
        img_url = scrapping_tools.get_img_url(product_url)
        print(img_url)
    except Exception:
        print("cannot find image url: product_url: {}".format(product_url))
        img_url = 'null'
    try:
        product_description = scrapping_tools.get_product_description(product_url)
        print(product_description)
    except Exception:
        print("cannot find description: product_url: {}".format(product_url))
        product_description = ['null', 'null', 'null']
    temp.append(product_url)
    temp.append(product_description[0])
    temp.append('women')  # the test URL points at a women's product
    temp.append(img_url)
    temp.append(product_description[1])
    temp.append(product_description[2])
    print(temp)