示例#1
0
def scrap(url, writer, index):
	"""Scrape one product page and append its data row to *writer*.

	Parameters
	----------
	url : str
		Full product-page URL to scrape.
	writer : csv.writer
		Open CSV writer the resulting row is appended to.
	index : int
		Running row index recorded in the first column.

	Notes
	-----
	Relies on module-level names ``scrapping_tools``, ``gender`` and
	``cat`` being defined in the enclosing module -- TODO confirm.
	Any field that cannot be scraped is recorded as the string
	``'null'`` so a partial failure still produces a complete row.
	"""
	# BUG FIX: the original body referenced ``product_url``, which is not
	# defined anywhere in the function; bind it to the actual parameter.
	product_url = url
	temp = []

	try:
		img_url = scrapping_tools.get_img_url(product_url)
	except Exception:  # narrowed from bare except (don't swallow SystemExit/KeyboardInterrupt)
		print("cannot find image url: index-{}, product_url: {}".format(index, product_url))
		img_url = 'null'
	print('get_img_url')

	try:
		product_description = scrapping_tools.get_product_description(product_url)
	except Exception:
		print("cannot find description: index-{}, product_url: {}".format(index, product_url))
		product_description = ['null', 'null', 'null']
	print('get_product_description')

	try:
		comb_url = scrapping_tools.get_comb_url(product_url)
	except Exception:
		print("cannot find combinition url: index-{}, product_url: {}".format(index, product_url))
		comb_url = []
	# BUG FIX: this progress print was inside the except block above, so it
	# only ran on failure; moved out to match the other steps (and the
	# identical code in scrap_by_gender).
	print('get comb url')

	temp.append(index)
	temp.append(product_url)
	temp.append(product_description[0])
	temp.append(gender)  # module-level; presumably 'men'/'women' -- verify against caller
	temp.append(cat)     # module-level current category slug -- verify against caller
	temp.append(img_url)
	temp.append(product_description[1])
	temp.append(product_description[2])
	temp.extend(comb_url)
	print("appending")

	writer.writerow(temp)
	print('write')
示例#2
0
def scrap_by_gender(args, gender):
    """Scrape every product of every category for one gender into a CSV.

    Parameters
    ----------
    args : argparse.Namespace
        Must provide ``csv_dir``, the directory the CSV is written to.
    gender : str
        Gender segment of the site URL (e.g. ``'men'`` / ``'women'``).

    Notes
    -----
    Relies on module-level names ``category``, ``root`` and
    ``scrapping_tools`` -- TODO confirm they exist in this module.
    Each row is ``[index, url, desc0, gender, cat, img_url, desc1,
    desc2, *comb_urls]``; unscrapable fields become ``'null'``.
    Scraping of a category stops after 1000 products or when a page
    request fails (taken as end-of-category).
    """
    csv_path = os.path.join(args.csv_dir, gender + "_clothes_comb.csv")
    with open(csv_path, 'w', newline='') as csvfile:
        writer = csv.writer(csvfile, delimiter=',')

        index = 0
        category_index = 0

        for cat in category[gender]:
            category_index += 1
            product_index = 0
            # BUG FIX: the progress total used ``men_category`` even when
            # scraping women; count the category list actually iterated.
            print(gender +
                  " category: {}/{}".format(category_index,
                                            len(category[gender])))

            for i in range(1, 100):

                cat_url = root + '/en-us/' + gender + '/' + cat
                if i != 1:
                    cat_url += "?page={}".format(i)

                try:
                    product_urls = scrapping_tools.get_product_url(cat_url)
                except Exception:  # narrowed from bare except
                    # NOTE(review): message reports i + 1 although the
                    # request that failed was for page i -- kept as-is.
                    print("end of category: page{}".format(i + 1))
                    break

                for product_url in product_urls:
                    print(product_url)
                    product_index += 1
                    index += 1

                    # Hard cap: never take more than 1000 products per category.
                    if (product_index > 1000):
                        break
                    if (product_index % 10 == 0):
                        print(gender + "/{}: page{}, {}/{}".format(
                            cat, i, product_index,
                            len(product_urls) * i))

                    product_url = root + product_url

                    print("get_product_url")

                    temp = []

                    try:
                        img_url = scrapping_tools.get_img_url(product_url)
                    except Exception:
                        print(
                            "cannot find image url: index-{}, product_url: {}".
                            format(index, product_url))
                        img_url = 'null'
                    print('get_img_url')

                    try:
                        product_description = scrapping_tools.get_product_description(
                            product_url)
                    except Exception:
                        print(
                            "cannot find description: index-{}, product_url: {}"
                            .format(index, product_url))
                        product_description = ['null', 'null', 'null']
                    print('get_product_description')

                    try:
                        comb_url = scrapping_tools.get_comb_url(product_url)
                    except Exception:
                        print(
                            "cannot find combinition url: index-{}, product_url: {}"
                            .format(index, product_url))
                        comb_url = []
                    print('get comb url')

                    temp.append(index)
                    temp.append(product_url)
                    temp.append(product_description[0])
                    temp.append(gender)
                    temp.append(cat)
                    temp.append(img_url)
                    temp.append(product_description[1])
                    temp.append(product_description[2])
                    temp.extend(comb_url)
                    print("appending")

                    writer.writerow(temp)
                    print('write')

                    # (dead duplicate cap-check that followed here was
                    # removed: the loop-top check always breaks first)
                if (product_index > 1000):
                    break
示例#3
0
def comb_scrapping(args, gender):
	"""Crawl the product "combination" graph and append rows to a CSV.

	Starting from the already-known combination URLs, pops URLs off a
	work stack (depth-first), scrapes each not-yet-seen product, writes
	one CSV row for it, and pushes that product's own combination URLs
	back onto the stack to continue the crawl.

	Parameters
	----------
	args : argparse.Namespace
		Must provide ``csv_dir``.
	gender : str
		Selects the category whitelist and the output CSV file.

	Notes
	-----
	Relies on module-level names ``initialize``, ``category_dict`` and
	``scrapping_tools`` -- TODO confirm. The CSV is opened in append
	mode so a crawl can resume from ``max_index``.
	"""
	csv_path = os.path.join(args.csv_dir, gender + '_clothes_comb copy.csv')
	product_url_set, product_url_dict, comb_url_set, max_index = initialize(args, gender)
	index = max_index
	# Work stack: ``pop()`` takes the most recently added URL (DFS order).
	url_queue = list(comb_url_set)

	with open(csv_path, 'a', newline='') as csvfile:
		writer = csv.writer(csvfile, delimiter=',')
		while url_queue:
			product_url = url_queue.pop()
			# Raw entries carry a double-quoted relative path; extract it
			# (strip the quotes) and prepend the site root.
			product_url = re.findall(r"\".*\"", product_url)[0]
			product_url = product_url[1:-1]
			product_url = 'http://www.ssense.com' + product_url

			if product_url in product_url_set:
				continue

			product_url_set.add(product_url)

			# Derive the category slug from the URL: take the segment from
			# the last hyphenated word up to the numeric id, then strip the
			# leading hyphen-words and the trailing "/<digit>".
			cat = re.findall(r"-.*?/[0-9]", product_url)[0]
			cat = re.sub(r"-.*-", '', cat)
			cat = re.sub(r"/[0-9]", '', cat)

			# Skip products whose category is not whitelisted for this gender.
			if not any(cat in subcats
			           for subcats in category_dict[gender].values()):
				continue

			index = index + 1
			temp = []

			try:
				img_url = scrapping_tools.get_img_url(product_url)
			except Exception:  # narrowed from bare except
				print("cannot find image url: index-{}, product_url: {}".format(index, product_url))
				img_url = 'null'

			try:
				product_description = scrapping_tools.get_product_description(product_url)
			except Exception:
				print("cannot find description: index-{}, product_url: {}".format(index, product_url))
				product_description = ['null', 'null', 'null']

			try:
				comb_url = scrapping_tools.get_comb_url(product_url)
			except Exception:
				print("cannot find combinition url: index-{}, product_url: {}".format(index, product_url))
				comb_url = []

			temp.append(index)
			temp.append(product_url)
			temp.append(product_description[0])
			temp.append(gender)
			temp.append(cat)
			temp.append(img_url)
			temp.append(product_description[1])
			temp.append(product_description[2])
			for comb in comb_url:
				temp.append(comb)
				# Queue unseen neighbours for the next crawl steps.
				if comb not in product_url_set:
					url_queue.append(comb)

			print(index, len(url_queue))
			writer.writerow(temp)
示例#4
0
import requests
from bs4 import BeautifulSoup
import re
from scrapping_tools import scrapping_tools

if __name__ == '__main__':

    # Smoke test: scrape one known product page and print its fields.
    product_url = "https://www.ssense.com/en-us/women/product/maison-margiela/red-knit-turtleneck-dress/3042348"
    # BUG FIX: ``index`` was referenced in the error messages below but
    # never defined (NameError inside the handler); there is only one
    # product here, so use 0.
    index = 0
    temp = []
    try:
        img_url = scrapping_tools.get_img_url(product_url)
        print(img_url)
    except Exception:  # narrowed from bare except
        print("cannot find image url: index-{}, product_url: {}".format(
            index, product_url))
        # BUG FIX: without this fallback a failed lookup left ``img_url``
        # unbound and the append below raised NameError.
        img_url = 'null'
    try:
        product_description = scrapping_tools.get_product_description(
            product_url)
        print(product_description)
    except Exception:
        print("cannot find description: index-{}, product_url: {}".format(
            index, product_url))
        product_description = ['null', 'null', 'null']

    temp.append(product_url)
    temp.append(product_description[0])
    temp.append('men')
    temp.append(img_url)
    temp.append(product_description[1])
    temp.append(product_description[2])
    print(temp)