示例#1
0
def main():
    """Crawl the listing page of every Tmall shop returned by the database.

    Each task returned by ``MysqlHelper.get_tmall_shops()`` is handed to
    ``crawl_tmall_list``.  The module-level ``driver`` is closed when the run
    ends, including when loading the tasks or crawling one of them raises, so
    a failing task no longer leaves the driver open.
    """
    try:
        db = MysqlHelper()
        tasks = db.get_tmall_shops()
        for task in tasks:
            crawl_tmall_list(task)
    finally:
        # Always release the shared driver, even after a failed task.
        driver.close()
示例#2
0
文件: output.py 项目: nosun/taobao
def main():
    """Export the products of every Tmall shop to ``products_tmall.csv``.

    The header is written once with ``write_head``; afterwards each product
    becomes one CSV row.  A row starts from ``format_line()``, receives the
    fixed numbered columns, and is then extended with every entry of the
    product's JSON-encoded ``properties`` (double quotes removed from the
    values).
    """
    csv_path = os.path.join(DATA_PATH, 'products_tmall.csv')
    write_head(csv_path)

    helper = MysqlHelper()
    for shop in helper.get_tmall_shops():
        for product in helper.get_products_by_shop(shop['id']):
            row = format_line()

            # Fixed columns; the numeric prefixes are part of the column name.
            fixed_columns = (
                ('1-sn', product['sn']),
                ('2-title', product['title']),
                ('3-price', product['price']),
                ('4-prices', list_to_string(format_data(product['prices']))),
                ('5-colors', list_to_string(format_data(product['colors']))),
                ('6-sizes', list_to_string(format_data(product['sizes']))),
                ('7-url', product['url']),
                ('8-shop_name', product['name']),
                ('9-shop_url', product['task_url']),
            )
            for column, value in fixed_columns:
                row[column] = value

            # Free-form properties are stored as text and decoded as JSON.
            extra = json.loads(format_data(product['properties']))
            for key, value in extra.items():
                row[key] = value.replace("\"", "")

            print(row)
            cells = row.values()
            print(cells)
            write_list_to_csv([cells], csv_path)
示例#3
0
 def get_tasks(self):
     """Return the ``url`` of every product task whose url is not ``None``.

     The rows come from ``MysqlHelper.get_product_tasks()``.
     """
     helper = MysqlHelper()
     urls = []
     for row in helper.get_product_tasks():
         if row['url'] is None:
             # Rows without a url cannot be crawled; leave them out.
             continue
         urls.append(row['url'])
     return urls
示例#4
0
def t_get_product():
    """Print the ``sku`` of the entry at index 1 of each product's images.

    The stored ``images`` text is turned into JSON by dropping backslashes
    and replacing single quotes with double quotes before it is parsed.
    """
    helper = MysqlHelper()
    for row in helper.get_products():
        raw_images = row['images']
        without_backslashes = raw_images.replace("\\", "")
        as_json_text = without_backslashes.replace("\'", "\"")
        parsed = json.loads(as_json_text)
        print(parsed[1]['sku'])
示例#5
0
文件: add_task.py 项目: nosun/taobao
def insert_task():
    """Register every task URL listed in ``task.csv`` in the database.

    Only the text before the first ``@`` of each line is used, with the
    surrounding whitespace removed.  Each URL is printed before it is
    passed to ``MysqlHelper.add_task_url``.
    """
    rows = read_csv(os.path.join(DATA_PATH, "task.csv"))
    helper = MysqlHelper()
    for row in rows:
        # Everything after the first "@" is ignored.
        before_at, _, _ = row.partition("@")
        task_url = before_at.strip()
        print(task_url)
        helper.add_task_url(task_url)
示例#6
0
文件: add_task.py 项目: nosun/taobao
def update_task():
    """Update tasks from ``shop_infos.csv``, one ``name,wid`` pair per line.

    The third argument given to ``MysqlHelper.update_task`` is the 1-based
    position of the line in the file (presumably the task's row id; confirm
    against the table).
    """
    rows = read_csv(os.path.join(DATA_PATH, "shop_infos.csv"))
    helper = MysqlHelper()
    for position, row in enumerate(rows, start=1):
        # Each line must hold exactly two comma-separated fields.
        shop_name, shop_wid = row.split(",")
        helper.update_task(shop_name, shop_wid, position)
示例#7
0
def main():
    """Save the images of every product of every Tmall shop except shop 26.

    The image directory is created through ``mk_image_dir`` for every shop,
    including the skipped one, before its products are processed.
    """
    helper = MysqlHelper()
    for shop in helper.get_tmall_shops():
        shop_id = shop['id']
        target_dir = mk_image_dir(shop_id)
        if shop_id != 26:
            for product in helper.get_products_by_shop(shop_id):
                pictures = format_data(product['images'])
                save_images(product['sn'], pictures, target_dir)
示例#8
0
文件: output.py 项目: nosun/taobao
def get_property_keys():
    """Collect the set of property names used across all products.

    Each product's ``properties`` text is passed through ``format_data`` and
    decoded as JSON.  A product whose properties cannot be processed has its
    ``id`` and raw ``properties`` printed and is otherwise ignored.

    Returns:
        set: every key found in any product's properties.
    """
    helper = MysqlHelper()
    collected = set()
    for row in helper.get_products():
        try:
            decoded = json.loads(format_data(row['properties']))
            collected.update(decoded.keys())
        except Exception:
            # Report the offending row and carry on with the rest.
            print(row['id'])
            print(row['properties'])
    return collected
示例#9
0
文件: product.py 项目: nosun/taobao
    def get_tasks(self):
        """Build one ``asynSearch`` crawl task for each shop in the database.

        A shop's ``task_url`` is split on ``/``; element 2 is used as the
        site host and element 3 as the shop path.  Shops whose ``task_url``
        has fewer than four segments are skipped.

        Returns:
            list[dict]: one dict per usable shop with the keys ``id`` (as a
            string), ``name``, ``site``, ``path``, ``task_url`` and ``url``
            (the search URL including its query string).
        """
        db = MysqlHelper()
        tasks = []
        for shop in db.get_shops():
            wid = shop['wid']
            name = shop['name']
            task_url = shop['task_url']

            segments = task_url.split("/")
            # Index 3 is read below, so at least four segments are required.
            # The previous ">= 3" guard let three-segment URLs through and
            # raised IndexError instead of skipping them.
            if len(segments) < 4:
                continue
            site = segments[2]
            path = segments[3]

            search_url = furl("https://{}/i/asynSearch.htm".format(site))
            search_url.args = {
                "_ksTS": "replace_time_141",  # will replace later
                "callback": "jsonp142",
                "mid": "w-{}-0".format(wid),
                "wid": wid,
                "path": "/{}".format(path),
                "search": "y",
                "pageNo": 1
            }

            tasks.append({
                'id': str(shop['id']),
                'name': name,
                'site': site,
                'path': path,
                'task_url': task_url,
                'url': search_url.url,
            })
        return tasks
示例#10
0
def crawl_tmall_list(task):
    """Crawl page 2 of a shop's listing and upsert every product found.

    Args:
        task: mapping with ``task_url`` (the listing URL, to which
            ``?pageNo=2`` is appended) and ``id`` (stored on each product
            as ``sid``).

    Raises:
        Exception: whatever ``driver.get`` raises is re-raised after a
            message is printed; errors while reading the page or writing to
            the database propagate as well.

    The database connection opened here is closed even when reading an item
    or writing it fails.
    """
    url = task['task_url'] + "?pageNo=2"
    sid = task['id']

    try:
        driver.get(url)
    except Exception:
        print("failed when get query %s" % url)
        raise
    print("success get query success %s" % url)

    db = MysqlHelper()
    try:
        # Fixed pause before reading the page (presumably to let the listing
        # finish rendering; confirm).
        time.sleep(10)
        items = driver.find_elements_by_xpath(
            "//dl[contains(@class,'item')]")
        for item in items:
            product = TaobaoItem()
            product['thumb'] = item.find_element_by_xpath(
                "./dt//img").get_attribute('src')
            product['url'] = item.find_element_by_xpath(
                "./dt/a").get_attribute('href')
            product['title'] = item.find_element_by_xpath(
                "./dd[@class='detail']/a").get_attribute('innerText').strip()
            product['price'] = item.find_element_by_xpath(
                ".//span[@class='c-price']").get_attribute(
                    'innerText').strip()

            # The serial number is the text between "?id=" and the next "&".
            # Items whose link carries no "?id=" are skipped.
            link = product['url']
            if not link or "?id=" not in link:
                continue
            product['sn'] = link.split("?id=")[1].split("&")[0]

            product['sid'] = sid
            db.upsert_products_from_list(product)
    finally:
        # Previously skipped whenever the loop raised, leaking the connection.
        db.db.close()
示例#11
0
 def __init__(self):
     """Create the database helper and set the language code to ``"zh"``."""
     self.db = MysqlHelper()
     self.lang = "zh"
示例#12
0
class TaobaoItemSpider(object):
    """Crawl product detail pages with a Chrome webdriver and store the result.

    ``start()`` opens the browser, loads the product tasks from the
    database, crawls them one by one and finally releases the browser and
    the database connection, also when a crawl fails.
    """

    # Class-level defaults; ``driver`` is replaced per instance by
    # ``init_chrome_driver``.  The other two are not used inside this class.
    driver = None
    driver_option = None
    mysql_db = None

    def __init__(self):
        self.lang = "zh"
        self.db = MysqlHelper()

    def init_chrome_driver(self):
        """Start Chrome with the configured language and no notifications."""
        options = webdriver.ChromeOptions()
        options.add_argument('--lang=' + self.lang)
        options.add_argument("--disable-notifications")
        self.driver = webdriver.Chrome(CHROME_DRIVER_PATH,
                                       chrome_options=options)

    def start(self):
        """Run the whole crawl: open the browser, process all tasks, clean up.

        Cleanup happens in a ``finally`` block because ``crawl_taobao_item``
        re-raises extraction errors; without it a failing task would leave
        the browser and the database connection open.
        """
        try:
            self.init_chrome_driver()
            for task in self.get_tasks():
                self.crawl_taobao_item(task)
                print("crawl the page finished %s" % (task['url'], ))
        finally:
            self.close()

    def close(self):
        """Quit the browser (if one was started) and close the database."""
        try:
            if self.driver is not None:
                self.driver.quit()
        finally:
            # Close the connection even if quitting the browser failed.
            self.db.db.close()
        logger.info('Goodbye, The tasks is finished.')

    def get_tasks(self):
        """Return the product tasks stored in the database."""
        return self.db.get_product_tasks()

    def crawl_taobao_item(self, task):
        """Load one item page, extract its data and save it to the database.

        Args:
            task: mapping with ``url`` (page to open) and ``sn`` (copied
                onto the product).

        A failure while loading the page is only reported and the task is
        abandoned.  A failure while extracting or saving is printed and then
        re-raised.
        """
        url = task['url']
        print("begin to crawl page: %s" % url)

        try:
            self.driver.get(url)
        except Exception:
            print("failed when get query %s" % url)
            return
        print("success get query success %s" % url)

        try:
            # Fixed pause before reading the page (presumably to let the
            # page scripts finish; confirm).
            time.sleep(30)
            product = self._extract_product(task)
            print(product)
            self.db.update_product(product)
        except Exception as e:
            print(e)
            raise

    def _extract_product(self, task):
        """Build a ``TaobaoItem`` from the page currently shown by the driver."""
        product = TaobaoItem()
        product['sn'] = task['sn']
        product['sizes'] = []
        product['colors'] = []
        product['images'] = []
        product['prices'] = []
        product['choices'] = dict()

        # Attribute list: one "name: value" entry per <li>.
        attr_nodes = self.driver.find_elements_by_xpath(
            "//ul[@id='J_AttrUL']/li")
        product['properties'] = self._parse_properties(
            node.get_attribute("innerText") for node in attr_nodes)

        # Thumbnails from the HTML; the size suffix is removed from the URL.
        images = set()
        thumb_nodes = self.driver.find_elements_by_xpath(
            "//ul[@id='J_UlThumb']/li//img")
        for node in thumb_nodes:
            src = node.get_attribute("src")
            if not src:
                continue
            images.add(self._absolute_url(
                src.replace("_60x60q90.jpg", "").strip()))

        setup = self._load_setup_json()
        if setup is not None:
            self._apply_setup(product, setup, images)

        # Assigned outside the branch above so thumbnails are kept even when
        # the page carries no TShop.Setup data.
        product['images'] = list(images)
        return product

    def _load_setup_json(self):
        """Return the decoded ``TShop.Setup(...)`` argument, or ``None``."""
        body = self.driver.find_element_by_tag_name(
            "body").get_attribute("innerHTML")
        pattern = re.compile(r"(.*)TShop.Setup\((.*?)\}\)", re.S)
        found = pattern.match(body)
        if not found:
            return None
        # The captured text still ends with two characters that are not part
        # of the JSON document; drop them before decoding.
        raw = found.group(2).strip()[:-2].strip()
        return json.loads(raw)

    def _apply_setup(self, product, setup, images):
        """Copy images, colors, prices and choices from the setup data."""
        if 'propertyPics' in setup:
            for pics in setup['propertyPics'].values():
                for pic in pics:
                    images.add(self._absolute_url(pic))

        if 'valItemInfo' not in setup:
            return
        item_info = setup['valItemInfo']

        sku_list = []
        if 'skuList' in item_info:
            sku_list = item_info['skuList']
            for sku in sku_list:
                product['colors'].append(sku['names'].strip())

        if 'skuMap' in item_info:
            sku_maps = item_info['skuMap']
            product['prices'] = list(
                {entry['price'] for entry in sku_maps.values()})
            product['choices'] = self.format_sku(sku_list, sku_maps)

    @staticmethod
    def _parse_properties(texts):
        """Turn "name: value" strings into a dict.

        Only the first ``:`` separates name from value, so values that
        contain a colon are kept whole.  Empty strings and strings without
        a ``:`` are skipped.
        """
        properties = dict()
        for text in texts:
            if not text:
                continue
            name, separator, value = text.partition(":")
            if not separator:
                continue
            properties[name.strip()] = value.strip()
        return properties

    @staticmethod
    def _absolute_url(url):
        """Prefix a protocol-relative URL ("//host/...") with ``https:``.

        URLs that do not start with ``//`` are returned unchanged, so an
        already absolute URL is not given a second scheme.
        """
        if url.startswith("//"):
            return "https:" + url
        return url

    def format_sku(self, sku_list, sku_maps):
        """Add a ``color`` entry to each sku-map value that has a known name.

        Args:
            sku_list: iterable of dicts with ``pvs`` and ``names``.
            sku_maps: dict keyed by a ``pvs`` string that may be wrapped in
                ``;``; its values are dicts and are modified in place.

        Returns:
            The same ``sku_maps`` object that was passed in.
        """
        names_by_pvs = dict()
        for sku in sku_list:
            names_by_pvs[sku['pvs']] = sku["names"].strip()

        for key, entry in sku_maps.items():
            pvs = key.strip(";")
            if pvs in names_by_pvs:
                entry['color'] = names_by_pvs[pvs]

        return sku_maps