예제 #1
0
class HandleData(object):
    def __init__(self):
        self.db = DBConfig()

    def __del__(self):
        self.db.closeDB()

    def handle_category(self):
        category_li = self.__handle_category()
        self.__handle_category_data(category_li)

    @classmethod
    def __handle_category(cls):
        with open("tmp/category_data.txt", "rb") as f:
            page_resource = f.read().decode("utf-8")
            f.close()
        bs_data = BeautifulSoup(page_resource, "html.parser")
        category_ul = bs_data.find_all("ul", attrs={"class": "sub-menu"})
        # only get the next level's li(tag), not include offspring(need to add 'recursive=False')
        return category_ul[0].find_all("li", recursive=False)

    def __handle_category_data(self, category_li, handle_type=1, parent_id=0):
        table_columns = (("id", "int"), ("name", "varchar"),
                         ("page_num", "longtext"), ("nav_type", "int"),
                         ("keyword", "varchar"), ("parent_id", "int"))
        for item in category_li:
            insert_arr = {"parent_id": 0, "nav_type": 2}
            try:
                href = item.find("a").attrs['href']
                try:
                    insert_arr['keyword'] = re.findall('category=([\w\W]*.)',
                                                       href)[0]
                except Exception as e:
                    debug(e)
                if handle_type == 2:
                    insert_arr['parent_id'] = parent_id
                if href == "#":
                    insert_arr['name'] = item.find("span").getText().strip()
                    insert_arr['nav_type'] = 1
                    sql = self.db.getInsertSql(insert_arr,
                                               "type",
                                               table_columns=table_columns)
                    lastest_id = self.db.insertLastId(sql, is_close_db=False)
                    if lastest_id == 0:
                        debug("get data error")
                        continue
                    self.__handle_category_data(item.find_all("li"), 2,
                                                lastest_id)
                else:
                    insert_arr['name'] = item.getText().strip()
                    sql = self.db.getInsertSql(insert_arr,
                                               "type",
                                               table_columns=table_columns)
                    self.db.insert(sql, is_close_db=False)
            except Exception as e:
                debug(e)
예제 #2
0
class RecipeContentThread(object):
    def __init__(self):
        self.recipe_list = RecipeListSpider()
        self.table_columns = (("id", "int"), ("img_url", "varchar"),
                              ("video_id", "varchar"), ("preparation",
                                                        "longtext"),
                              ("ingredients", "text"), ("name", "varchar"),
                              ("list_id", "int"))
        self.handle_num = 0
        self.db = DBConfig()

    def __del__(self):
        self.db.closeDB()

    def run(self):
        """
        :return:
        """
        data = self.get_list()
        self.start(data)

    def start(self, data):
        """
        :param data:
        :return:
        """
        thread_pool = ThreadPoolExecutor(max_workers=15)
        task_list = list()
        for item in data:
            task = thread_pool.submit(self.handle_data, item)
            task_list.append(task)
        for i in as_completed(task_list):
            result = i.result()
        debug("本次处理成功 {num} 个线程".format(num=self.handle_num))

    def get_list(self):
        """
        :return:
        """
        data = self.recipe_list.get_list()
        return data

    def handle_data(self, item):
        """
        :param item:
        :return:
        """
        page_resource = self.get_data(item)
        result = self.__handle_data(page_resource, item)
        if result['code'] == 0:
            debug("菜谱存储出错 --> {name}".format(name=item['name']))
        else:
            debug("菜谱存储成功 --> {name}".format(name=item['name']))
            self.__update_status(result['code'])
            lock.acquire()
            self.handle_num = self.handle_num + 1
            lock.release()
        return {"code": 0}

    @classmethod
    def get_data(cls, item):
        """
        :param item:
        :return:
        """
        url = CommonFunc().generate_content_url(item['url'])
        data = curlData(url, open_virtual_ip=True)
        return data

    def __handle_data(self, page_resource, item):
        """
        :param page_resource:
        :param item:
        :return:
        """
        bs = BeautifulSoup(page_resource, "html.parser")
        insert_arr = dict()
        insert_arr['video_id'] = self.__get_video_id(bs)
        insert_arr['img_url'] = self.__get_img_url(page_resource)
        insert_arr['name'] = item['name']
        insert_arr['preparation'] = self.__get_preparation(bs)
        insert_arr['ingredients'] = self.__get_ingredients(bs)
        insert_arr['list_id'] = item['id']
        result = self.__save_data(insert_arr)
        return {'code': result}

    def __save_data(self, insert_arr):
        lock.acquire()
        sql = self.db.getInsertSql(insert_arr,
                                   table="content",
                                   table_columns=self.table_columns)
        result = self.db.insertLastId(sql, is_close_db=False)
        lock.release()
        return result

    def __update_status(self, recipe_list_id):
        update_arr = {
            "table":
            "list",
            "set": {
                "status": "1"
            },
            "condition":
            ["id={recipe_list_id}".format(recipe_list_id=recipe_list_id)]
        }
        lock.acquire()
        self.db.update(update_arr, is_close_db=False)
        lock.release()

    @classmethod
    def __get_video_id(cls, bs):
        """
        :param bs:
        :return:
        """
        video_id = bs.find("iframe")
        try:
            video_id = video_id.attrs['src']
            video_id = re.findall('https://www.youtube.com/embed/([\w\W]*?)\?',
                                  video_id)[0]
        except Exception as e:
            video_id = ""
            debug("视频播放id获取出错,错误信息:{error}".format(error=e))
        return video_id

    @classmethod
    def __get_img_url(cls, bs):
        # img_url = bs.find("div", attrs={"class": "ytp-cued-thumbnail-overlay-image"})
        try:
            # img_url = img_url.attrs['style']
            img_url = re.findall('"image": "([\w\W]*?)"', str(bs))[0]
        except Exception as e:
            img_url = ""
            debug("菜谱图片链接获取出错,错误信息:{error}".format(error=e))
        return img_url

    @classmethod
    def __get_preparation(cls, bs):
        """
        :param bs:
        :return:
        """
        preparation = bs.find("div",
                              attrs={"class": "cs-recipe-single-preparation"})
        try:
            preparation = preparation.find("ul")
            preparation = str(preparation)
            preparation = re.findall("<ul>([\w\W]*?)<\/ul>", preparation)[0]
        except Exception as e:
            preparation = ""
            debug("菜谱做法获取出错,错误信息:{error}".format(error=e))
        return preparation

    @classmethod
    def __get_ingredients(cls, bs):
        ingredients = bs.find("div",
                              attrs={"class": "cs-ingredients-check-list"})
        ingredients_str = ""
        try:
            ingredients = ingredients.find("ul")
            ingredients = ingredients.find_all("li")
            for k, v in enumerate(ingredients):
                if k == 0:
                    ingredients_str = ingredients_str + v.get_text().strip()
                else:
                    ingredients_str = ingredients_str + "," + v.get_text(
                    ).strip()
        except Exception as e:
            ingredients_str = ""
            debug("配料获取出错,错误信息:{error}".format(error=e))
        return ingredients_str
예제 #3
0
class RecipeType(object):
    def __init__(self):
        self.db = DBConfig()

    def __del__(self):
        self.db.closeDB()

    def get(self):
        """
        :return:
        """
        select_arr = {"table": "type", "condition": ['nav_type=2']}
        data = self.db.select(select_arr, is_close_db=False)
        # check whether have any data, if not, get all category
        if not data:
            self.get_recipe_type()
            data = self.db.select(select_arr, is_close_db=False)
        return data

    def get_category(self):
        """
        :return:
        """
        select_arr = {"table": "type", "condition": ['status=0']}
        return self.db.select(select_arr, is_close_db=False)

    def get_recipe_type(self):
        """
        :return:
        """
        category_li = self.__handle_category()
        self.__handle_category_data(category_li)

    @classmethod
    def __handle_category(cls):
        """
        :return:
        """
        url = CommonFunc().generate_url()
        page_resource = curlData(url, open_virtual_ip=True)
        bs_data = BeautifulSoup(page_resource, "html.parser")
        category_ul = bs_data.find_all("ul", attrs={"class": "sub-menu"})
        # only get the next level's li(tag), not include offspring(need to add 'recursive=False')
        return category_ul[0].find_all("li", recursive=False)

    def __handle_category_data(self, category_li, handle_type=1, parent_id=0):
        """
        :param category_li:
        :param handle_type:
        :param parent_id:
        :return:
        """
        table_columns = (("id", "int"), ("name", "varchar"),
                         ("page_num", "longtext"), ("nav_type", "int"),
                         ("keyword", "varchar"), ("parent_id", "int"))
        for item in category_li:
            insert_arr = {"parent_id": 0, "nav_type": 2}
            try:
                href = item.find("a").attrs['href']
                try:
                    insert_arr['keyword'] = re.findall('category=([\w\W]*.)',
                                                       href)[0]
                except Exception as e:
                    debug(e)
                if handle_type == 2:
                    insert_arr['parent_id'] = parent_id
                if href == "#":
                    insert_arr['name'] = item.find("span").getText().strip()
                    insert_arr['nav_type'] = 1
                    sql = self.db.getInsertSql(insert_arr,
                                               "type",
                                               table_columns=table_columns)
                    lastest_id = self.db.insertLastId(sql, is_close_db=False)
                    if lastest_id == 0:
                        debug("get data error")
                        continue
                    self.__handle_category_data(item.find_all("li"), 2,
                                                lastest_id)
                else:
                    insert_arr['name'] = item.getText().strip()
                    sql = self.db.getInsertSql(insert_arr,
                                               "type",
                                               table_columns=table_columns)
                    self.db.insert(sql, is_close_db=False)
            except Exception as e:
                debug(e)