Exemplo n.º 1
0
class PixivSpider():
    """Crawler that downloads the works of pixiv illustrators.

    Illustrator ids are read from the ``download.illuster_ids`` entry of the
    configuration file and, optionally, from the ``illuster`` database table.
    """

    def load_config(self):
        """Load thread count, download root and polling interval from config.

        :raises Exception: whatever the configuration lookup raised, after
            it has been logged.
        """
        self.config = ConfigureUtil('config/config.conf')
        try:
            self.thread_num = self.config.get("app", "thread_num", type_="int")
            # Root directory under which downloaded images are stored.
            self.root_path = self.config.get("download", "path")
            # Pause between two download batches (passed to time.sleep).
            self.waiting_time = self.config.get("download",
                                                "waiting_time",
                                                type_="int")
        except Exception as e:
            self.logger.error("请检查你配置的下载路径====》{}".format(repr(e)),
                              exc_info=True)
            raise

    def load_logger(self):
        """Create the logger used by this spider."""
        self.logger = Log(__name__).get_log()

    def __init__(self):
        """Set up logger, configuration, helpers, thread pool and SSL tweaks."""
        self.sql_4_update = 'update illuster set priority = 0,  modify_time = "{}" WHERE illuster_id  = {}'
        # The logger must exist before load_config(), which logs on failure.
        self.load_logger()
        self.load_config()
        self.pixiv_util = PixivUtil()
        self.db_util = DbUtil()
        self.pool = threadpool.ThreadPool(self.thread_num)
        self.WAITING_SQL = 'select illuster_id from illuster WHERE priority > 0 ' \
                           ' AND illuster_id!=11 ORDER BY priority DESC LIMIT {} '
        # NOTE(review): this replaces the process-wide default HTTPS context
        # with an unverified one, i.e. certificate checks are turned off.
        ssl._create_default_https_context = ssl._create_unverified_context
        requests.packages.urllib3.disable_warnings(InsecureRequestWarning)

        # Experimental workaround for
        # ('ssl3_read_bytes', 'sslv3 alert bad record mac').
        requests.packages.urllib3.util.ssl_.DEFAULT_CIPHERS = 'ALL'

    def login(self):
        """Log in through ``PixivUtil`` and report success as a bool."""
        if not self.pixiv_util.login():
            self.logger.info("登陆失败,请检查账号密码是否正确和网络是否连通")
            return False
        return True

    def save_concerned_illuster_info(self, type_: str):
        """Store the illustrators followed by the account in the database.

        Existing rows are refreshed and get priority 0; new rows are
        inserted with priority 5.

        :param type_: ``hide`` for private follows, ``show`` for public ones
        :return: None
        """
        if not self.login():
            return
        page_num = self.pixiv_util.get_concerned_illusters_pagenum(type_)
        for current_page_num in range(1, page_num + 1):
            ids_, profiles, usernames = self.pixiv_util.get_concerned_illuster_info(
                type_, current_page_num)
            for illuster_id, profile, username in zip(ids_, profiles,
                                                      usernames):
                # All values below are scraped from HTML, so they are bound
                # as parameters instead of being spliced into the SQL text.
                res = self.db_util.get_one(
                    "select id from illuster where illuster_id = %s",
                    illuster_id)
                str_time = get_time()
                if res:
                    self.logger.debug("该画师已存在" + illuster_id)
                    self.db_util.update(
                        'UPDATE illuster SET name=%s, image_url=%s, modify_time=%s, priority=%s '
                        'WHERE illuster_id =%s',
                        (username, profile, str_time, 0, illuster_id))
                else:
                    self.db_util.insert(
                        "insert into illuster(illuster_id,name,image_url,create_time,modify_time,priority) "
                        "value(%s, %s, %s, %s, %s, 5)",
                        (illuster_id, username, profile, str_time, str_time))
        self.logger.info("信息保存完毕")

    def get_illuster_ids(self, num, use_data_base=False):
        """Return up to ``num`` illustrator ids that still have to be crawled.

        Ids from the configuration file come first. When there are fewer
        than ``num`` of them and ``use_data_base`` is true, the list is
        topped up from the database in the order ``WAITING_SQL`` returns.

        :param num: maximum number of ids to return
        :param use_data_base: also query the ``illuster`` table
        :return: list of id strings, without duplicates, order preserved
        """
        ids = self.config.get("download", "illuster_ids")
        ids = [] if ids is None else ids.split()
        if len(ids) >= num or not use_data_base:
            return ids[:num]
        rows = self.db_util.get_all(self.WAITING_SQL.format(num))
        ids.extend(str(row[0]) for row in rows)
        # De-duplicate while keeping first-seen order, then cut to size.
        seen = set()
        unique_ids = []
        for illuster_id in ids:
            if illuster_id not in seen:
                seen.add(illuster_id)
                unique_ids.append(illuster_id)
        return unique_ids[:num]

    def main(self, batch_size=10, use_database=False):
        """Run the crawl loop forever.

        Each round fetches a batch of illustrator ids (configuration file
        first, then the database), downloads their works and sleeps for
        ``waiting_time``. An empty batch adds a four-hour sleep first.

        :param batch_size: number of illustrators handled per round
        :param use_database: read from and write to the database
        """
        while True:
            illuster_ids = self.get_illuster_ids(batch_size, use_database)
            if len(illuster_ids) == 0:
                time.sleep(60 * 60 * 4)
            for illuster_id in illuster_ids:
                self._crawl_illuster(illuster_id, use_database)
            self.logger.info('one batch is over')
            time.sleep(self.waiting_time)

    def _crawl_illuster(self, illuster_id, use_database):
        """Download every work of one illustrator and record the progress."""
        self.logger.info("正在下载的是" + illuster_id + "的作品")
        if use_database:
            res = self.db_util.get_one(
                "select id from illuster where illuster_id = %s",
                illuster_id)
            str_time = get_time()
            info = self.pixiv_util.get_illuster_info(illuster_id)
            if info is None:
                # No profile could be fetched: park the illustrator.
                self.db_util.update(
                    "update illuster set priority = -1 WHERE illuster_id = %s",
                    (illuster_id, ))
                return
            name, img_url = info
            if res:
                self.logger.debug("该画师已存在数据库中" + illuster_id)
                self.db_util.update(
                    "update illuster set name = %s, image_url = %s, modify_time = %s where illuster_id = %s",
                    (name, img_url, str_time, illuster_id))
            else:
                self.db_util.insert(
                    "insert into illuster(illuster_id, name, image_url, create_time,modify_time,priority)"
                    " value(%s, %s, %s, %s, %s,5)",
                    (illuster_id, name, img_url, str_time, str_time))
        dir_ = os.path.join(self.root_path, illuster_id)
        make_directory(dir_)
        illust_ids = self.pixiv_util.get_illust_ids(illuster_id)
        if illust_ids is None:
            self.logger.error(
                '获得插画列表失败,画师id为{}, 开始下一个画师的信息爬取'.format(illuster_id))
            return
        illust_num = len(illust_ids)
        self.logger.info("共有插图%d个" % (illust_num))
        if use_database:
            # The row exists at this point (updated or inserted above), so
            # only an UPDATE is needed to store the work count.
            self.db_util.update(
                "update illuster set name = %s, image_url = %s, modify_time = %s, illust_num = %s where illuster_id = %s",
                (name, img_url, str_time, illust_num, illuster_id))
        if illust_num > 0:
            if use_database:
                illust_ids = self.pixiv_util.filter_4_downloaded_work(
                    illust_ids)
            # One thread-pool request per work: positional args, no kwargs.
            var_list = [([dir_, illust_id], None) for illust_id in illust_ids]
            if self.pool is None:
                self.pool = threadpool.ThreadPool(self.thread_num)
            tasks = threadpool.makeRequests(
                self.pixiv_util.download_work_by_illust_id, var_list)
            for task in tasks:
                self.pool.putRequest(task)
            self.pool.wait()

        # Drop the finished id from the configuration file. It is removed
        # by value rather than by position so that a skipped illustrator
        # does not cause a different id to be dropped.
        il_ids = self.config.get("download", "illuster_ids")
        if il_ids is not None:
            remaining = [i for i in il_ids.split() if i != illuster_id]
            self.config.update("download", "illuster_ids",
                               " ".join(remaining))
        if use_database:
            self.db_util.update(
                self.sql_4_update.format(get_time(), illuster_id))
        self.logger.info("{}的作品下载完成".format(illuster_id))
Exemplo n.º 2
0
class Downloader:
    """Downloads the files whose URLs are stored in the database."""

    def __init__(self, path):
        """Create the helpers and remember ``path`` as the download root."""
        self.db_util = MysqlUtil()
        self.spdier_util = SpiderUtil()
        self.root_path = path
        self.logger = Log(__name__).get_log()

    def get_info(self, select_sql):
        """Run ``select_sql`` and turn the rows into download jobs.

        Row columns are read by position: 0 url (bytes), 1 illust id,
        2 illustrator id, 3 illust type, 4 page count.

        :return: list of ``(url, save_path, page_count, illust_id)`` tuples
        """
        jobs = []
        for row in self.db_util.get_all(select_sql):
            if row[3] == IllustType.ugoira.value:
                # TODO: only the zip is fetched; thumbnails are handled later.
                url = Downloader.handle_url_4_ugoira(row[0])
            else:
                url = str(row[0], encoding='utf-8')
                if '.gif' in url:
                    # gif files are not handled for now
                    continue
                if not ('.jpg' in url or '.png' in url):
                    self.logger.warning('不支持的类型!illust_id为{}'.format(row[1]))
            target = self.get_path(url, str(row[2]))
            jobs.append((url, target, row[4], row[1]))
        return jobs

    def get_path(self, url: str, illuster_id: str):
        """Ensure the illustrator directory exists and return the file path."""
        illuster_dir = os.path.join(self.root_path, illuster_id)
        make_directory(illuster_dir)
        file_name = url.split("/")[-1]
        return os.path.join(illuster_dir, file_name)

    @staticmethod
    def handle_url_4_ugoira(url):
        """Derive the ugoira zip URL from the bytes URL of its first frame."""
        text = str(url, encoding="utf-8")
        return (text.replace('img-original', 'img-zip-ugoira')
                .replace('0.jpg', '1920x1080.zip')
                .replace('0.png', '1920x1080.zip'))

    def download_file(self, url, save_path, headers):
        """Download one file.

        :return: ``404`` when the helper reports 404, ``True`` when the
            download succeeded and the saved file validates, else ``False``
        """
        status = self.spdier_util.download_file(url, save_path, headers)
        if status == 404:
            self.logger.error('无法找到文件,url为{}'.format(url))
            return 404
        return bool(status
                    and valid_file(save_path, file_extension(save_path)))

    def download_file_list(self, url, save_path, headers, page_num: int,
                           illust_id):
        """Download all pages of one illust; update status only when all succeed."""
        ok = True
        for page in range(page_num):
            suffix = '_p{}'.format(page)
            page_url = url.replace('_p0', suffix)
            page_path = save_path.replace('_p0', suffix)
            ok = ok and self.download_file(page_url, page_path, headers)
            if ok == 404:
                self.db_util.update(
                    'UPDATE illust SET status = {} WHERE illust_id = {}'.
                    format(WorkStatus.unfound.value, illust_id))
                return
        if ok:
            self.db_util.update(
                'UPDATE illust SET status = {} WHERE illust_id = {}'.format(
                    WorkStatus.complete.value, illust_id))

    def main(self, select_sql, headers, thread_num, interval):
        """Poll the database for URLs and download them, forever.

        :param select_sql: SQL statement that selects the rows to download
        :param headers: request headers
        :param thread_num: number of download threads
        :param interval: pause between two database polls
        :return: never returns
        """
        pool = None
        while True:
            jobs = self.get_info(select_sql)
            print('get info from database')
            if jobs:
                param_list = []
                for job in jobs:
                    param_list.append(
                        ([job[0], job[1], headers, job[2], job[3]], None))
                if pool is None:
                    # The pool is created on first use.
                    pool = threadpool.ThreadPool(thread_num)
                for task in threadpool.makeRequests(self.download_file_list,
                                                    param_list):
                    pool.putRequest(task)
                pool.wait()
            self.logger.info('one batch is over')
            print('one batch is over')
            time.sleep(interval)
Exemplo n.º 3
0
class PixivUtil:
    def __init__(self):
        """Compile patterns, load config and build the HTTP session and helpers.

        ``init_pattern()`` and ``load_config()`` run first because the
        session setup below reads ``self.cookie``, ``self.proxies`` and
        ``self.max_retries``.
        """
        self.init_pattern()
        self.load_config()

        self.logger = Log(__name__).get_log()
        self.GET_KEY_PAGE = "https://accounts.pixiv.net/login?lang=zh&source=pc&view_type=page&ref=wwwtop_accounts_index"
        self.LOGIN_PAGE = "https://accounts.pixiv.net/api/login?lang=zh"  # login endpoint

        self.UGOIRA_URL = 'https://www.pixiv.net/ajax/illust/{illust_id}/ugoira_meta'  # ugoira metadata
        self.URL_4_GET_ALL_WORK_ID = "https://www.pixiv.net/ajax/user/{userId}/profile/all"  # all works of an illustrator
        self.URL_4_ILLUSTER_MAIN_PAGE = "https://www.pixiv.net/ajax/user/{}/profile/top"  # illustrator top page
        self.URL_4_GET_ILLUSTER_INFO = "https://www.pixiv.net/ajax/user/{}?full=1"  # illustrator details
        # Only the ajax endpoint is kept; the former member_illust.php value
        # was overwritten by this assignment before it could be used.
        self.URL_ILLUST_PAGE = "https://www.pixiv.net/ajax/illust/{illust_id}"  # illust details

        self.REFERER = "https://accounts.pixiv.net/login?lang=zh&source=pc&view_type=page&ref=wwwtop_accounts_index"
        self.USER_AGENT = "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36"

        self.headers = {'Referer': self.REFERER, 'User-Agent': self.USER_AGENT}

        self.session = requests.Session()
        if self.cookie is not None:
            self.session.cookies.set("authentication", self.cookie)
        if self.proxies['http'] is not None or self.proxies[
                'https'] is not None:
            self.session.proxies = self.proxies  # default proxies of the session

        # Retry failed requests up to max_retries times on both schemes.
        request_retry = requests.adapters.HTTPAdapter(
            max_retries=self.max_retries)
        self.session.mount('https://', request_retry)
        self.session.mount('http://', request_retry)

        self.spider_util = SpiderUtil()
        # Database access
        self.db_util = DbUtil()
        self.sql_4_insert = 'insert into illust(title,url,illust_id,illuster_id,page_no,status,`restrict`,x_restrict)' \
                            'values ( %s, %s, %s, %s, %s, 0, %s, %s)'
        self.sql_4_insert_4_ugoira = 'insert  into illust(title,url,illust_id,illuster_id,page_no,status)values( %s, %s, %s, %s, %s,444)'
        self.sql_4_insert_2_done = 'insert  into illust(title,url,illust_id,illuster_id,page_no,status)values( %s, %s, %s, %s, 0, 10)'

    def get_session(self):
        if self.login():
            return self.session
        else:
            self.logger.error("登陆失败,请检查账号密码是否正确和网络是否连通")
            return 'error'

    def load_config(self):
        """Read proxy, retry, timeout and account settings from the config file.

        Sets ``proxies``, ``max_retries``, ``timeout``, ``username``,
        ``password`` and ``cookie`` on the instance.
        """
        config = ConfigureUtil('config/config.conf')
        http_proxy = config.get('proxy', 'http', is_error=True)
        https_proxy = config.get('proxy', 'https', is_error=True)
        self.proxies = {'http': http_proxy, 'https': https_proxy}
        # The retry count has to be numeric.
        self.max_retries = config.get('app', 'max_retries', 'int')
        # Request timeout in seconds.
        self.timeout = config.get('app',
                                  'time_out',
                                  type_="int",
                                  is_error=True,
                                  default=20)
        self.username = config.get('account', 'username')
        self.password = config.get('account', 'password')
        # The session cookie is taken from the config file only. A real
        # cookie must never be pasted into the source, not even in a comment.
        self.cookie = config.get('account', 'cookie', is_error=True)

    def init_pattern(self):
        self.illuster_id_from_user = re.compile(
            '<input name="id\[\]" value="(.*?)" type="checkbox"')
        self.illuster_profile_from_user = re.compile(
            'data-profile_img="(.*?)"')
        self.illuster_username_from_user = re.compile(
            '"data-user_name="(.*?)"></a>')
        self.pagenum_4_show = re.compile(
            '<a href="\?type=user&amp;rest=show&amp;p=(.*?)">')
        self.pagenum_4_hide = re.compile(
            '<a href="\?type=user&amp;rest=hide&amp;p=(.*?)">')
        # 用于获得作品详情
        self.get_illust_detail = re.compile(
            '}\)\((.*?)\);</script><link rel="apple-touch-icon"')

    def set_logger(self, logger):
        self.logger = logger

    def download_work_by_illust_id(self,
                                   save_dir,
                                   illust_id: str,
                                   use_databse=False):
        if use_databse:
            res = self.db_util.get_one(
                "select status from illust where illust_id = " + illust_id)
            if res is not None:
                if res[0] == WorkStatus.done.value:  # 已被下载好
                    self.logger.info(" 本插画已被下载 illust_id 是 " + illust_id)
                    return True
        # 获得插画信息
        illust_info = self.get_img_info_by_img_id(illust_id)
        if illust_info is None:
            return
        (title, img_url, page_count, restrict, x_restrict, illust_type,
         illuster_id) = illust_info
        # 判断文件类型
        if illust_type == IllustType.illust.value:
            self.download_illust(save_dir, illust_id, illuster_id, title,
                                 img_url, page_count, restrict, x_restrict)
        elif illust_type == IllustType.ugoira.value:  # 新遇到的动图
            self.logger.warning("遇到ugoira插画,id为{}".format(illust_id))
            # return
            self.download_ugoira(save_dir, illust_id, illuster_id, title,
                                 img_url, restrict, x_restrict)
        elif illust_type == IllustType.manga.value:
            self.download_manga(illust_id, illuster_id, title, img_url,
                                page_count, restrict, x_restrict)
        else:
            self.logger.info("暂不支持该类型的下载")

    def download_illust(self,
                        save_dir,
                        illust_id,
                        illuster_id,
                        title,
                        img_url,
                        page_count,
                        restrict,
                        x_restrict,
                        use_database=False):
        res = True
        for i in range(page_count):
            try:
                temp = img_url.replace('_p0', '_p' + str(i))
                # self.insert_illust(illust_id, IllustType.illust.value, title, img_url,
                #                    illuster_id, page_count, restrict, x_restrict)
                res = res and self.spider_util.download_img(
                    temp, save_dir, header=self.headers)
            except Exception as e:
                res = False
                self.logger.error("下载失败" + repr(e) + "url是" + temp,
                                  exc_info=True)
                break
        if use_database:
            if res:
                self.insert_illust(illust_id, title, img_url, illuster_id,
                                   page_count, restrict, x_restrict,
                                   WorkStatus.done.value)
            else:
                # 数据库中保存下载失败的记录
                self.insert_illust(illust_id, title, img_url, illuster_id,
                                   page_count, restrict, x_restrict,
                                   WorkStatus.failure.value)

    def download_illust_o(self, save_dir, illust_id, illuster_id, title,
                          img_url, page_count, restrict, x_restrict):
        """Insert a record for and download every page of an illust.

        A row is written per page before it is downloaded; once all pages
        succeed the illust is marked as done.

        :return: None
        """
        res = True
        for i in range(page_count):
            # Derive each page URL from the untouched first-page URL.
            # Overwriting img_url would leave no '_p0' to replace after the
            # second page, so later pages would reuse the '_p1' URL.
            page_url = img_url
            try:
                page_url = img_url.replace('_p0', '_p' + str(i))
                # NOTE(review): the second positional argument here is the
                # illust type, while insert_illust declares ``title`` in
                # that position - confirm the intended argument order.
                self.insert_illust(illust_id, IllustType.illust.value, title,
                                   page_url, illuster_id, page_count, restrict,
                                   x_restrict)
                res = res and self.spider_util.download_img(
                    page_url, save_dir, header=self.headers)
            except Exception as e:
                res = False
                self.logger.error("下载失败" + repr(e) + "url是" + str(page_url),
                                  exc_info=True)
                break
        if res:
            self.db_util.update(
                'update illust set status = %s where illust_id = %s',
                (WorkStatus.done.value, illust_id))

    def filter_4_downloaded_work(self, illust_id_list):
        """查询数据库,看是否有对应的作品已经被下载完成了"""
        str_illust_id = ",".join(
            str(illust_id) for illust_id in illust_id_list)
        illust_sql = 'select illust_id from illust WHERE status = {} AND illust_id in ({})' \
            .format(WorkStatus.done.value, str_illust_id)
        # ugoira_sql = 'select ugoira_id from ugoira WHERE status = {} AND ugoira_id in ({})' \
        #     .format(WorkStatus.done.value, str_illust_id)
        res_in_illust = self.db_util.get_all(illust_sql)
        # res_in_ugoira = self.db_util.get_all(ugoira_sql)
        downloaded_illust_ids = [i[0] for i in res_in_illust]
        # res_in_work.append([i[0] for i in res_in_ugoira])
        for illust_id in downloaded_illust_ids:
            if str(illust_id) in illust_id_list:
                illust_id_list.remove(str(illust_id))
        return illust_id_list
        # not_downloaded_work_ids = []
        # for i in illust_id_list:
        #     if int(i) not in downloaded_illust_ids:
        #         not_downloaded_work_ids.append(i)
        # return not_downloaded_work_ids

    def get_postkey(self, url):
        """获取需要post的数据postkey"""
        pat = 'name="post_key" value="(.*?)"'
        # 不用组装headers也能拿到postKey,但是一定要是get,不能使用post
        content = self.session.get(url, timeout=15).text
        res_temp = re.findall(pat, content)
        if len(res_temp) != 1:
            self.logger.error("无法获得postKey", exc_info=True)
            return None
        postkey = re.findall(pat, content)[0]
        self.logger.info("your post key is " + postkey)
        return postkey

    def login(self):
        """模拟登陆"""
        try:
            postkey = self.get_postkey(self.GET_KEY_PAGE)
            if not postkey:
                return False
            post_data = \
                {
                    "pixiv_id": self.username,
                    "password": self.password,
                    # "captcha": "",
                    # "g_recaptcha_response":"",
                    "post_key": postkey,
                    # "source":"pc",
                    "ref": "wwwtop_accounts_index",
                    "return_to": "https://www.pixiv.net/"
                }
            # 装个头,能解决一些问题
            result = self.session.post(self.LOGIN_PAGE,
                                       data=post_data,
                                       headers=self.headers,
                                       cookies={"cookies": self.cookie})
            pat = '"body":{"(.*?)"'
            is_login_flag = re.findall(pat, result.text)[0]
            if is_login_flag == 'success':
                self.logger.info("Log in successfully.Your username is " +
                                 self.username)
                return self.session
            else:
                self.logger.info("Login failed")
                return False
        except Exception as e:
            self.logger.error('连接无响应', exc_info=True)
            return False

    def get_illust_ids(self, illuster_id):
        """
        根据画师Id获得画师的插画IdList
        插画包括动图和静态图
        :param illuster_id: 画师id
        :return:
        """
        try:
            # 获得所有的图片Id
            illust_id_json = self.get(
                self.URL_4_GET_ALL_WORK_ID.format(userId=illuster_id)).text
            illust_id_json = str2json(illust_id_json)
            ill_ids = illust_id_json["body"]["illusts"]
            # 示例:{[illust_id]:[illust_info],[illust_id]:[illust_info],[illust_id]:[illust_info]...}
            if ill_ids != []:
                return list(ill_ids.keys())
            else:
                return []
        except Exception as e:
            self.logger.error("搜索画师失败!" + repr(e) + "illuster_id是" +
                              illuster_id,
                              exc_info=True)
            return None

    def get_manga_ids(self, illuster_id):
        """Return the manga ids of an illustrator.

        Request and parsing errors are not caught here; they propagate to
        the caller.

        :param illuster_id: illustrator id
        :return: list of manga ids, ``[]`` when there are none
        """
        response = self.get(
            self.URL_4_GET_ALL_WORK_ID.format(userId=illuster_id))
        manga_ids = str2json(response.text)["body"]["manga"]
        # Same empty-list guard as get_illust_ids: a list has no keys().
        if manga_ids == []:
            return []
        return list(manga_ids.keys())

    def get_img_info_by_img_id(self, illust_id):
        """
        根据插画或漫画的Id获得插画或者漫画的详细信息
        :param img_id:
        :return:
        """
        try:
            # info = self.session.get(self.URL_ILLUST_PAGE.format(illust_id=illust_id), timeout=self.timeout,
            #                         cookies={"cookies": self.cookie}).text
            info = self.get(
                self.URL_ILLUST_PAGE.format(illust_id=illust_id)).text
            illust_info_json = str2json(info)
            if self.isError(illust_info_json):
                self.logger.error(
                    "找不到illust信息,illust_id is {},error message is {}".format(
                        illust_id, illust_info_json['message']),
                    exc_info=True)
                return None
            body = illust_info_json['body']
            illust_type = body['illustType']
            page_count = body['pageCount']
            restrict = body['restrict']
            x_restrict = body['xRestrict']
            title = body['title']
            url = body['urls']['original']
            illuster_id = body['userId']
            return title, url, page_count, restrict, x_restrict, illust_type, illuster_id
        except Exception as e:
            self.logger.error("获取插画信息失败" + repr(e) + "illust_id是" + illust_id,
                              exc_info=True)
            return None

    # def is_illuster_exist(self, illuster_id):
    #     try:
    #         illust_id_json = self.get(self.URL_4_GET_ALL_WORK_ID.format(userId=illuster_id)).text  # 获得所有的图片Id
    #         illust_id_json = str2json(illust_id_json)
    #         if self.isError(illust_id_json):
    #             self.logger.error("画师不存在!illuster_id是{}".format(illuster_id), exc_info=True)
    #             return False
    #         else:
    #             return True
    #     except Exception as e:
    #         self.logger.error("无法判断画师是否存在!illuster_id是{}".format(illuster_id), exc_info=True)
    #         return None

    def get_concerned_illuster_info(self, type_, current_page_num):
        url = 'https://www.pixiv.net/bookmark.php?type=user&rest={}&p={}'.format(
            type_, current_page_num)
        html = self.get(url).text
        ids = self.illuster_id_from_user.findall(html)
        profiles = self.illuster_profile_from_user.findall(html)
        usernames = self.illuster_username_from_user.findall(html)
        return ids, profiles, usernames

    def get_show_pagenum(self, html):
        page = self.pagenum_4_show.findall(html)
        return len(page)

    def get_hide_pagenum(self, html):
        page = self.pagenum_4_hide.findall(html)
        return len(page)

    def get_concerned_illusters_pagenum(self, type_: str):
        """获得关注的画家的大致信息
        type 为 hide或者show"""
        content = self.session.get(
            'https://www.pixiv.net/bookmark.php?type=user&rest={}'.format(
                type_),
            cookies={
                "cookies": self.cookie
            }).text
        page_num = 0
        if type_ == 'hide':
            page_num = self.get_hide_pagenum(content)
        elif type_ == 'show':
            page_num = self.get_show_pagenum(content)
        return page_num

    def get_pagenum_hide(self, html):
        page = self.pagenum_4_hide.findall(html)
        return len(page)

    def get_ugoira_info(self, illust_id):
        """Fetch the ugoira (animation) metadata of an illust.

        :param illust_id: id of the work
        :return: ``(frames, page_num, zip_url, delays)`` where ``frames``
            maps frame file name to delay, ``page_num`` is the number of
            frames, ``zip_url`` is the ``originalSrc`` value and ``delays``
            lists the delays in frame order
        """
        meta_url = self.UGOIRA_URL.format(illust_id=illust_id)
        response = self.session.get(meta_url,
                                    cookies={"cookies": self.cookie})
        body = str2json(response.text)["body"]
        delays = [frame["delay"] for frame in body["frames"]]
        frames = {frame['file']: frame['delay'] for frame in body['frames']}
        zip_url = body["originalSrc"]
        return frames, len(delays), zip_url, delays

    def download_ugoira(self, save_dir, illust_id, illuster_id, title, url,
                        restrict, x_restrict):
        """Download an ugoira's zip archive and record the outcome.

        The archive is saved as ``<save_dir>/<illust_id>.zip``. The database
        row is written through ``insert_ugoira`` with status
        ``WorkStatus.done`` when the download helper reports success and
        ``WorkStatus.failure`` otherwise. Any exception is logged and
        swallowed, so this method never raises.

        :param save_dir: directory the archive is written to.
        :param illust_id: id of the ugoira; also the archive's base name.
        :param illuster_id: id of the owning illuster.
        :param title: title stored with the record.
        :param url: url stored with the record.
        :param restrict: stored as-is in the ``restrict`` column.
        :param x_restrict: stored as-is in the ``x_restrict`` column.
        """
        try:
            _frames, page_num, zip_url, delays = self.get_ugoira_info(
                illust_id)
            # str() keeps path building working when the id arrives as an int.
            zip_path = os.path.join(save_dir, str(illust_id) + ".zip")
            downloaded = self.spider_util.download_ugoira(
                zip_url, zip_path, self.headers)
            delays_text = ",".join(str(delay) for delay in delays)
            if downloaded:
                status = WorkStatus.done.value
            else:
                status = WorkStatus.failure.value
            self.insert_ugoira(illust_id,
                               title,
                               url,
                               illuster_id,
                               page_num=page_num,
                               restrict=restrict,
                               x_restrict=x_restrict,
                               status=status,
                               delays=delays_text)
        except Exception:
            self.logger.error('下载动图时,遇到问题', exc_info=True)

    def insert_illust(self,
                      illust_id,
                      title=None,
                      url=None,
                      illuster_id=None,
                      page_num=None,
                      restrict=None,
                      x_restrict=None,
                      status=None):
        """Insert the ``illust`` row for ``illust_id`` or update it if present.

        ``loc_url`` is stored as ``"<illuster_id>/<last path segment of url>"``.
        Despite its ``None`` default, ``url`` must be a string: it is split
        to build ``loc_url`` before anything is written.

        :param illust_id: id used to look up / key the row.
        :param title: stored title.
        :param url: remote url of the illustration.
        :param illuster_id: id of the owning illuster.
        :param page_num: stored in the ``page_no`` column.
        :param restrict: stored in the ``restrict`` column.
        :param x_restrict: stored in the ``x_restrict`` column.
        :param status: stored in the ``status`` column.
        """
        existing = self.db_util.get_one(
            'SELECT id  FROM illust WHERE illust_id = %s ', illust_id)
        loc_url = str(illuster_id) + "/" + url.split('/')[-1]
        if existing:
            # A row already exists: refresh its mutable columns.
            self.db_util.update(
                'UPDATE illust SET status = %s, page_no=%s, `restrict`=%s, x_restrict=%s, title=%s, '
                'loc_url=%s, type=%s WHERE illust_id=%s',
                (status, page_num, restrict, x_restrict, title, loc_url,
                 IllustType.illust.value, illust_id))
            return
        # Nothing stored yet for this illust: create the row.
        self.db_util.insert(
            'INSERT INTO illust(title, url, illust_id, illuster_id, page_no, `type`, status,`restrict`,'
            'x_restrict, loc_url)  VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)',
            (title, url, illust_id, illuster_id, page_num,
             IllustType.illust.value, status, restrict, x_restrict, loc_url))

    def insert_ugoira(self,
                      illust_id,
                      title,
                      url,
                      illuster_id,
                      page_num,
                      restrict=None,
                      x_restrict=None,
                      status=None,
                      delays=None):
        """Insert or update the database records of an ugoira.

        When no ``illust`` row of type ugoira exists for ``illust_id``, one
        row is added to ``ugoira`` (id and delays) and one to ``illust``.
        Otherwise the existing ``illust`` row is updated in place.

        :param illust_id: id of the ugoira.
        :param title: stored title.
        :param url: remote url; its last path segment becomes part of
            ``loc_url`` (``"<illuster_id>/<segment>"``).
        :param illuster_id: id of the owning illuster.
        :param page_num: stored in the ``page_no`` column.
        :param restrict: stored in the ``restrict`` column.
        :param x_restrict: stored in the ``x_restrict`` column.
        :param status: stored in the ``status`` column.
        :param delays: stored in ``ugoira.delays`` when the row is created.
        """
        select_sql = 'SELECT id  FROM illust WHERE illust_id = %s AND type = {}'.format(
            IllustType.ugoira.value)
        res = self.db_util.get_one(select_sql, illust_id)
        loc_url = str(illuster_id) + "/" + url.split('/')[-1]
        if not res:  # no matching row in the database yet
            insert_sql = "INSERT INTO ugoira( ugoira_id, delays) VALUES (%s, %s)"
            self.db_util.insert(insert_sql, (illust_id, delays))
            insert_sql = 'INSERT INTO illust(title, url, illust_id, illuster_id, page_no, `type`, status,`restrict`,' \
                         'x_restrict, loc_url)  VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)'
            self.db_util.insert(insert_sql,
                                (title, url, illust_id, illuster_id, page_num,
                                 IllustType.ugoira.value, status, restrict,
                                 x_restrict, loc_url))
        else:
            update_sql = 'UPDATE illust SET status = %s, page_no=%s, `restrict`=%s, x_restrict=%s, title=%s, ' \
                         'loc_url=%s, type=%s WHERE illust_id=%s'
            # Parameters must follow the placeholder order. Previously the
            # status column was always written as "done" and the type column
            # received the caller's status value.
            self.db_util.update(
                update_sql,
                (status, page_num, restrict, x_restrict, title, loc_url,
                 IllustType.ugoira.value, illust_id))

    def update_illust(self,
                      illust_id,
                      page_num,
                      illust_status=WorkStatus.done.value):
        """Write a new status and page count to the ``illust`` row.

        :param illust_id: id of the row to update.
        :param page_num: value for the ``page_no`` column.
        :param illust_status: value for the ``status`` column; defaults to
            ``WorkStatus.done.value``.
        """
        params = (illust_status, page_num, illust_id)
        self.db_util.update(
            'UPDATE illust SET status = %s , page_no=%s WHERE illust_id = %s',
            params)

    def isError(self, json):
        """Return the ``error`` entry of a decoded response mapping."""
        error_flag = json['error']
        return error_flag

    def download_manga(self, illust_id, illuster_id, title, img_url,
                       page_count, restrict, x_restrict):
        """Manga download hook; not implemented here.

        :raises NotImplementedError: always.
        """
        raise NotImplementedError

    def get(self, url):
        """GET ``url`` through the shared session.

        The request carries ``self.cookie`` (under the cookie name
        ``"cookies"``) and uses ``self.timeout`` as its timeout.

        :return: whatever ``self.session.get`` returns.
        """
        request_options = {
            "cookies": {"cookies": self.cookie},
            "timeout": self.timeout,
        }
        return self.session.get(url, **request_options)

    def get_illuster_info(self, illuster_id):
        """Fetch an illuster's name and large profile image url.

        :param illuster_id: id substituted into
            ``self.URL_4_GET_ILLUSTER_INFO``.
        :return: ``(name, img_url)`` taken from the response body's ``name``
            and ``imageBig`` fields, or ``None`` when the response reports an
            error or the lookup fails for any reason (the failure is logged).
        """
        try:
            content = self.get(
                self.URL_4_GET_ILLUSTER_INFO.format(illuster_id)).text
            content_json = str2json(content)
            if self.isError(content_json):
                # No exception is active on this path, so exc_info=True would
                # only append a meaningless "NoneType: None" traceback.
                self.logger.error("画师不存在!illuster_id是{}".format(illuster_id))
                return None
            body = content_json['body']
            return body['name'], body['imageBig']
        except Exception:
            self.logger.error("无法判断画师是否存在!illuster_id是{}".format(illuster_id),
                              exc_info=True)
            return None
Exemplo n.º 4
0
class checker():
    def __init__(self, root_path):
        """Set up SQL templates, the database helper and the logger.

        :param root_path: directory that is listed by the check methods; its
            entry names are used as illuster ids.
        """
        self.root_path = root_path
        # Id of the illuster handled by the previous loop iteration, if any.
        self.before_illuster_id = None
        self.db_util = MysqlUtil()
        self.logger = Log(__name__, log_cate='checker').get_log()

        # Queries against the illust table.
        self.SELECT_SQL = ("SELECT page_no, loc_url,status "
                           "FROM illust WHERE illust_id={}")
        self.RESET_ILLUSTER_SQL = ("UPDATE illust SET status = 1 "
                                   "WHERE illuster_id = {}")
        self.RESET_ILLUST_SQL = ("UPDATE illust SET status = 1 "
                                 "WHERE illuster_id = {} AND illust_id NOT IN ({})")
        self.FILTER_SQL = ("SELECT illust_id FROM illust "
                           "WHERE status = 10 AND illust_id in ({})")

        # Queries against the illuster table; priority = 6 marks an illuster
        # as already checked.
        self.MAKE_ILLUSTER_STATUS_DONE_SQL = ("UPDATE illuster SET priority = 6 "
                                              "WHERE illuster_id = {}")
        self.CHECK_IF_DONE = ('SELECT priority FROM illuster '
                              'WHERE illuster_id={}')
        self.GET_DONE_ILLUSTER = ('SELECT illuster_id FROM illuster '
                                  'WHERE priority=6')

    def get_done_illuster(self):
        illuster_ids = self.db_util.get_all(self.GET_DONE_ILLUSTER)
        done_illuster_ids = [i[0] for i in illuster_ids]
        return done_illuster_ids

    def check_empty_dir(self, ignore=[]):
        try:
            ignore = ignore + self.get_done_illuster()
            for file_name in os.listdir(self.root_path):
                if file_name in ignore:
                    continue
                print(file_name)
                # path 即 illuster_id
                if self.before_illuster_id is not None:
                    # 将上一个设为完成
                    self.db_util.update(
                        self.MAKE_ILLUSTER_STATUS_DONE_SQL.format(
                            self.before_illuster_id))
                illuster_id = file_name
                self.before_illuster_id = illuster_id
                # 如果当前的illuster已经处理过了,就跳到下一个
                if self.db_util.get_one(
                        self.CHECK_IF_DONE.format(illuster_id))[0] == 6:
                    continue
                path = os.path.join(self.root_path, file_name)
                if os.path.isdir(path):
                    image_files = os.listdir(path)
                    if len(image_files) == 0:
                        print("no images in {}".format(path))
                        self.db_util.update(
                            self.RESET_ILLUSTER_SQL.format(illuster_id))
                        continue
        except Exception as e:
            self.logger.error('some problem happen', exc_info=1)
            raise e

    def check(self, ignore=[]):
        try:
            ignore = ignore + self.get_done_illuster()
            for file_name in os.listdir(self.root_path):
                if file_name in ignore:
                    continue
                print(file_name)
                # path 即 illuster_id
                if self.before_illuster_id is not None:
                    # 将上一个设为完成
                    self.db_util.update(
                        self.MAKE_ILLUSTER_STATUS_DONE_SQL.format(
                            self.before_illuster_id))
                illuster_id = file_name
                self.before_illuster_id = illuster_id
                if self.db_util.get_one(
                        self.CHECK_IF_DONE.format(illuster_id))[0] == 6:
                    continue
                path = os.path.join(self.root_path, file_name)
                if os.path.isdir(path):
                    image_files = os.listdir(path)
                    if len(image_files) == 0:
                        print("no images in {}".format(path))
                        self.db_util.update(
                            self.RESET_ILLUSTER_SQL.format(illuster_id))
                        continue
                    illust_ids = []
                    for file_ in image_files:
                        if file_.endswith('.zip'):
                            illust_id = file_.replace('.zip', '')
                        elif file_.endswith('.gif'):
                            # todo 暂不处理gif
                            continue
                        else:
                            illust_id = file_.split("_p")[0]
                        if illust_id not in illust_ids:
                            illust_ids.append(illust_id)
                    complete_illust_ids = []
                    # if not illust_ids:
                    #     continue
                    # illust_id_list = self.db_util.get_all(self.FILTER_SQL.format(','.join(illust_ids)))
                    # illust_ids = [i[0] for i in illust_id_list]
                    # if illust_ids:
                    #     continue
                    for illust_id in illust_ids:
                        res = True
                        info = self.db_util.get_one(
                            self.SELECT_SQL.format(illust_id))
                        # 如果是数据库中没有信息
                        if not info:
                            self.logger.info(
                                '数据库中没有信息,illust_id为{}'.format(illust_id))
                            print(illust_id)
                            continue
                        (page_no, loc_url, status) = info
                        if status < WorkStatus.done.value:  # 10是下载成功状态,小于10 表示不成功
                            continue
                        loc_url = str(loc_url, encoding='utf-8')
                        if 'gif' in loc_url:
                            # 暂时不处理gif,理论上。gif文件不会进入这里来
                            complete_illust_ids.append(illust_id)
                            continue
                        elif 'ugoira' in loc_url:
                            loc_url = loc_url.replace('_ugoira0.jpg', '.zip')
                            loc_url_temp = loc_url.replace(
                                '_ugoira0.png', '.zip')
                            if valid_file(
                                    os.path.join(self.root_path,
                                                 loc_url_temp)):
                                complete_illust_ids.append(illust_id)
                            continue
                        for i in range(page_no):
                            loc_url_temp = loc_url.replace(
                                "_p0", "_p{}".format(i))
                            res = res and valid_file(
                                os.path.join(self.root_path, loc_url_temp))
                        # 理论上只有完整的jpg,png到这边来
                        if res:
                            complete_illust_ids.append(illust_id)
                    if not complete_illust_ids:
                        # 全部设为未完成
                        self.db_util.update(
                            self.RESET_ILLUSTER_SQL.format(illuster_id))
                        # self.db_util.update(self.RESET_ILLUST_SQL.format(",".join(incomplete_illust_ids)))
                    else:
                        self.db_util.update(
                            self.RESET_ILLUST_SQL.format(
                                illuster_id, ','.join(complete_illust_ids)))
                    print(complete_illust_ids)
            # 把最后一个illuster_id进行处理
            if self.before_illuster_id is not None:
                self.db_util.update(
                    self.MAKE_ILLUSTER_STATUS_DONE_SQL.format(
                        self.before_illuster_id))
        except Exception as e:
            self.logger.error('some problem happen', exc_info=1)
            raise e