Exemplo n.º 1
0
def stop(blog_name):
    """
    Stop a blog by setting its status to 0.

    Prints a confirmation on success, or a "does not exist" message when no
    blog with that name is found.

    :param blog_name: name of the blog to stop
    """
    manager = dbm.DbManager()
    blog = manager.find(model.Blog, model.Blog.name == blog_name)
    if not blog:
        print('%s不存在' % blog_name)
        return
    # Status 0 marks the blog as stopped; persist the change.
    blog.status = 0
    manager.add_data(blog)
    print('停止成功')
Exemplo n.º 2
0
def add(blog_name):
    """
    Register a new blog unless one with the same name is already stored.

    :param blog_name: name of the blog to add
    """
    manager = dbm.DbManager()
    if manager.find(model.Blog, model.Blog.name == blog_name):
        print('%s已经存在' % blog_name)
        return
    # New blogs are stamped with the current Unix time (whole seconds).
    created_at = int(time.time())
    manager.add_data(model.Blog(name=blog_name, create_time=created_at))
    log.info("%s添加完成" % blog_name)
    print('添加完成')
Exemplo n.º 3
0
def fix_md5_by_file(filename):
    """
    Back-fill the stored MD5 of an item from its downloaded file.

    The item id is the part of the file's base name before the first dot.
    Only an item with that id and status 3 whose md5 is still empty is
    updated; anything else is left untouched.

    :param filename: path of the downloaded file, named ``<item id>.<ext>``
    :return: False when ``filename`` is not an existing file, otherwise None
    :raises ValueError: when the base name does not start with an integer id
    """
    print('检查文件%s' % filename)
    if not os.path.isfile(filename):
        return False
    md5_val = get_file_md5(filename)
    item_id = int(os.path.basename(filename).split('.')[0])
    db = dbm.DbManager()
    try:
        item = db.session.query(model.Item).filter(
            model.Item.id == item_id, model.Item.status == 3).first()
        if item and not item.md5:
            item.md5 = md5_val
            db.session.add(item)
            db.session.commit()
            print('[%s] 修改md5' % item.id)
    finally:
        # Close the session even when the query or commit raises.
        db.session.close()
Exemplo n.º 4
0
def show(blog_name=''):
    """
    Print name, status and last update time of stored blogs.

    With the default empty name the selection is unfiltered and a descending
    ``update_time`` ordering is passed to the select; otherwise only the
    blog with the given name is selected.

    :param blog_name: name of the blog to show, or '' for all blogs
    """
    manager = dbm.DbManager()
    if blog_name == '':
        rows = manager.select(model.Blog, True, model.Blog.update_time.desc())
    else:
        rows = manager.select(model.Blog, model.Blog.name == blog_name)

    if not rows:
        # Nothing matched: report on stdout and in the log.
        message = '%s 不存在' % blog_name
        print(message)
        log.error(message)
        return

    for blog in rows:
        # Status 1 is shown as running, everything else as stopped.
        state = '启动' if blog.status == 1 else '停止'
        stamp = time.strftime("%Y-%m-%d %H:%M:%S",
                              time.localtime(blog.update_time))
        print('%s [%s] 更新时间:%s' % (blog.name, state, stamp))
Exemplo n.º 5
0
def update(blog_name, thread_num=10, log=None):
    """
    Refresh a blog's stored post count and crawl all its pages in a thread pool.

    :param blog_name: name of the blog to crawl
    :param thread_num: maximum number of worker threads; capped at the
        number of pages to fetch
    :param log: logger shared with the workers; when omitted, the standard
        ``logging`` logger for this module is used
    :return: True once every page request has been processed; -1 when
        ``get_total_post`` returns -1 (passed through unchanged); False when
        no post count is available or the blog has no stored row
    """
    if log is None:
        # The default used to crash at the first log call; fall back instead.
        import logging
        log = logging.getLogger(__name__)

    # Total number of posts of the blog.
    total = get_total_post(blog_name, log)
    print(total)
    if not total:
        return False
    if total == -1:
        return -1

    db = dbm.DbManager()
    session = db.get_session()
    try:
        blog_data = session.query(
            model.Blog).filter(model.Blog.name == blog_name).first()
        if blog_data is None:
            log.error('%s 不存在' % blog_name)
            return False
        blog_data.total_post = total
        session.add(blog_data)
        session.commit()
    finally:
        # Release the session even when the query or commit fails.
        session.close()

    perpage = 10
    # NOTE: when total is an exact multiple of perpage this schedules one
    # extra trailing page; kept as in the original.
    page_count = total // perpage + 1
    thread_num = min(thread_num, page_count)

    # One lock instance is handed to every page worker.
    lock = threading.Lock()

    log.info('开始执行: 启动%s个线程下载%s个博文' % (thread_num, total))
    pool = threadpool.ThreadPool(thread_num)
    requests_list = [
        ([blog_name, perpage, page, lock, log, thread_num], None)
        for page in range(1, page_count + 1)
    ]
    for req in threadpool.makeRequests(catch_html, requests_list):
        pool.putRequest(req)
    pool.wait()
    return True
Exemplo n.º 6
0
def download_img(one_data,
                 try_times=1,
                 log=None,
                 thread_num=0,
                 key=0,
                 lock=None):
    """
    Download one item to disk and return the MD5 of its content.

    The file is stored under a per-day download directory, split into
    ``video``/``pic``, then by blog name and by the post's date. Interrupted
    transfers are retried recursively. After a successful download, the file
    is deleted again if another item already has the same MD5.

    :param one_data: dict describing the item; reads 'id', 'url',
        'blog_name', 'type' (1 = video, anything else = picture) and 'time'
        (Unix timestamp of the post)
    :param try_times: number of the current attempt, starting at 1; retries
        stop once this exceeds 3
    :param log: logger; when omitted, the standard ``logging`` logger for
        this module is used
    :param thread_num: number of worker threads, used only to label the
        progress bar
    :param key: index of this task among all tasks, for display and logs
    :param lock: lock handed to ``get_position`` / ``unset_position``
    :return: hex MD5 digest on success, False on failure
    """
    if log is None:
        import logging
        log = logging.getLogger(__name__)

    # Absolute download root, one sub-directory per day.
    target_path = '/Volumes/hhd/python_download/tum/'
    target_path = os.path.join(
        target_path, 'download_' + time.strftime("%Y-%m-%d", time.localtime()))
    try:
        # mkdir (not makedirs) so a missing parent still fails loudly; the
        # except makes concurrent creation by another thread harmless.
        os.mkdir(target_path)
    except FileExistsError:
        pass

    try:
        post_day = time.strftime("%Y-%m-%d",
                                 time.localtime(one_data['time']))
        if one_data['type'] == 1:
            # Videos: longer timeout, fixed extension.
            time_limit = 30
            ext = '.mp4'
            kind_dir = 'video'
        else:
            # Pictures: shorter timeout, extension taken from the URL.
            time_limit = 10
            ext = os.path.splitext(one_data['url'])[1]
            kind_dir = 'pic'
        new_dir = os.path.join(target_path, kind_dir, one_data['blog_name'],
                               'post_' + post_day)
        os.makedirs(new_dir, exist_ok=True)
        new_filename = os.path.join(new_dir, str(one_data['id']) + ext)
        url = one_data['url']
    except Exception as e:
        log.info('发生错误:%s url:%s' % (str(e), one_data.get('url')))
        return False

    proxies = {
        "http": "http://127.0.0.1:1087",
        "https": "https://127.0.0.1:1087",
    }
    try:
        r = requests.get(url, proxies=proxies, stream=True, timeout=time_limit)
    except Exception as e:
        log.info('发生错误:%s url:%s' % (str(e), one_data['url']))
        return False

    # requests re-raises low-level read failures as its own exception types,
    # so those must be listed for the retry branch to be reachable.
    retryable = (http.client.IncompleteRead, socket.timeout,
                 requests.exceptions.ChunkedEncodingError,
                 requests.exceptions.ConnectionError,
                 requests.exceptions.Timeout)
    m = hashlib.md5()
    position = None
    retry_error = None
    try:
        # Content-Length is optional; tqdm accepts total=None.
        content_length = r.headers.get('Content-Length')
        size = int(content_length) // 1024 if content_length else None
        position = get_position(lock, log) + 1
        log.info('key:%s thread_num:%s postion:%s' %
                 (str(key), str(thread_num), str(position)))
        # Guard against the default thread_num=0 (modulo by zero).
        label = key % thread_num if thread_num else key
        t = tqdm(iterable=r.iter_content(1024),
                 total=size,
                 unit='k',
                 desc='%d' % label,
                 position=position)
        try:
            with open(new_filename, 'wb') as f:
                for data in t:
                    m.update(data)
                    f.write(data)
        finally:
            t.clear()
            t.close()
    except retryable as ie:
        retry_error = ie
    except Exception as e:
        log.info('发生错误:%s url:%s' % (str(e), one_data['url']))
        return False
    finally:
        # Presumably frees the progress-bar slot reserved by get_position;
        # done on every exit path so failed downloads do not keep it.
        if position is not None:
            unset_position(position - 1, lock)
        # A streamed response holds its connection until it is closed.
        r.close()

    if retry_error is not None:
        # Timed-out or incomplete download: retry a limited number of times.
        if try_times > 3:
            log.error('id: %s 尝试次数过多 url:%s' %
                      (one_data['id'], one_data['url']))
            return False
        log.info('id: %s 获取不为完整,重试: %s' %
                 (one_data['id'], str(retry_error)))
        # Pass the lock along; the retry needs it for get_position.
        return download_img(one_data, try_times + 1, log, thread_num, key,
                            lock)

    md5_val = m.hexdigest()
    # Drop the file again when another item already has identical content.
    db = dbm.DbManager()
    try:
        exist_md5 = db.session.query(model.Item).filter(
            model.Item.md5 == md5_val,
            model.Item.id != one_data['id']).first()
        if exist_md5:
            log.info('%s md5重复:%s,删除文件:%s' %
                     (one_data['id'], exist_md5.id, new_filename))
            os.remove(new_filename)
    finally:
        db.session.close()
    return md5_val
Exemplo n.º 7
0
def download(key, lock, log, progress, thread_num):
    """
    Thread worker: claim one pending item, download it and record the result.

    Under ``lock`` the first item with status 0 is fetched and marked with
    status 1. After ``download_img`` returns, the item is set to status 2 on
    failure, or to status 3 with its MD5 on success.

    :param key: index of this task among all tasks, for display and logs
    :param lock: lock serialising the claim of the next pending item
    :param log: logger shared by all workers
    :param progress: progress bar object; advanced by one on success
    :param thread_num: number of worker threads, forwarded to download_img
    :return: True when the item was downloaded, False otherwise
    """
    global finish_num

    start = time.time()
    log.info('开始下载 key:%s' % key)
    # Stays None until an item is claimed, so the error handler below can
    # always format its message.
    data_id = None
    with lock:
        try:
            db = dbm.DbManager()
            session = db.get_session()
            one_data = session.query(
                model.Item).filter(model.Item.status == 0).first()
            if not one_data:
                log.info('获取数据失败 key: %s' % key)
                return False
            data_id = one_data.id
            data_url = one_data.url
            data_type = one_data.type
            data_name = one_data.blog_name
            data_time = one_data.post_time
            # Mark as claimed so other workers skip this item.
            one_data.status = 1
            db.add_data(one_data)
            log.info('获取数据完成 key: %s id: %s' % (key, str(data_id)))
        except Exception as e:
            log.error('id: %s key: %s 发生错误: %s' %
                      (str(data_id), str(key), str(e)))
            return False

    download_data = {
        'id': data_id,
        'url': data_url,
        'blog_name': data_name,
        'type': data_type,
        'time': data_time
    }
    md5_val = download_img(download_data, 1, log, thread_num, key, lock)

    db = dbm.DbManager()
    one_data = db.session.query(
        model.Item).filter(model.Item.id == data_id).first()
    if not md5_val:
        one_data.status = 2
        db.add_data(one_data)
        log.info('下载失败 key:%s id: %s' % (key, data_id))
        return False

    one_data.status = 3
    one_data.md5 = md5_val
    db.add_data(one_data)
    end = time.time()
    log.info('下载完毕 key:%s 用时: %s秒' % (key, int(end - start)))
    # The counter is shared by all workers; update it under the lock.
    with lock:
        finish_num = finish_num + 1
    progress.update(1)
    return True
Exemplo n.º 8
0
    os.mkdir('log')
# Log file name: "<this script's base name without '.py'>-<today's date>.log",
# placed inside the relative "log" directory.
log_file_name = '%s-%s.log' % (os.path.basename(__file__).replace(
    '.py', ''), datetime.date.today())
log_full_file_name = os.path.join('log', log_file_name)

# getLogger() is called without a name.
# NOTE(review): presumably logging.getLogger, i.e. the root logger - confirm
# against the imports.
log = getLogger()
# NOTE(review): the positional arguments look like (filename, mode, maxBytes,
# backupCount, encoding) - confirm against ConcurrentRotatingFileHandler.
rotateHandler = ConcurrentRotatingFileHandler(log_full_file_name, "a",
                                              512 * 1024, 0, 'utf-8')

# Each record is written as "[time - level - file - LINE:n] message".
datefmt_str = '%Y-%m-%d %H:%M:%S'
format_str = "[%(asctime)s - %(levelname)s - %(filename)s - LINE:%(lineno)d] %(message)s"
formatter = Formatter(format_str, datefmt_str)
rotateHandler.setFormatter(formatter)
log.addHandler(rotateHandler)
log.setLevel(INFO)
# Module-level database manager, created at import time.
db = dbm.DbManager()


def main():
    """
    Handle the command-line arguments and dispatch the requested action.

    Usage:
        python3 one.py xxx all    forced (full) update
        python3 one.py xxx new    incremental (append) update
    :return:
    """
    args = sys.argv
    # NOTE(review): presumably the actions accepted as the last argument
    # ('all' / 'new' per the usage above) - not used in the visible code.
    enable_action = ['all', 'new']

    # Disabled argument-count check, kept from the original:
    # if len(args) != 3:
    #     stop_and_log('error', '参数错误 args:%s' % str(args))
    #     return False
Exemplo n.º 9
0
def main():
    """
    Parse the command line, crawl one blog and record the outcome.

    Argument forms:
        <script>                   pick the enabled blog (status 1) with
                                   update_time 0 and the lowest id
        <script> <blog>            crawl <blog> with the default limit of 9
        <script> <blog> <limit>    crawl <blog> with an integer limit

    ``limit`` is passed to ``update()`` as its second argument.

    :return: False when the arguments are invalid, no blog can be selected,
        the blog is unknown or disabled, or ``update()`` raises; None
        otherwise
    """
    args = sys.argv
    # Logging and database setup.
    log = mylog.get_log_object()
    db = dbm.DbManager()
    default_limit = 9

    argc = len(args)
    if argc == 1:
        session = db.get_session()
        try:
            one = session.query(model.Blog).filter(
                model.Blog.update_time == 0,
                model.Blog.status == 1).order_by(model.Blog.id.asc()).first()
            blog_name = one.name if one else None
        finally:
            session.close()
        if blog_name is None:
            # No blog is waiting for its first update.
            stop_and_log('error', '没有待更新的博客', log)
            return False
        limit = default_limit
    elif argc == 2:
        blog_name = str(args[1])
        limit = default_limit
    elif argc == 3:
        blog_name = str(args[1])
        try:
            limit = int(args[2])
        except ValueError:
            stop_and_log('error', '参数错误 args:%s' % str(args), log)
            return False
    else:
        stop_and_log('error', '参数错误 args:%s' % str(args), log)
        return False

    exist = db.find(model.Blog, model.Blog.name == blog_name)
    if not exist:
        stop_and_log('error', '%s 不存在' % blog_name, log)
        return False
    if exist.status == 0:
        stop_and_log('error', '%s 已停用' % blog_name, log)
        return False

    def save_blog():
        """Persist the modified blog row, always closing the session."""
        session = db.get_session()
        try:
            session.add(exist)
            session.commit()
        finally:
            session.close()

    try:
        res_up = update(blog_name, limit, log)
    except Exception as e:
        print(e)
        return False

    if res_up == 1:
        # Success (True == 1): stamp the update time; a failure to save is
        # reported but does not change the outcome.
        try:
            exist.update_time = int(time.time())
            save_blog()
        except Exception as e:
            print('%s %s' % (blog_name, str(e)))
        print('更新成功')
    elif res_up == 0:
        print('更新失败')
    elif res_up == -1:
        # update() returned -1: disable the blog so it is not crawled again.
        exist.update_time = int(time.time())
        exist.status = 0
        save_blog()
        print('停止更新')
    else:
        print('异常 %s' % str(res_up))
    log.info('执行完毕')
Exemplo n.º 10
0
# Sentinel returned by _catch_html_once when the attempt should be repeated.
_CATCH_HTML_RETRY = object()


def _catch_html_once(blog_name, perpage, page, thread_log):
    """Run one fetch/parse/store attempt for a page.

    Returns True, False or -1 with the meaning documented on catch_html, or
    _CATCH_HTML_RETRY when the attempt failed in a way worth repeating.
    """
    thread_log.info('开始抓取 %s 第%s页数据' % (blog_name, page))
    start = (page - 1) * perpage
    url = 'https://%s.tumblr.com/api/read/json?start=%s&num=%s' % (
        blog_name, start, perpage)
    # Route the request through the local proxy. Both calls change
    # process-wide state (default socket timeout, global opener).
    socket.setdefaulttimeout(20)
    proxy = "https://127.0.0.1:1087"
    opener = request.build_opener(request.ProxyHandler({'https': proxy}))
    request.install_opener(opener)
    begin = time.time()
    try:
        content = request.urlopen(url).read().decode('UTF-8')
    except (http.client.IncompleteRead, socket.timeout) as ie:
        thread_log.info('url:%s 出错:%s 获取不完整,重试' % (url, str(ie)))
        return _CATCH_HTML_RETRY
    except error.HTTPError as e:
        if getattr(e, 'code', None) == 404:
            stop_and_log(
                'error', '[%s 第%s页 获取失败] url:%s; %s' %
                (blog_name, page, url, str(e)), thread_log)
            # 404 is reported as -1 so the caller can stop this blog.
            return -1
        thread_log.info('url:%s 出错:%s 获取获取失败,重试' % (url, str(e)))
        return _CATCH_HTML_RETRY
    except Exception as e:
        thread_log.info('url:%s 出错:%s 获取获取失败,重试' % (url, str(e)))
        return _CATCH_HTML_RETRY

    # Cut the first 22 and last 2 characters around the JSON payload and
    # turn single-quoted keys/values at object edges into double quotes.
    try:
        content = content[22:len(content) - 2].replace("{'", '{"').replace(
            "'}", '"}')
        content = content.replace(",'", ',"')
    except Exception as e:
        stop_and_log(
            'error',
            '[%s 第%s页 修整数据格式出错] url:%s; %s' % (blog_name, page, url, str(e)),
            thread_log)
        return False

    try:
        json_data = json.loads(content)
    except Exception as e:
        stop_and_log(
            'error', '[%s 第%s页 解析json数据失败] url:%s; %s;%s' %
            (blog_name, page, url, str(e), content), thread_log)
        return False

    # Resolve the media URL of every post before touching the database.
    post_list = []
    for post in json_data['posts']:
        post_type = post['type']
        item = {
            'id': post['id'],
            'blog_name': blog_name,
            'post_type': post_type,
            'unix_timestamp': post['unix-timestamp'],
            'img': '',
            'video': ''
        }
        if post_type == 'photo':
            res_img = analysis_img(post, thread_log)
            if not res_img:
                thread_log.info('获取图片都失败: %s' % json.dumps(post))
            item['img'] = res_img
        elif post_type == 'video':
            res_video = analysis_video(post, thread_log)
            if not res_video:
                thread_log.info('获取视频都失败: %s' % json.dumps(post))
            item['video'] = res_video
        post_list.append(item)

    try:
        db = dbm.DbManager()
        session = db.get_session()
        try:
            for post in post_list:
                if post['post_type'] == 'photo':
                    source_url = post['img']
                    post_type = 0
                elif post['post_type'] == 'video':
                    source_url = post['video']
                    post_type = 1
                else:
                    continue
                if not source_url:
                    # The media URL could not be resolved (already logged
                    # above); do not store an item without a URL.
                    continue
                exist = session.query(
                    model.Item).filter(model.Item.url == source_url).first()
                if not exist:
                    data = model.Item(url=source_url,
                                      blog_name=post['blog_name'],
                                      type=post_type,
                                      create_time=int(time.time()),
                                      post_time=post['unix_timestamp'],
                                      post_id=post['id'])
                    session.add(data)
                    session.commit()
        finally:
            # Release the session on success and on failure alike.
            session.close()
    except Exception as e:
        print(str(e))
        thread_log.info('[%s] 第%s页 插入数据库失败: %s' % (blog_name, page, str(e)))
        return _CATCH_HTML_RETRY

    end = time.time()
    thread_log.info('[%s] 第%s页处理完毕 用时%s秒' %
                    (blog_name, page, str(int(end - begin))))
    return True


def catch_html(blog_name,
               perpage=20,
               page=1,
               lock=None,
               thread_log=None,
               thread_num=0,
               max_retries=10):
    """
    Fetch one page of a blog's posts and store new photo/video items.

    Fetch errors other than HTTP 404 and database errors are retried, each
    retry repeating the whole fetch. Retries are bounded by ``max_retries``
    instead of recursing without limit.

    :param blog_name: blog name (sub-domain of tumblr.com)
    :param perpage: number of posts per page
    :param page: 1-based page number
    :param lock: unused; kept for call compatibility
    :param thread_log: logger; when omitted, the standard ``logging`` logger
        for this module is used
    :param thread_num: unused; kept for call compatibility
    :param max_retries: how many times a failed attempt is repeated
    :return: True when the page was processed, -1 on HTTP 404, False when
        the response could not be parsed or all attempts failed
    """
    if thread_log is None:
        import logging
        thread_log = logging.getLogger(__name__)

    for _ in range(max(0, max_retries) + 1):
        result = _catch_html_once(blog_name, perpage, page, thread_log)
        if result is not _CATCH_HTML_RETRY:
            return result

    thread_log.error('[%s 第%s页 获取失败] 重试%s次后放弃' %
                     (blog_name, page, max_retries))
    return False