Example #1
def api(sql=None, method='first'):
    try:
        data = None
        for case in switch(method):
            if case('first'):
                data = mysql_session.execute(sql).first()
                if data is not None:
                    data = dict(data)
                break
            if case('scalar'):
                data = mysql_session.execute(sql).scalar()
                break
            if case('fetchall'):
                data = mysql_session.execute(sql).fetchall()
                break
            if case('execute'):
                data = mysql_session.execute(sql)
                mysql_session.commit()
                data = data.lastrowid
                break

        # Log the SQL statement when debugging is enabled
        if Config.MYSQL_DEBUG:
            service_logger.warn("sql:api", {"sql": sql})

        return data
    except Exception as err:
        mysql_session.rollback()
        service_logger.error("sql:error", {
            "sql": sql,
            "data": traceback.format_exc()
        })

        return None
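
Note: switch is not a Python builtin. A minimal sketch of the iterator-based
switch/case recipe that Example #1 presumably relies on (names assumed from the
call sites above):

class switch(object):
    def __init__(self, value):
        self.value = value
        self.fall = False

    def __iter__(self):
        # Yield the match method once, then stop
        yield self.match

    def match(self, *args):
        # Enter a case suite on a match, or when falling through
        if self.fall or not args:
            return True
        if self.value in args:
            self.fall = True
            return True
        return False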
Example #2
def dytt_list(url=''):
    data = []

    if check_file(url, ext='.list'):
        html = read_file(url, ext='.list')
    else:
        html = get_url_html(url)
        write_file(url, html, ext='.list')

    if html == '':
        return data

    doc = pq(html)
    tables = doc('.co_content8 table').items()
    for tb in tables:
        txt = pq(tb)
        links = txt('.ulink').items()
        item = {}
        for link in links:
            href = pq(link).attr('href')
            if 'index.html' not in href:
                item['title'] = pq(link).text()
                item['link'] = 'http://www.ygdy8.net' + href

        # Skip tables that yielded no usable link
        if item:
            data.append(item)

    # Log the scraped list
    service_logger.warn(data=data)
    return data
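
A hypothetical call, assuming check_file, read_file, write_file, get_url_html
and service_logger are importable; the list URL is illustrative only:

items = dytt_list('http://www.ygdy8.net/html/gndy/dyzz/list_23_1.html')
# items -> [{'title': u'...', 'link': 'http://www.ygdy8.net/...'}, ...]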
Example #3
def toutiao_list(url=''):
    data = []

    if check_file(url, ext='.list'):
        html = read_file(url, ext='.list')
    else:
        cookie = 'UM_distinctid=165e23b8bd863a-02b6bf44638b1e-541b371f-100200-165e23b8bd9812; tt_webid=6601789411817768455; WEATHER_CITY=%E5%8C%97%E4%BA%AC; uuid="w:be3b8ee49353488b825ded5ccbcf16b3"; CNZZDATA1259612802=1933104973-1537094542-%7C1539087142; __tasessionId=qgp2gufge1539087164145; csrftoken=afc50bb8fb759393b3c1da8340182cd6; tt_webid=6601789411817768455'
        html = get_url_html(url, cookie)
        write_file(url, html, ext='.list')

    # Extract article URLs from the JSON response
    resu = json.loads(html)
    if 'data' in resu:
        for vo in resu['data']:
            if 'item_source_url' in vo and 'media_avatar_url' in vo:
                src = vo['item_source_url']
                if 'http' not in src and 'local//' not in src:
                    data.append({
                        'link': 'https://www.toutiao.com' + src,
                        'image': vo['media_avatar_url']
                    })
            elif 'source_url' in vo and 'image_url' in vo:
                src = vo['source_url']
                if 'http' not in src and 'local//' not in src:
                    data.append({
                        'link': 'https://www.toutiao.com' + src,
                        'image': vo['image_url']
                    })
    # Log the scraped list
    service_logger.warn(data=data)
    return data
Example #4
def tengxun_detail(url, links):
    print(json.dumps(links))

    cate = []
    if 'tech' in url:
        cate = ['科技']
    elif 'finance' in url:
        cate = ['财经']
    elif 'edu' in url:
        cate = ['教育']
    elif 'house' in url:
        cate = ['房产']
    elif 'visit' in url:
        cate = ['旅游']
    elif 'internet' in url or 'tcctit' in url or 'ai' in url:
        cate = ["互联网"]

    if len(links) > 0:
        for vo in links:
            # Skip links that have already been imported
            if ImportService.check_url(vo['link']):
                continue

            # Random delay between fetches
            tm = random.randint(4, 10)
            time.sleep(tm)

            try:
                page = Tengxun(vo['link'])
                # Fill in the category
                page.set_category(cate)
                data = page.get_content()
                if vo['image'] != '':
                    data['image'] = vo['image']
                # Protocol-relative image URLs need an 'http:' prefix
                if data['image'] != '' and data['image'][0:2] == '//':
                    data['image'] = 'http:' + data['image']

                # Log the parsed data
                service_logger.warn(data=data)
                if data['send_time'] == '' or data['title'] == '':
                    continue

                # Save the data
                ImportService.insert_handle(data)
                # break
            except Exception as err:
                service_logger.error("tengxun-exception", {
                    "msg": traceback.format_exc(),
                    "link": vo['link']
                })

            # Delete the cached article page
            delete_file(vo['link'])

        # Delete the cached list page
        delete_file(url, ext='.list')
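
How the list and detail steps presumably fit together, with tengxun_list from
Example #7 below; the channel URL is illustrative only:

url = 'https://new.qq.com/ch/tech/'
links = tengxun_list(url)    # scrape the channel, or re-read the cached list
tengxun_detail(url, links)   # fetch, parse and save each article, then clean up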
Example #5
def toutiao_detail(url, links):
    print(json.dumps(links))

    cate = []
    if 'news_baby' in url:
        cate = ['教育']
    elif 'news_travel' in url:
        cate = ['旅游']
    elif '人工智能' in url or '大数据' in url:
        cate = ['技术']

    if len(links) > 0:
        for vo in links:
            # Skip links that have already been imported
            if ImportService.check_url(vo['link']):
                continue

            # Random delay between fetches
            tm = random.randint(4, 10)
            time.sleep(tm)

            try:
                page = Toutiao(vo['link'])
                # Fill in the category
                if len(cate) > 0:
                    page.set_category(cate)

                data = page.get_content()
                if vo['image'] != '':
                    data['image'] = vo['image']
                # Protocol-relative image URLs need an 'http:' prefix
                if data['image'] != '' and data['image'][0:2] == '//':
                    data['image'] = 'http:' + data['image']

                # Log the parsed data
                service_logger.warn(data=data)
                if data['send_time'] == '' or data['title'] == '':
                    continue

                # Save the data
                ImportService.insert_handle(data)
                # break
            except Exception as err:
                service_logger.error("toutiao-exception", {
                    "msg": traceback.format_exc(),
                    "link": vo['link']
                })

            # Delete the cached article page
            delete_file(vo['link'])

        # Delete the cached list page
        delete_file(url, ext='.list')
Example #6
def dytt_detail(url, links):
    print(json.dumps(links))

    cate = []
    if 'jddy' in url:
        cate = ['综合电影']
    elif 'oumei' in url:
        cate = ['欧美电影']
    elif 'china' in url:
        cate = ['国内电影']
    elif 'rihan' in url:
        cate = ['日韩电影']
    elif 'dyzz' in url:
        cate = ['最新电影']

    if len(links) > 0:
        for vo in links:
            print(vo['link'])
            # Skip links that have already been imported
            if ImportService.check_url(vo['link']):
                continue

            # Random delay between fetches
            tm = random.randint(4, 10)
            time.sleep(tm)

            try:
                page = Dytt(vo['link'])
                # Fill in the category
                page.set_category(cate)

                data = page.get_content(flag=False)
                # Log the parsed data
                service_logger.warn(data=data)
                if data['send_time'] == '' or data['title'] == '':
                    continue

                # Save the data
                ImportService.insert_handle(data, 'video')
                # break

            except Exception as err:
                service_logger.error("dytt-exception", {
                    "msg": traceback.format_exc(),
                    "link": vo['link']
                })

            # Delete the cached article page
            delete_file(vo['link'])

        # Delete the cached list page
        delete_file(url, ext='.list')
Example #7
def tengxun_list(url=''):
    data = []

    if check_file(url, ext='.list'):
        html = read_file(url, ext='.list')
    else:
        html = get_url_html(url)
        # The page is served as GBK; convert to UTF-8 (Python 2 idiom)
        html = unicode(html, 'GBK').encode('UTF-8')
        write_file(url, html, ext='.list')

    res = re.findall(r'window\.chData={(.*?)};', html, re.S)
    if len(res) > 0:
        json_str = '{' + res[0] + '}'
        arrs = json.loads(json_str)
        for vo in arrs['data']:
            dt = {'link': vo['url'], 'image': vo['img']}
            data.append(dt)
    # Log the scraped list
    service_logger.warn(data=data)
    return data
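
The window.chData extraction above can be exercised on a miniature page; a
self-contained sketch (the sample HTML is made up):

import json
import re

html = 'window.chData={"data": [{"url": "https://example.com/a", "img": "//img.example.com/a.jpg"}]};'
res = re.findall(r'window\.chData={(.*?)};', html, re.S)
if res:
    arrs = json.loads('{' + res[0] + '}')
    print([vo['url'] for vo in arrs['data']])  # ['https://example.com/a']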
Example #8
class ImportService(object):
    @staticmethod
    def check_url(url):
        # True when the URL has already been imported
        return bool(PostsModel.check(url))

    @staticmethod
    def get_douban_image(name, w=480, h=320):
        image = ''
        url = 'https://www.baidu.com/s?ie=utf-8&f=8&rsv_bp=1&rsv_idx=1&tn=baidu&wd=' + name
        html = get_url_html(url)

        doc = pq(html)
        tables = doc('.c-container').items()
        i = 0
        for tb in tables:
            i = i + 1
            txt = pq(tb)
            title = txt.text()
            imgObj = txt('img')
            if name in title:
                image = imgObj.attr('src')
                break
            if i > 8:
                break

        if image != '':
            service_logger.log('Baidu image search result: ' + image)
            image = ImportService.upload_image(image, iscut=False, w=w, h=h)

        return image

    @staticmethod
    def upload_image(image, iscut=False, w=300, h=200):
        if image == '':
            return

        # Timestamped random basename to avoid collisions
        fname = time.strftime("%Y%m%d%H%M%S", time.localtime()) + '_' + str(
            random.randint(10000, 99999))
        subs = image.split('/')[-1]
        exts = subs.split('.')
        ext = 'jpg'
        if len(exts) > 1:
            ext = exts[-1]

        filename = fname + '.' + ext
        y = time.strftime("%Y", time.localtime())
        m = time.strftime("%m", time.localtime())
        filepath = Config.IMAGE_PATH + '/' + y + '/' + m
        if not os.path.isdir(filepath):
            os.makedirs(filepath, 0o775)

        newfile = filepath + '/' + filename
        oldfile = Config.DIR_PATH + filename

        try:
            # Download and save the original image
            headers = {
                "User-Agent":
                "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.82 Safari/537.36"
            }
            response = requests.get(image, headers=headers)
            if response.status_code != 200:
                return ''
            if '<!DOCTYPE' in response.content or '<iframe' in response.content:
                return ''
            cat_img = response.content
            with open(oldfile, "wb") as f:
                f.write(cat_img)
        except Exception as err:
            service_logger.error("task-exception", {
                "msg": traceback.format_exc(),
                "image": image
            })
            return ''

        if iscut:
            # Save a cropped copy
            with open(oldfile, 'rb') as f:
                with Image.open(f) as img:
                    print(img.size)
                    if img.size[0] > w or img.size[1] > h:
                        cover = resizeimage.resize_cover(img, [w, h])
                        cover.save(newfile, img.format)
                    else:
                        with open(newfile, 'wb') as fo:
                            fo.write(cat_img)
        else:
            # Save the image unmodified
            with open(newfile, 'wb') as f:
                f.write(cat_img)

        # Log the image paths
        service_logger.warn(data={
            "image": image,
            'old': oldfile,
            'new': newfile
        })

        # Remove the original download
        os.remove(oldfile)

        return y + '/' + m + '/' + filename
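
A hypothetical call, assuming Config.IMAGE_PATH and Config.DIR_PATH point at
writable directories; the image URL is made up:

path = ImportService.upload_image(
    'https://example.com/cover.jpg', iscut=True, w=480, h=320)
# path -> '2018/10/20181009153000_12345.jpg', relative to Config.IMAGE_PATH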