Exemplo n.º 1
0
    def __open_next_page(self):
        '''
        @summary: Decide the next page the webview should visit: the next
            queued article url, or (when the queue is empty) the history page
            of the next public account.
        ---------
        @param __biz:
        @param pass_ticket:
        @param appmsg_token:
        @param offset:
        ---------
        @result: an HTML snippet whose injected JS redirects the webview to
            the chosen url after the computed sleep time (milliseconds).
        '''
        is_done = False  # whether one full round over all accounts is finished
        is_all_done = False  # whether everything is done (all articles published today by every account are collected)

        if WechatAction._todo_urls:
            url = WechatAction._todo_urls.popleft()
        else:
            # finished one account: persist its article counts
            WechatAction._wechat_service.update_account_article_num(
                WechatAction._current_account_biz)

            # move on to the next account
            account_id, __biz, is_done, is_all_done = WechatAction._wechat_service.get_next_account(
            )
            WechatAction._account_info[__biz] = account_id or ''

            # url = 'http://mp.weixin.qq.com/mp/getmasssendmsg?__biz=%s#wechat_webview_type=1&wechat_redirect'%__biz
            url = 'https://mp.weixin.qq.com/mp/profile_ext?action=home&__biz=%s&scene=124#wechat_redirect' % __biz
            log.debug('''
                下一个公众号 : %s
                ''' % url)

        # choose the delay before the injected JS auto-redirects
        if is_all_done:  # today's articles are all crawled; resume the next day
            # sleep until the next day
            sleep_time = self.get_next_day_time_interval()
        elif is_done:  # one round finished; take a longer break
            sleep_time = self.get_wait_time()
        elif ONLY_TODAY_MSG and tools.get_current_date(
        ) < tools.get_current_date(
                "%Y-%m-%d"
        ) + ' ' + SPIDER_START_TIME:  # only today's articles are wanted and it is before the configured start time: rest, accounts rarely publish in the small hours
            sleep_time = self.get_spider_start_time_interval()
        else:  # one article done; wait a short interval
            sleep_time = self.get_sleep_time()

        log.debug('''
            next_page_url : %s
            is_done:        %s
            is_all_done:    %s
            sleep_time:     %s
            next_start_time %s
            ''' % (url, is_done, is_all_done,
                   tools.seconds_to_h_m_s(sleep_time / 1000),
                   tools.timestamp_to_date(tools.get_current_timestamp() +
                                           sleep_time / 1000)))
        next_page = "<script>setTimeout(function(){window.location.href='%s';},%d);</script>" % (
            url, sleep_time)
        return next_page
Exemplo n.º 2
0
    def sign_account_is_zombie(self, __biz, last_publish_time=None):
        # Mark the account identified by __biz as a "zombie" (no longer
        # publishing). Always refreshes last_spider_time; also records the
        # last publish time when one is supplied.
        # NOTE(review): the SQL is assembled with str.format — if __biz or
        # last_publish_time can ever contain quotes this is SQL-injectable;
        # prefer parameterized queries if self._mysqldb supports them.
        if last_publish_time:
            sql = 'update wechat_account_task set last_publish_time = "{}", last_spider_time="{}", is_zombie=1 where __biz="{}"'.format(
                last_publish_time, tools.get_current_date(), __biz)
        else:
            sql = 'update wechat_account_task set last_spider_time="{}", is_zombie=1 where __biz="{}"'.format(
                tools.get_current_date(), __biz)

        self._mysqldb.update(sql)
Exemplo n.º 3
0
    def __open_next_page(self):
        '''
        @summary: Decide the next page the webview should visit: the next
            queued article url (running any queued completion callbacks on
            the way), or the history page of the next public account.
        ---------
        @param __biz:
        @param pass_ticket:
        @param appmsg_token:
        @param offset:
        ---------
        @result: an HTML snippet whose injected JS redirects the webview to
            the chosen url after the computed sleep time (milliseconds).
        '''
        is_done = False # whether one full round over all accounts is finished
        url = None

        while WechatAction._todo_urls:
            result = WechatAction._todo_urls.popleft()
            if callable(result): # a queued callback marking an account as finished
                result() # run the callback
            else:
                url = result
                break

        if not url:
            # jump to the next account
            account = WechatAction._wechat_service.get_next_account()
            if account:
                account_id, __biz = account
                WechatAction._account_info[__biz] = account_id or ''

                url = 'https://mp.weixin.qq.com/mp/profile_ext?action=home&__biz=%s&scene=124#wechat_redirect'%__biz
                log.debug('''
                    下一个公众号 : %s
                    '''%url)
            else:
                is_done = True

        # choose the delay before the injected JS auto-redirects
        if is_done: # one round finished; take a longer break
            sleep_time = self.get_wait_time()
        elif ONLY_TODAY_MSG and tools.get_current_date() < tools.get_current_date("%Y-%m-%d") + ' ' + SPIDER_START_TIME: # only today's articles are wanted and it is before the configured start time: rest, accounts rarely publish in the small hours
            sleep_time = self.get_spider_start_time_interval()
        else: # one article done; wait a short interval
            sleep_time = self.get_sleep_time()

        tip_sleep_time = tools.seconds_to_h_m_s(sleep_time / 1000)
        tip_next_start_time = tools.timestamp_to_date(tools.get_current_timestamp() + sleep_time / 1000)
        if not url:
            # no work left: show the local "waiting" tip page instead
            url = 'http://localhost:6210/tip/wait?sleep_time={}&next_start_time={}'.format(tip_sleep_time, tip_next_start_time)

        log.debug('''
            next_page_url : %s
            is_done:        %s
            sleep_time:     %s
            next_start_time %s
            '''%(url, is_done, tip_sleep_time, tip_next_start_time))
        next_page = "休眠 %s 下次刷新时间 %s<script>setTimeout(function(){window.location.href='%s';},%d);</script>"%(tip_sleep_time, tip_next_start_time, url, sleep_time)
        return next_page
Exemplo n.º 4
0
    def update_account_article_num(self, __biz):
        """Recount an account's articles in ES and push the numbers to Oracle.

        Counts today's articles (by record_time) and the all-time total for
        the given __biz, then updates TAB_IOPM_SITE accordingly.
        """
        day = tools.get_current_date('%Y-%m-%d')

        # count of articles recorded today
        today_query = {
            "size": 0,
            "query": {
                "filtered": {
                    "filter": {
                        "range": {
                            "record_time": {
                                "gte": day + ' 00:00:00',
                                "lte": day + ' 23:59:59',
                            }
                        }
                    },
                    "query": {"match": {"__biz": __biz}},
                }
            },
        }
        today_result = WechatService._es.search('wechat_article', today_query)
        today_msg = today_result.get('hits', {}).get('total', 0)

        # all-time message count
        total_query = {
            "size": 0,
            "query": {"filtered": {"query": {"match": {"__biz": __biz}}}},
        }
        total_result = WechatService._es.search('wechat_article', total_query)
        total_msg = total_result.get('hits', {}).get('total', 0)

        if total_msg:
            sql = "update TAB_IOPM_SITE t set t.today_msg = %d, t.total_msg = %d where t.biz = '%s'" % (
                today_msg, total_msg, __biz)
        else:
            sql = "update TAB_IOPM_SITE t set t.today_msg = %d where t.biz = '%s'" % (
                today_msg, __biz)
        print(sql)
        WechatService._db.update(sql)
Exemplo n.º 5
0
        def parse_article_info(article_info, release_time):
            # Parse one article entry from the message list, cache its
            # metadata, and queue its url; stops further fetching (via
            # self._is_need_get_more) once a known or too-old article appears.
            if not article_info:
                return

            # log.debug(tools.dumps_json(article_info))
            title = article_info.get('title')
            summary = article_info.get('digest')
            url = article_info.get('content_url').replace('\\', '').replace(
                'amp;', '')
            source_url = article_info.get('source_url').replace('\\',
                                                                '')  # url of the quoted/source article
            cover = article_info.get('cover').replace('\\', '')
            author = article_info.get('author')
            if url and url.startswith(
                    'http://mp.weixin.qq.com/'
            ):  # articles deleted by the publisher have no url/other info (no mid, nothing to store); also skips mall-type urls
                mid = tools.get_param(url, 'mid') or tools.get_param(
                    url, 'appmsgid')  # image-text message id; articles published the same day share one id
                idx = tools.get_param(url, 'idx') or tools.get_param(
                    url, 'itemidx')  # position within the message, starting at 1
                article_id = mid + idx  # mid+idx (string concat) uniquely identify one article, e.g. mid=2650492260, idx=1 -> 26504922601

                # skip if the article is already stored, or (today-only mode) older than today
                if WechatAction._wechat_service.is_exist(
                        'wechat_article',
                        article_id) or (ONLY_TODAY_MSG and release_time <
                                        tools.get_current_date('%Y-%m-%d')):
                    self._is_need_get_more = False
                    return  # stop here; discard the remaining (older) articles

                __biz = tools.get_param(url, '__biz')  # links the article to its account

                # cache the article record
                WechatAction._article_info[article_id] = {
                    'article_id': int(article_id),
                    'title': title,
                    'summary': summary,
                    'release_time': release_time,
                    'url': url,
                    'source_url': source_url,
                    'cover': cover,
                    'account': '',
                    'author': author,
                    '__biz': __biz,
                    'read_num': None,
                    'like_num': None,
                    'content': '',
                    'comment': [],
                    'record_time': tools.get_current_date()
                }

                # queue the article url for crawling
                WechatAction._todo_urls.append(url)
Exemplo n.º 6
0
    def __parse_account_info(self, data, req_url):
        """
        Scrape the public-account profile fields out of the profile/history
        page HTML and hand them to the data pipeline.

        :param data: raw HTML of the account's profile/history page
        :param req_url: the requested url; carries the __biz parameter
        """
        __biz = tools.get_param(req_url, "__biz")

        # account nickname
        regex = 'id="nickname">(.*?)</strong>'
        account = tools.get_info(data, regex, fetch_one=True).strip()

        # avatar image
        regex = 'profile_avatar">.*?<img src="(.*?)"'
        head_url = tools.get_info(data, regex, fetch_one=True)

        # profile description
        regex = 'class="profile_desc">(.*?)</p>'
        summary = tools.get_info(data, regex, fetch_one=True).strip()

        # verification info (absent when the history page was opened from a followed account)
        regex = '<i class="icon_verify success">.*?</i>(.*?)</span>'
        verify = tools.get_info(data, regex, fetch_one=True)
        verify = verify.strip() if verify else ""

        # QR code: derived from the username embedded in the page's JS
        regex = 'var username = "" \|\| "(.*?)";'  # the || needs escaping
        qr_code = tools.get_info(data, regex, fetch_one=True)
        # FIXME(review): the following line was corrupted in the original
        # source; reconstructed from context as username -> QR-code endpoint.
        qr_code = "http://open.weixin.qq.com/qr/code?username=" + qr_code

        account_data = {
            "__biz": __biz,
            "account": account,
            "head_url": head_url,
            "summary": summary,
            "qr_code": qr_code,
            "verify": verify,
            "spider_time": tools.get_current_date(),
        }

        if account_data:
            data_pipeline.save_account(account_data)
Exemplo n.º 7
0
    def check_new_article(self, account):
        """Detect whether an account published a new article today; if so,
        mark it in Oracle and enqueue it in redis for the wechat spider.

        :param account: (oracle_id, account_id, account_name,
                         last_release_time, biz)
        """
        oracle_id, account_id, account_name, last_release_time, biz = account

        release_time = self._wechat_sogo.get_article_release_time(
            account_id=account_id, account=account_name)
        print(release_time)

        if not release_time:
            return

        last_release_time = last_release_time or ''
        published_today = release_time >= tools.get_current_date('%Y-%m-%d')
        if published_today and release_time > last_release_time:
            print('{} 有新文章发布,等待抓取。 发布时间:{}'.format(account_name,
                                                   release_time))

            sql = '''
                    update TAB_IOPM_SITE t set t.spider_status = 601,
                     t.last_article_release_time =
                           to_date('{}', 'yyyy-mm-dd hh24:mi:ss')
                     where id = {}
                '''.format(release_time, oracle_id)

            # each worker thread must hold its own DB connection
            oracledb = OracleDB()
            oracledb.update(sql)
            oracledb.close()

            # enqueue into redis as a task for the wechat spider
            self._redisdb.sadd(
                'wechat:account',
                (oracle_id, account_id, account_name, last_release_time, biz))
Exemplo n.º 8
0
def add_website_info(table, site_id, url, name, domain = '', ip = '', address = '', video_license = '', public_safety = '', icp = ''):
    '''
    @summary: Add a website-info record to mongodb.
    ---------
    @param table: table (collection) name
    @param site_id: website id
    @param url: website url
    @param name: website name
    @param domain: domain (NOTE: currently ignored — recomputed from url below)
    @param ip: server ip
    @param address: server location
    @param video_license: online audio/video license number
    @param public_safety: public-security filing number
    @param icp: ICP filing number
    ---------
    @result:
    '''

    # Derive domain (and, eventually, ip/address/licenses) programmatically.
    # NOTE(review): this unconditionally overwrites the `domain` argument,
    # making that parameter dead — confirm callers before honoring it instead.
    domain = tools.get_domain(url)

    site_info = {
        'site_id':site_id,
        'name':name,
        'domain':domain,
        'url':url,
        'ip':ip,
        'address':address,
        'video_license':video_license,
        'public_safety':public_safety,
        'icp':icp,
        'read_status':0,
        'record_time': tools.get_current_date()
    }
    mongodb.add(table, site_info)
Exemplo n.º 9
0
    def is_have_new_article(self, account_id='', account=''):
        '''
        @summary: Check whether the account has published an article today.
        ---------
        @param account_id:
        @param account:
        ---------
        @result: constance.UPDATE / NOT_UPDATE / ERROR / VERIFICATION_CODE
        '''

        block = self.__get_account_blocks(account_id, account)
        if block == constance.VERIFICATION_CODE:
            return constance.VERIFICATION_CODE

        # the latest publish timestamp is embedded as timeConvert('NNNNNNNNNN')
        regex = "timeConvert\('(\d*?)'\)"
        raw_timestamp = tools.get_info(block, regex, fetch_one=True)
        if not raw_timestamp:
            return constance.ERROR

        publish_date = tools.timestamp_to_date(int(raw_timestamp))
        log.debug("最近发文时间 %s" % publish_date)

        today = tools.get_current_date('%Y-%m-%d')
        return constance.UPDATE if publish_date >= today else constance.NOT_UPDATE
Exemplo n.º 10
0
    def get_article(self):
        '''
        @summary: Fetch today's articles from ES via its SQL endpoint.
            Currently filters by record_time to guarantee data; should really
            use release_time. TODO
        ---------
        ---------
        @result: list of ES hit dicts (possibly empty)
        '''

        last_record_time = self.get_per_record_time()
        today_time = tools.get_current_date('%Y-%m-%d')

        # Build the WHERE clause: always bound to today's release_time window,
        # plus an incremental record_time checkpoint when one exists.
        conditions = []
        if last_record_time:
            conditions.append("record_time > '{}'".format(last_record_time))
        conditions.append("release_time >= '{} 00:00:00'".format(today_time))
        conditions.append("release_time <= '{} 23:59:59'".format(today_time))

        sql = "select * from {} where {} order by record_time".format(
            self._table, ' and '.join(conditions))

        url = 'http://{address}/_sql?sql={sql}'.format(address=ADDRESS,
                                                       sql=sql)
        log.debug(url)

        article = tools.get_json_by_requests(url)
        return article.get('hits', {}).get('hits', [])
    def image_predict(self, image_url):
        """Run the porn-detection model on an image.

        Remote images are downloaded to a temp jpg first (and deleted after
        prediction); local non-jpg images are converted to jpg via ffmpeg.
        Returns -1 when no url is given, otherwise the model's result.
        """
        if not image_url:
            return -1

        temp_path = None  # downloaded copy to remove after prediction

        if image_url.startswith('http'):
            # remote image: download, predict on the local copy, then delete it
            local_path = TEMP_IMAGE_SAVE_PATH + tools.get_current_date(
                date_format='%Y%m%d%H%M%S.%f') + '.jpg'
            if tools.download_file(image_url, local_path):
                image_url = local_path
            temp_path = local_path
        elif not image_url.endswith('jpg'):
            # local image in another format: convert to jpg first
            converted = image_url[:image_url.rfind('.')] + '.jpg'
            if ffmpeg_manager.convert_file_format(image_url, converted):
                image_url = converted

        result = ImagePornRecg.__image_porn_dll.Pic_Predict(
            image_url, self._porn_image_index)

        if temp_path:
            tools.del_file(temp_path)

        return result
Exemplo n.º 12
0
def get_release_time(mblog):
    """
    Normalize a weibo entry's 'created_at' field into an absolute date string.

    Handles the relative Chinese forms "今天" (today), "昨天" (yesterday),
    "N小时前" (N hours ago), "N分钟前" (N minutes ago) and "刚刚" (just now);
    short strings without a year get the current year prefixed.

    @param mblog: weibo dict containing a 'created_at' field
    @result: a date string, or '' if anything goes wrong
    """
    try:
        release_time = mblog['created_at']
        now = tools.time.time()
        today_str = tools.time.strftime("%Y-%m-%d", tools.time.localtime(now))
        if tools.re.compile('今天').findall(release_time):
            release_time = release_time.replace('今天', '%s' % today_str)
        elif tools.re.compile('昨天').findall(release_time):
            today = datetime.date.today()
            yesterday = today - datetime.timedelta(days=1)
            release_time = release_time.replace('昨天', '%s' % yesterday)
        elif '小时前' in release_time:
            nhours = tools.re.compile('(\d+)小时前').findall(release_time)
            hours_ago = (tools.datetime.datetime.now() -
                         tools.datetime.timedelta(hours=int(nhours[0])))
            release_time = hours_ago.strftime("%Y-%m-%d %H:%M")
        elif tools.re.compile('分钟前').findall(release_time):
            nminutes = tools.re.compile('(\d+)分钟前').findall(release_time)
            minutes_ago = (tools.datetime.datetime.now() -
                           tools.datetime.timedelta(minutes=int(nminutes[0])))
            release_time = minutes_ago.strftime("%Y-%m-%d %H:%M")
        elif tools.re.compile('刚刚').findall(release_time):
            release_time = tools.get_current_date()
        else:
            if len(release_time) < 10:
                release_time = '%s-%s' % (today_str[0:4], release_time)
    except Exception:
        # was a bare `except:`; Exception keeps SystemExit/KeyboardInterrupt alive
        release_time = ''
    # was `return` inside `finally`, which silently discarded in-flight exceptions
    return release_time
Exemplo n.º 13
0
def add_net_program(rank, rank_wave, url, name, video_id, image_url,
                    mini_summary, episode_msg, today_play_count,
                    total_play_count, director, classify, institution,
                    release_year, description, actor, score, video_type,
                    net_source):
    '''
    @summary: Save one online-program (net program) record into ES,
        keyed by video_id.
    ---------
    @param rank: current ranking position
    @param rank_wave: ranking movement
    @param url:
    @param name: program name
    @param video_id: unique program id (also used as the ES document id)
    @param image_url:
    @param mini_summary: short summary, stored as keywords
    @param episode_msg: episode description
    @param today_play_count: today's play count
    @param total_play_count: all-time play count
    @param director:
    @param classify:
    @param institution: producing institution
    @param release_year:
    @param description:
    @param actor:
    @param score:
    @param type: program type — movie 1, TV series 2, variety show, etc.
    @param net_source: source platform, e.g. iQiyi
    ---------
    @result:
    '''

    program = {
        'rank': rank,
        'rank_wave': rank_wave,
        'url': url,
        'program_name': name,
        'image_url': image_url,
        'keywords': mini_summary,
        'episode': episode_msg,
        # NOTE(review): 'play_count_total' holds today's count while
        # 'total_play_count' holds the all-time count — the key naming looks
        # swapped; confirm downstream consumers before changing.
        'play_count_total': today_play_count,
        'total_play_count': total_play_count,
        'director': director,
        'classify': classify,
        'institution': institution,
        'release_year': release_year,
        'description': description,
        'actor': actor,
        'score': score,
        'type': video_type,
        'net_source': net_source,
        'record_time': tools.get_current_date(),
        'is_setmenu': 0,
        'baidu_score': None,
        'up_count': None,
        'collect': 0,
        'sensitive': 0,
        'program_id': video_id
    }

    es.add('tab_mms_net_program', program, video_id)
def add_wechat_account_info(table, site_id, name, account_id, account_url,
                            image_url, local_image_url, article_count, summary,
                            certification, is_verified, barcode_url,
                            local_barcode_url):
    """Insert a wechat-account record; on duplicate, fall back to an update.

    The update path strips db-generated/refresh-only fields so an existing
    record's review state is not clobbered.
    (Note: site_id is currently accepted but not stored — kept for interface
    compatibility.)
    """
    account_info = {
        'name': name,
        'account_id': account_id,
        'account_url': account_url,
        'image_url': image_url,
        'local_image_url': local_image_url,
        'article_count': article_count,
        'summary': summary,
        'certification': certification,
        'is_verified': is_verified,
        'barcode_url': barcode_url,
        'local_barcode_url': local_barcode_url,
        'read_status': 0,
        'record_time': tools.get_current_date(),
        'sexy_image_url': local_image_url,
        'sexy_image_status': '',
        # 'pron' spelling is established in the stored data — do not "fix"
        'image_pron_status': 0
    }

    if not db.add(table, account_info):
        # db.add may have injected '_id' into the dict on the failed insert;
        # pop with a default so a missing key cannot raise KeyError here.
        account_info.pop('_id', None)
        account_info.pop('image_pron_status', None)
        account_info.pop('sexy_image_status', None)
        account_info.pop('sexy_image_url', None)
        db.update(table,
                  old_value={'account_id': account_id},
                  new_value=account_info)
Exemplo n.º 15
0
def save_video_info(release_time='',
                    content='',
                    url='',
                    author='',
                    title='',
                    image_url='',
                    site_name='',
                    play_count=None,
                    comment_count=None,
                    praise_count=None,
                    summary='',
                    time_length=None):
    """Persist one video record into the 'video_news' ES index.

    The document id is a uuid derived from (title, domain), so re-saving the
    same video overwrites the previous document.
    """
    domain = tools.get_domain(url)
    uuid = tools.get_uuid(title, domain)

    content_info = dict(
        domain=domain,
        uuid=uuid,
        site_name=site_name,
        image_url=image_url,
        title=title,
        author=author,
        url=url,
        content=content,
        release_time=tools.format_date(release_time),
        play_count=play_count,
        comment_count=comment_count,
        praise_count=praise_count,
        time_length=time_length,
        record_time=tools.get_current_date(),
        summary=summary,
    )
    log.debug(tools.dumps_json(content_info))

    es.add('video_news', content_info, uuid)
Exemplo n.º 16
0
    def set_cookie_un_available(self, cookie):
        '''
        @summary: Mark a cookie as unavailable: drop it from the in-memory
            pool and flag it in the sqlite table.
        ---------
        @param cookie: (id, cookie, un_available_times)
        ---------
        @result:
        '''
        if not cookie: return

        try:
            # remove from the in-memory list (raises ValueError if absent,
            # which the broad handler below logs)
            self._cookies.remove(cookie)

            # update the database
            sql = '''
                update sogou_cookies set
                  is_available = 0,
                  un_available_time = '%s',
                  un_available_times = un_available_times + 1
                where id = %d
                '''%(tools.get_current_date(), cookie[0])

            self._sqlite3db.update(sql)

        except Exception as e:
            log.error(e)
Exemplo n.º 17
0
def add_url(table,
            site_id='',
            url='',
            depth=0,
            remark='',
            status=Constance.TODO,
            title='',
            origin='',
            domain='',
            retrieval_layer=0,
            image_url='',
            release_time=''):
    """Insert one url record into *table*; returns whatever db.add returns."""
    record = dict(
        site_id=site_id,
        url=url,
        depth=depth,
        remark=remark,
        status=status,
        title=title,
        origin=origin,
        release_time=release_time,
        domain=domain,
        record_time=tools.get_current_date(),
        image_url=image_url,
        retrieval_layer=retrieval_layer,
    )
    return db.add(table, record)
def add_wp_content_episode_info(table,
                                title='',
                                image_url='',
                                video_url='',
                                watched_count='',
                                play_length='',
                                comments_count='',
                                release_time='',
                                content_id='',
                                data_type=''):
    '''
    @summary: Insert one episode record of a content item into *table*.
    ---------
    @param table: table (collection) name
    @param title:
    @param image_url:
    @param video_url:
    @param watched_count:
    @param play_length:
    @param comments_count: stored under the singular key 'comment_count'
    @param release_time:
    @param content_id: id of the parent content item
    @param data_type:
    ---------
    @result:
    '''
    wp_content_episode_info_dict = {
        'content_id': content_id,
        'image_url': image_url,
        'title': title,
        'video_url': video_url,
        'watched_count': watched_count,
        'play_length': play_length,
        'comment_count': comments_count,
        'release_time': release_time,
        # 'pron' spelling is established in the stored data — do not "fix"
        'image_pron_status': 0,
        'record_time': tools.get_current_date(),
        'data_type': data_type,
        'read_status': 0
    }
    db.add(table, wp_content_episode_info_dict)
def add_appsite_info(table,
                     site_id,
                     url,
                     name,
                     app_url='',
                     summary='',
                     update_info='',
                     author='',
                     image_url='',
                     classify='',
                     size='',
                     tag='',
                     platform='android',
                     download_count='',
                     release_time=''):
    '''
    @summary: Add an app-site record.
    ---------
    @param table: table (collection) name
    @param site_id: website id
    @param url: page url
    @param name: app name
    @param app_url: app download url
    @param summary: description
    @param update_info: changelog / update notes
    @param author: developer
    @param image_url: icon url
    @param classify: category
    @param size: app size
    @param tag: version tag
    @param platform: platform, default android
    @param download_count: number of downloads
    @param release_time: release time
    ---------
    @result:
    '''

    app_info = {
        'site_id': site_id,
        'url': url,
        'name': name,
        'app_url': app_url,
        'summary': summary,
        'update_info': update_info,
        'author': author,
        'image_url': image_url,
        'classify': classify,
        'size': size,
        'tag': tag,
        'platform': platform,
        'download_count': download_count,
        'release_time': release_time,
        'read_status': 0,
        'record_time': tools.get_current_date(),
        'sexy_image_status': '',
        'sexy_image_url': '',
        # 'pron' spelling is established in the stored data — do not "fix"
        'image_pron_status': 0
    }

    db.add(table, app_info)
Exemplo n.º 20
0
def add_root_url(keywords):
    """Seed video records from Sina video search for each keyword.

    Walks result pages 1..9 per keyword and saves every video released
    today; stops a keyword as soon as an older item appears (results are
    assumed newest-first).
    """
    log.debug('''
        添加根url
        parser_params : %s
        ''' % str(keywords))
    for keyword in keywords:
        next_keyword = False
        for page_index in range(1, 10):
            url = 'http://so.video.sina.com.cn/interface/s?from=video&wd=%s&s_id=w00001&p=%s&n=20&s=1' \
                  % (keyword, page_index)
            info_json = tools.get_json_by_requests(url)
            # Defensive: the request may fail (None) or the payload may lack
            # 'list' — previously info_json['list'] crashed the whole run.
            video_info_list = info_json.get('list') if info_json else None
            if not video_info_list:
                print(url)
                break
            for video_info in video_info_list:
                image_url = video_info['thumburl']
                title = tools.del_html_tag(video_info['videoname'])
                url = video_info['url']
                release_time = video_info['showtime']
                current_date = tools.get_current_date('%Y-%m-%d')
                if current_date > release_time:
                    # older than today: skip the rest of this keyword
                    next_keyword = True
                    break
                base_parser.save_video_info(image_url=image_url,
                                            url=url,
                                            title=title,
                                            release_time=release_time,
                                            site_name=NAME)
            if next_keyword:
                break
Exemplo n.º 21
0
    def deal_comment(self, req_url, text):
        """
        Parse an article's comment payload and persist it.
        :param req_url: request url carrying __biz and comment_id
        :param text: raw JSON response body
        :return: None
        """

        payload = tools.get_json(text)

        __biz = tools.get_param(req_url, "__biz")
        comment_id = tools.get_param(req_url, "comment_id")  # links comments to their article

        comment_datas = []
        for comment in payload.get("elected_comment", []):
            comment_datas.append(
                dict(
                    __biz=__biz,
                    comment_id=comment_id,
                    nick_name=comment.get("nick_name"),
                    logo_url=comment.get("logo_url"),
                    content=comment.get("content"),
                    create_time=tools.timestamp_to_date(
                        comment.get("create_time")),
                    content_id=comment.get("content_id"),
                    like_num=comment.get("like_num"),
                    is_top=comment.get("is_top"),
                    spider_time=tools.get_current_date(),
                ))

        if comment_datas:
            data_pipeline.save_article_commnet(comment_datas)
Exemplo n.º 22
0
    def deal_comment(self, req_url, text):
        """
        Parse an article's comment payload and persist it.
        :param req_url: request url carrying __biz and comment_id
        :param text: raw JSON response body
        :return: None
        """
        data = tools.get_json(text)

        __biz = tools.get_param(req_url, '__biz')

        comment_id = tools.get_param(req_url, 'comment_id')  # links the comments to their article
        elected_comment = data.get('elected_comment', [])

        comment_datas = [
            dict(
                __biz=__biz,
                comment_id=comment_id,
                nick_name=comment.get('nick_name'),
                logo_url=comment.get('logo_url'),
                content=comment.get('content'),
                create_time=tools.timestamp_to_date(comment.get('create_time')),
                content_id=comment.get('content_id'),
                like_num=comment.get('like_num'),
                is_top=comment.get('is_top'),
                spider_time=tools.get_current_date()
            )
            for comment in elected_comment
        ]

        if comment_datas:
            data_pipeline.save_article_commnet(comment_datas)
Exemplo n.º 23
0
    def deal_article(self, req_url, text):
        """
        Parse an article page, persist it, and hand back the next task.
        :param req_url: the article url; carries the sn and __biz parameters
        :param text: raw HTML of the article page (may be empty on failure)
        :return: the next task from the task manager, or None when text is empty
        """
        sn = tools.get_param(req_url, "sn")

        if not text:
            # empty page: mark the article task failed (-1) and stop
            self._task_manager.update_article_task_state(sn, -1)
            return None

        selector = Selector(text)

        # article body; class name appears with and without a trailing space
        content = selector.xpath(
            '//div[@class="rich_media_content "]|//div[@class="rich_media_content"]|//div[@class="share_media"]'
        ).extract_first(default="")
        title = (selector.xpath('//h2[@class="rich_media_title"]/text()').
                 extract_first(default="").strip())
        account = (selector.xpath('//a[@id="js_name"]/text()').extract_first(
            default="").strip())
        author = (selector.xpath(
            '//span[@class="rich_media_meta rich_media_meta_text"]//text()').
                  extract_first(default="").strip())

        # publish time is embedded as a 10-digit unix timestamp: n="NNNNNNNNNN"
        publish_timestamp = selector.re_first('n="(\d{10})"')
        publish_timestamp = int(
            publish_timestamp) if publish_timestamp else None
        publish_time = (tools.timestamp_to_date(publish_timestamp)
                        if publish_timestamp else None)
        biz = tools.get_param(req_url, "__biz")

        # plain-text body for indexing
        text = remove_tags(content).strip()
        spider_name = 'wechat'
        collection_mode = 'spider'
        data_source_type = '微信公众号'

        article_data = {
            "data_type": account,
            "title": title,
            "data_address": req_url,
            "author": author,
            "publish_time": publish_time,
            "__biz": biz,
            "text": text,
            "spider_name": spider_name,
            "collection_mode": collection_mode,
            "data_source_type": data_source_type,
            "sn": sn,
            "collection_time": tools.get_current_date(),
        }

        # persist; only mark the task done (1) when the save succeeded
        if article_data and data_pipeline.save_article(
                article_data) is not None:
            self._task_manager.update_article_task_state(sn, 1)

        return self._task_manager.get_task()
def add_va_app_content_info(table, site_id, title, summary, image_url,
                            img_stor_path, url, release_time, video_url,
                            video_stor_path, content, column_id, is_download,
                            sensitive_id, violate_id, storage_id):
    '''
    @summary: Save one VA-app content record; if flagged as sensitive or
        violating, mirror it into the violation table as well.
    ---------
    @param table: target table (collection) name
    @param site_id: source site id
    @param title:
    @param summary:
    @param image_url:
    @param img_stor_path: local storage path of the image
    @param url:
    @param release_time:
    @param video_url: audio/video url; truthy marks the record as audio
    @param video_stor_path: local storage path of the video/audio
    @param content:
    @param column_id:
    @param is_download:
    @param sensitive_id: sensitive-word hit id (truthy triggers violation copy)
    @param violate_id: violation rule id (truthy triggers violation copy)
    @param storage_id:
    ---------
    @result:
    '''

    # was `video_url and 1 or 0` — the fragile and/or ternary idiom
    is_audio = 1 if video_url else 0

    content_info_dict = {
        'title': title,
        'summary': summary,
        'image_url': image_url,
        'img_stor_path': img_stor_path,
        'url': url,
        'release_time': release_time,
        'video_url': video_url,
        'video_stor_path': video_stor_path,
        'content': content,
        'column_id': column_id,
        'is_download': is_download,
        'sensitive_id': sensitive_id,
        'violate_id': violate_id,
        'storage_id': storage_id,
        'site_id': site_id,
        'record_time': tools.get_current_date(),
        'sexy_image_status': '',
        'sexy_image_url': '',
        # 'pron' spelling is established in the stored data — do not "fix"
        'image_pron_status': 0,
        'read_status': 0,
        'is_audio': is_audio
    }

    db.add(table, content_info_dict)

    if sensitive_id or violate_id:
        # NOTE(review): assumes db.add injected '_id' into the dict; confirm
        # the db layer guarantees this. Table-name typo 'vioation' is the
        # established table name — do not "fix".
        content_info_dict['content_id'] = content_info_dict['_id']
        db.add('VAApp_vioation_content_info', content_info_dict)
Exemplo n.º 25
0
def save_es():
    '''
    @summary: Export not-yet-pushed rows of TIANJIN_APP_content_info
              (es_read_status == 0) to the ES index
              tab_iimp_all_program_info, then mark each mongo row as pushed
              (es_read_status = 1). Processes batches of 40000 until no
              unpushed rows remain.
    ---------
    @result: None
    '''

    while True:
        # Next batch of rows that have not been exported to ES yet.
        content_infos = db.find('TIANJIN_APP_content_info',
                                {'es_read_status': 0},
                                limit=40000)
        #print(content_infos)
        if not content_infos:
            break
        for content_info in content_infos:
            # Compact numeric id from the last 6 hex digits of the mongo
            # ObjectId; the '_4' suffix encodes the source type (4 = APP,
            # matching TYPE_ID below).
            mongo_id = content_info['_id']
            mongo_id = int(str(mongo_id)[-6:], 16)
            uuid = str(mongo_id) + '_4'
            # site_name = content_info['site_name']
            if not content_info['release_time']:
                content_info['release_time'] = tools.get_current_date()
            site_find_date = '2018-09-23 00:00:00'  # fixed site discovery date
            es_content_info = {
                'ID': mongo_id,
                'UUID': uuid,
                'ARTICLE_URL': content_info['url'],
                'FIND_DATE': content_info['record_time'],
                'PRAISE_COUNT': 0,
                # 'IMAGE_CODE': 5,
                'IMAGE_CODE': content_info['image_pron_status'],
                'RELEASE_TIME': content_info['release_time'],
                'CONTENT': content_info['content'],
                'IMAGE_URL': content_info['image_url'],
                'SOURCE_ID': content_info['site_id'],
                'SITE_FIND_DATE': site_find_date,
                'COMMENT_COUNT': 0,
                'SOURCE_NAME': content_info['site_name'],
                'OUT_CHAIN_STATUS': 1,
                'NAME': content_info['title'],
                'TRANSPOND_COUNT': 0,
                'TYPE_ID': '4',  # 1 licensed site 2 registered site 3 unlicensed site 4 APP 5 Weibo 6 WeChat 7 OTT
                'TYPE_NAME': 'APP',
                'TASK_ID': '',
                'VIOLATE_LIBRARY': '',
                'ADDVIOLATE_DATE': None,
                'READ': 1,
                'CHECK_STATUS': 1,
                'VIOLATE_STATUS': 1,
                'MATERIAL_CHECK_VIOLATE_TYPE': '',
                'VIOLATE_CHECK_VIOLATE_TYPE': '',
                'FIELD_STR1': '',
                'FIELD_STR2': '',
                'MATERIAL_LIBRARY': '',
                'ADDMATERIAL_DATE': None,
            }
            # print(es_content_info['UUID'])
            es.add('tab_iimp_all_program_info', es_content_info,
                   uuid)  #{"es_read_status": "1"}
            # es.add_batch(es_content_info, uuid,'tab_iimp_all_program_info')
            # info.update({"_id":content_info['mongo_id']},{"$set": {"es_read_status": "1"}})
            # Mark the mongo row as pushed so it is skipped next pass.
            db.update('TIANJIN_APP_content_info', {"_id": content_info['_id']},
                      {"es_read_status": 1})
        # NOTE(review): this nested callback is redefined on every pass of the
        # while-loop and references names that are not defined anywhere in
        # this function (data, root_url, hot_id, HEADERS, export_data,
        # ExportData) — it appears to be a fragment pasted from another
        # module and is never invoked here. Confirm whether it can be removed
        # or belongs elsewhere.
        def export_callback(execute_type, sql, data_json):
            # Only proceed when the prior export did not raise.
            if execute_type != ExportData.EXCEPTION:
                infoIds = data['infoIds']
                url = root_url % infoIds
                json = tools.get_json_by_requests(url, headers=HEADERS)
                articles = json['data']

                # "EMOTION": 'vint_3',
                # "ACCOUNT": null,
                # "WEIGHT": 0,
                # "TITLE": "str_title",
                # "URL": "str_url",
                # "MAY_INVALID": ,
                # "CLUES_IDS": "",
                # "WEBSITE_NAME": "str_site",
                # "KEYWORDS_COUNT": 1,
                # "HOST": "str_site",
                # "INFO_TYPE": 'int_type',
                # "COMMENT_COUNT": null,
                # "HOT_ID": "vint_%d"%hot_id,
                # "REVIEW_COUNT": null,
                # "UUID": "73ec16038e074530ff109e3cfad2594c",
                # "ID": 'vint_%d'%article_id,
                # "IS_VIP": null,
                # "IMAGE_URL": 'str_picture',
                # "KEYWORDS": "str_keywords",
                # "KEYWORD_CLUES_ID": "{"中央电视台":"88758"}",
                # "RELEASE_TIME": "date_pubtime",
                # "AUTHOR": "江门日报",
                # "CONTENT": "clob_content",
                # "RECORD_TIME": 'vdate_%s'%tools.get_current_date(),
                # "UP_COUNT": 'vint_null'

                # Maps source-dict keys to typed Oracle column expressions.
                key_map = {
                    'id': 'int_dataId',
                    'content': 'clob_content',
                    'url': 'str_url',
                    'website_name': 'str_site',
                    'image_url': 'str_picture',
                    'release_time': 'date_pubtime',
                    'keywords': 'str_keywords',
                    'emotion': 'str_emotion',
                    'host': 'str_site',
                    'title': 'str_title',
                    'info_type': 'int_type',
                    'hot_id': "vint_%d" % hot_id,
                    'record_time': 'vdate_%s' % tools.get_current_date()
                }

                export_data.export_to_oracle(
                    key_map=key_map,
                    aim_table='TAB_IOPM_ARTICLE_INFO',
                    unique_key='url',
                    datas=articles,
                    unique_key_mapping_source_key={'url': 'str_url'},
                    sync_to_es=True)
Exemplo n.º 27
0
    def update_task_status(self, tasks, status):
        '''
        @summary: Stamp a batch of site tasks with the current time and the
                  given spider status in tab_iopm_site.
        ---------
        @param tasks: iterable of task rows; task[0] is the website id
        @param status: spider_status value to write
        ---------
        @result: None
        '''
        # Hold the class-level lock for the whole batch. Using "with"
        # guarantees release even if an update raises — the original
        # acquire()/release() pair leaked the lock on exception.
        with TaskService._lock:
            for task in tasks:
                website_id = task[0]

                # NOTE(review): the to_date mask 'yyyy-mm-dd :hh24:mi:ss'
                # contains a stray " :" — confirm it matches the string
                # produced by tools.get_current_date(). SQL is built by
                # string formatting; values come from internal task rows,
                # but parameter binding would be safer if available.
                sql = "update tab_iopm_site t set t.spider_time = to_date('%s', 'yyyy-mm-dd :hh24:mi:ss'), t.spider_status = %s where id = %s" % (
                    tools.get_current_date(), status, website_id)

                TaskService._db.update(sql)
Exemplo n.º 28
0
def add_anchor_info(table,
                    site_id,
                    name='',
                    image_url='',
                    room_id='',
                    room_url='',
                    video_path='',
                    watched_count='',
                    fans_count='',
                    sex='',
                    age='',
                    address='',
                    live_view=1,
                    watched_count_url=''):
    '''
    @summary: Insert an anchor (streamer) record; if the insert fails
              (e.g. duplicate), fall back to updating the existing record
              matched by name.
    ---------
    @param table: table name
    @param site_id: site id
    @param name: anchor name
    @param image_url: avatar/cover image url
    @param room_id: room number
    @param room_url: url of the room page
    @param video_path: url of the room's video stream
    @param watched_count: viewer count
    @param fans_count: fan count
    @param sex: gender
    @param age: age
    @param address: anchor location (city)
    @param live_view: live status (0 offline, 1 live)
    @param watched_count_url: url for the real-time viewer count
    ---------
    @result: None
    '''
    anchor_info_dict = {
        'site_id': site_id,
        'name': name,
        'image_url': image_url,
        'sex': sex,
        'age': age,
        'address': address,
        'fans_count': fans_count,
        'watched_count': watched_count,
        'room_id': room_id,
        'room_url': room_url,
        'video_path': video_path,
        'live_view': live_view,
        'record_time': tools.get_current_date(),
        'watched_count_url': watched_count_url,
        'read_status': 0
    }

    if not db.add(table, anchor_info_dict):
        # db.add may have injected an '_id' into the dict even on failure;
        # drop it before reusing the dict as an update document. pop with a
        # default avoids a KeyError when no '_id' was assigned (the original
        # bare pop('_id') crashed in that case).
        anchor_info_dict.pop('_id', None)
        db.update(table, {'name': name}, anchor_info_dict)
def add_article_info(table, website_id, source_url, title, content):
    '''
    @summary: Insert a single article record into the given table,
              stamped with the current time and an unread status.
    ---------
    @param table: collection/table name to insert into
    @param website_id: id of the source site (stored as site_id)
    @param source_url: article url
    @param title: article title
    @param content: article body
    ---------
    @result: None
    '''
    record = dict(
        site_id=website_id,
        url=source_url,
        title=title,
        content=content,
        record_time=tools.get_current_date(),
        read_status=0,
    )
    db.add(table, record)
def add_program_episode_info(table,
                             site_id,
                             program_id,
                             episode_num='',
                             time_length='',
                             episode_name='',
                             download_status='',
                             download_url='',
                             episode_url='',
                             summary='',
                             image_url='',
                             sto_path='',
                             play_count=''):
    '''
    @summary: Insert one program-episode record into the given table.
    ---------
    @param table: table name
    @param site_id: site id
    @param program_id: program id
    @param episode_num: current episode number
    @param time_length: duration
    @param episode_name: episode name
    @param download_status: download status
    @param download_url: download url
    @param episode_url: original page url
    @param summary: summary
    @param image_url: image url
    @param sto_path: local storage path of the episode
    @param play_count: play count
    ---------
    @result: None
    '''

    # Download bookkeeping is derived purely from whether a storage path
    # exists: 101 = downloaded, 102 = not downloaded.
    # NOTE(review): the incoming download_status argument is unconditionally
    # overridden here — confirm that is intended.
    if sto_path:
        download_status = 101
        sto_id = 1
    else:
        download_status = 102
        sto_id = ''

    db.add(table, {
        'site_id': site_id,
        'program_id': program_id,
        'episode_num': episode_num,
        'time_length': time_length,
        'episode_name': episode_name,
        'download_status': download_status,
        'download_url': download_url,
        'episode_url': episode_url,
        'summary': summary,
        'image_url': image_url,
        'read_status': 0,
        'record_time': tools.get_current_date(),
        'sto_path': sto_path,
        'sto_id': sto_id,
        'play_count': play_count
    })