Exemplo n.º 1
0
    def scrape_detail(self, target, detail_page_bundle, content_ruler, encode):
        try:
            order = self.web_order

            ulweb = UlWebRuler()
            result = ulweb.scan_detail(target, detail_page_bundle, order,
                                       content_ruler, encode)

            code, detail_page_result = result

            if code == 1:
                if detail_page_result is not None:
                    return (1, detail_page_result)
            elif code == -3:
                self.temp_list.append(
                    (target, detail_page_bundle, content_ruler, encode))

        except WebTargetOutOfDateException as e:
            return (-1, e.args[0])
        except Exception as e:
            import traceback
            LogGo.warning(repr(e))
            return (-1, e.args[0])

        return (0, None)
Exemplo n.º 2
0
    def gensto(self, dao, list):
        status = False
        count = len(list)
        element = 0

        if count > 0:
            for item in list:
                try:
                    id = genUUID()
                    dao.save_or_update(item, id)
                    element += 1
                except Exception as e:
                    import traceback
                    msg = traceback.format_exc()
                    # print(msg)
                    LogGo.warning(repr(e))

            LogGo.info('Total :' + str(count) + ' / ' + str(element) +
                       ' elements Saved!')
            if element == 0:
                status = False
            else:
                status = True
        else:
            LogGo.info("0 Element!")
            stauts = False

        return status
Exemplo n.º 3
0
    def scan_list(self, target, exists):
        self.limited_forward_count = target.limited_forward_count
        self.limited_attitude_count = target.limited_attitude_count

        list = []
        result_list = []

        """模拟登陆"""
        status = 'you got it'

        """如果登陆成功"""
        if status != '':
            self.loops(target,exists,list)
            if len(list) < 1:
                return (0, (target, None, None, None))
        else:
            LogGo.warning("Weibo: Loop scan faild!")
            return (-1, (target, None, None, None))

        if len(list) > 0:
            list = self.purify(list)
            list.reverse()

            for item in list:
                if exists.count(item['id']) < 1:
                    result_list.append(item)

            LogGo.debug('newrank list length:' + str(len(result_list)))

        if len(result_list) > 0:
            return (1, (target, list, None, None))
        return(-1, (target, None, None, None))
Exemplo n.º 4
0
    def store_program_type(self, pair):
        status = False

        count = len(pair)
        element = 0

        if count > 0:
            for item in pair:
                try:
                    program, type = item
                    self.program_type.save_by_program_type(program, type)

                    element += 1
                except Exception as e:
                    import traceback
                    msg = traceback.format_exc()
                    LogGo.warning(msg)

            LogGo.info('Total :' + str(count) + ' / ' + str(element) +
                       ' elements Saved!')

            if element == 0:
                status = False
            else:
                status = True
        else:
            LogGo.info("0 Element!")

        return status
Exemplo n.º 5
0
    def newrank_detail(self, target, detail_page_bundle, content_ruler,
                       encode):
        LogGo.info("On newrank detail: " + str(target.data_key))

        try:
            order = self.wechat_order

            newrank = NewrankRuler()
            result = newrank.scan_detail(target, detail_page_bundle, order,
                                         content_ruler, encode)

            code, detail_page_result = result

            if code == 1:
                if detail_page_result is not None:
                    return (1, detail_page_result)
            elif code == -3:
                self.temp_list.append(
                    (target, detail_page_bundle, content_ruler, encode))

        except Exception:
            LogGo.warning('error in newrank detail')
            return (-1, None)

        return (0, None)
Exemplo n.º 6
0
    def genup(self, dao, list):
        status = False
        count = len(list)
        element = 0

        if count > 0:
            for (update, where) in list:
                try:
                    dao.update(update, where)

                    element += 1
                except Exception as e:
                    import traceback
                    msg = traceback.format_exc()
                    # print(msg)
                    LogGo.warning(repr(e))

            LogGo.info('Total :' + str(count) + ' / ' + str(element) +
                       ' elements Updated!')
            if element == 0:
                status = False
            else:
                status = True
        else:
            LogGo.info("0 Element!")
            stauts = False

        return status
Exemplo n.º 7
0
    def store_count(self, list):
        status = False
        count = len(list)
        element = 0
        update = 0

        if count > 0:
            for program in list:
                try:
                    if self.check_for_exists(program) == 1:
                        update += 1
                    else:
                        id = genUUID()
                        self.pc.save(program, id)

                        element += 1
                except Exception as e:
                    import traceback
                    msg = traceback.format_exc()
                    LogGo.warning(msg)

            LogGo.info('Total :' + str(count) + ' (' + str(element) +
                       ' Saved, ' + str(update) + ' Updated)')

            if element == 0 and update == 0:
                status = False
            else:
                status = True
        else:
            LogGo.info("0 Element!")
            # return False

        return status
Exemplo n.º 8
0
    def store_soap_target(self, targets: [], banned: bool = False):
        status = False
        element = 0

        for target in targets:
            try:
                if banned:
                    self.banned_program.save(target)
                else:
                    self.soap_target.save(target)

                element += 1
            except Exception as e:
                import traceback
                msg = traceback.format_exc()
                LogGo.warning(msg)

        LogGo.info('Total :' + str(element) + ' / ' + str(len(targets)) +
                   ' elements Saved!')

        if element == 0:
            status = False
        else:
            status = True

        return status
Exemplo n.º 9
0
    def store_program(self, ids, programs):
        status = False

        if len(ids) != len(programs):
            LogGo.error("ids count unmatch programs count")
            return False

        count = len(ids)
        element = 0

        if count > 0:
            for id, program in zip(ids, programs):
                try:
                    self.program.save(program, id)

                    element += 1
                except Exception as e:
                    import traceback
                    msg = traceback.format_exc()
                    LogGo.warning(msg)

            LogGo.info('Total :' + str(count) + ' / ' + str(element) +
                       ' elements Saved!')

            if element == 0:
                status = False
            else:
                status = True
        else:
            LogGo.info("0 Element!")

        return status
Exemplo n.º 10
0
    def gs_detail(self, target, detail_page_bundle, content_ruler, encode):
        LogGo.info("On gs detail: " + str(target.data_key))

        try:
            order = self.wechat_order

            gs = GsdataRuler()
            result = gs.scan_detail(target, detail_page_bundle, order,
                                    content_ruler, encode)

            code, detail_page_result = result

            if code == 1:
                if detail_page_result is not None:
                    return (1, detail_page_result)
            elif code == 3:
                self.temp_list.append(
                    (target, detail_page_bundle, content_ruler, encode))

        except Exception as e:
            import traceback
            LogGo.warning(repr(e))
            return (-1, e.args[0])

        return (0, None)
Exemplo n.º 11
0
    def purify(self,list):
        if len(list) < 1:
            return []

        result = []

        for i in list:
            try:
                flag = True

                id = i['id']
                text = i['text']

                limited_attitude_count = i['attitudes_count']
                limited_forward_count = i['reposts_count']

                program_count = 0

                # 基础过滤重复id
                for seq in result[::-1]:
                    sid = seq['id']
                    if id == sid:
                        flag = False
                        break

                # 第一次节目名过滤(有可能会包含到非节目)
                if flag and text.count('《') < 1:
                    flag = False

                # 条二次参数过滤
                if flag and self.limited_attitude_count is not None and limited_attitude_count is not None:
                    if limited_attitude_count < self.limited_attitude_count:
                        flag = False

                if flag and self.limited_forward_count is not None and limited_forward_count is not None:
                    if limited_forward_count < self.limited_forward_count:
                        flag = False

                # 第三次依据节目名过滤
                if flag:
                    for program in self.exist_program:
                        if text.count(program) >= 1:
                            program_count = program_count + 1

                        if program_count > 3:
                            flag = False
                            break

                if flag:
                    result.append(i)
            except Exception as e:
                import traceback
                msg = traceback.format_exc()
                LogGo.warning(msg)

        return result
Exemplo n.º 12
0
    def start(self):
        try:
            if self.config.check_table:
                self.check_data_base()

            # RequestHelperClassVer.init(self.config)
            # ProxyHelper.init(self.config)
            # MysqlHelper.init(self.config)

            self.start_mormal_mission()
        except Exception as e:
            import traceback
            msg = traceback.format_exc()
            LogGo.warning(msg)
Exemplo n.º 13
0
    def gs_list(self, target):
        LogGo.info("On gs list: " + str(target.data_key))

        try:
            exists = self.exists_signature

            gs = GsdataRuler()
            code, value = gs.scan_list(target, exists)

            return (code, value)
        except Exception as e:
            import traceback
            LogGo.warning(repr(e))

        return (0, None)
Exemplo n.º 14
0
    def newrank_list(self, target):
        LogGo.info("On newrank list: " + str(target.data_key))

        try:
            exists = self.exists_title

            newrank = NewrankRuler()
            code, value = newrank.scan_list(target, exists)

            return (code, value)
        except Exception as e:
            import traceback
            LogGo.warning(repr(e))

            return (0, None)
Exemplo n.º 15
0
    def weibo_list(self, target):
        LogGo.info("On weibo list: " + str(target.data_key))

        try:
            exists = self.exists_identifier

            weibo = WeiboRuler()
            code, value = weibo.scan_list(target, exists)

            return (code, value)
        except Exception as e:
            import traceback
            LogGo.warning(repr(e))

        return (0, None)
Exemplo n.º 16
0
    def store(self, news_list: [], article_list: [], heavy_list: [] = None):
        status = False

        if len(article_list) != len(news_list):
            LogGo.error("news count unmatch article count")
            return False

        count = len(news_list)
        element = 0

        if count > 0:
            if heavy_list is not None:
                for news, article, heavy in zip(news_list, article_list,
                                                heavy_list):
                    try:
                        id = genUUID()

                        self.news.save(news, id)
                        self.article.save(article, id)
                        self.heavy.save_with_news_id(heavy, id)
                        element += 1
                    except Exception as e:
                        LogGo.warning(repr(e))
            else:
                for news, article in zip(news_list, article_list):
                    try:
                        id = genUUID()

                        self.news.save(news, id)
                        self.article.save(article, id)
                        element += 1
                    except Exception as e:
                        LogGo.warning(repr(e))

            LogGo.info('Total :' + str(count) + ' / ' + str(element) +
                       ' elements Saved!')

            if element == 0:
                status = False
            else:
                status = True
            # return True
        else:
            LogGo.info("0 Element!")
            # return False

        return status
Exemplo n.º 17
0
    def scrape_list(self, target):
        LogGo.info("On scrape list: " + str(target.data_key))

        try:
            exists = self.exists_url

            ulweb = UlWebRuler()
            code, value = ulweb.scan_list(target, exists)

            return (code, value)
        except WebTargetOutOfDateException as e:
            LogGo.warning(e.args[0])
        except Exception as e:
            import traceback
            LogGo.warning(repr(e))

        return (0, None)
Exemplo n.º 18
0
    def start(self):
        result = 0

        try:

            LogGo.info('搜狗爬虫')

            # Updatemp.loot()

            return result
        except Exception as e:
            import traceback
            msg = traceback.format_exc()
            print(msg)
            LogGo.warning(e)
            result = -1

        return result
Exemplo n.º 19
0
    def store_soap(self, soap):
        status = False

        if len(soap) < 1:
            LogGo.info("no data to save!")
            return False

        try:
            id = genUUID()
            self.soap.save(soap, id)

            status = True
        except Exception as e:
            import traceback
            msg = traceback.format_exc()
            LogGo.warning(msg)

        LogGo.info('Soap Saved')

        return status
Exemplo n.º 20
0
    def weibo_detail(self, target, detail_page_bundle, content_ruler, encode):
        LogGo.info("On weibo detail: " + str(target.data_key))

        try:
            order = self.weibo_order

            weibo = WeiboRuler()
            detail_page_result_dic = weibo.scan_detail(target,
                                                       detail_page_bundle,
                                                       order, content_ruler,
                                                       encode)

            if detail_page_result_dic is not None:
                return (1, detail_page_result_dic)
        except Exception as e:
            import traceback
            LogGo.warning(repr(e))
            return (-1, e.args[0])

        return (0, None)
Exemplo n.º 21
0
    def scan_list(self, target, exists):
        """请求参数"""
        par = (['flag', 'true'], ['uuid', target.extra0])
        """抓取关键字"""
        keys = [
            'title', 'author', 'publicTime', 'url', 'clicksCount', 'likeCount',
            'publicTime', 'summary'
        ]

        list = []
        result_list = []

        try:
            raw = RequestHelper.post(NewrankRuler.url,
                                     par,
                                     file_cookie=Configs.newrank_cookie_file)
        except Exception as e:
            import traceback
            msg = traceback.format_exc()
            # print(msg)
            LogGo.warning(msg)
            return (-1, (target, None, None, None))

        try:
            list = ExtraJSON.extra_newrank_wechat_list(raw, keys)
        except:
            return (-1, (target, None, None, None))

        if len(list) > 0:
            list.reverse()

            for item in list:
                if exists.count(item['title']) < 1:
                    result_list.append(item)

            LogGo.debug('newrank list length:' + str(len(result_list)))

        if len(result_list) > 0:
            return (1, (target, list, None, None))
        return (-1, (target, None, None, None))
Exemplo n.º 22
0
    def anyList(rawData, keys, start):
        par_list = keys

        # print(">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>")

        con = []
        offect = start

        dic = dict()

        for key in par_list:
            try:
                content = ExtraJSON.getContent(rawData, key, offect)
                con.append(content)

                offect += content[1]
                # print(key + " : " + content[0])

                dic[key] = content[0]
            except Exception as e:
                LogGo.warning(e)
                continue

        # print("<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<")
        # print("                                                 ")

        minLen = 0

        for i in con:
            minLen += i[1]

        try:
            nextIndex = rawData.index(keys[0], start + minLen)
        except ValueError as e:
            #LogGo.warning(e)
            return (con, 0)

        return (con, nextIndex, dic)
Exemplo n.º 23
0
    def loops(self,target,exists,result):
        try:
            base_url = target.extra0
            for i in range(0, Configs().length_weibo):  # [::-1]:
                print("page: " + str(i))

                list = self.build_and_request(WeiboRuler.keys, base_url, WeiboRuler.request_getindex, i)

                if len(list) == 0:
                    break
                for item in list:
                    """日常抓取时的重复验证"""
                    if exists.count(item['id']) < 1:
                        result.append(item)
                    else:
                        return
        except Exception as e:
            import traceback
            msg = traceback.format_exc()
            print(msg)
            LogGo.warning(repr(e))
            LogGo.warning("Scan Failed!")
            return
Exemplo n.º 24
0
    def scan_list(self, target, exists) -> ():
        # order = ScrappdeDataDao.get_max_order('web')  # 数据库中排序代码

        detail_page_bundle_list = []

        error_code = None

        first = target.extra0#'http://ent.people.com.cn/GB/81374/index1.html'
        second = target.extra1#'http://ent.people.com.cn/GB/81374/index2.html'
        third = target.extra2#'http://ent.people.com.cn/GB/81374/index3.html'

        str_parent_container = target.extra3#'(div,[class:ej_list_box clear])'
        str_list_container = target.extra4#'(li,[])'
        list_ruler = target.extra5#'link:a href()=;title:a;data:em'
        content_ruler = target.extra6

        list_json_path = target.extra9

        url = ''

        one = False

        if second == "" or second is None:
            one = True
            url = first

        analyser = None
        if one == False:
            analyser = Analyser(first, second, third)

        try:
            encode = self.extractor.get_page_encode(first)
        except HttpConnectionFailedException as e:
            return (-2, (target, None, None, None))

        for i in range(1, Configs.length_web):
            LogGo.info('scaning index: ' + str(i))

            if one:
                pass
            else:
                url = analyser.get_url(i)
            try:
                raw = self.req.get(url,encode=encode)

                if target.type == 'ulweb':
                    self.looper_html(detail_page_bundle_list, raw, exists, list_ruler, str_parent_container, str_list_container)
                elif target.type == 'jsweb':
                    raw = self.extra4_process(target.extra4, raw)

                    self.looper_js(detail_page_bundle_list, raw, exists, list_ruler, str_parent_container, list_json_path=list_json_path)
                if one:
                    break
            except AttributeError as e:
                pass
            except TypeError as e:
                import traceback
                LogGo.warning(repr(e))
            except HttpConnectionFailedException as e:
                import traceback
                LogGo.warning(repr(e))
            except Exception as e:
                import traceback
                LogGo.warning(repr(e))

        if len(detail_page_bundle_list) > 0:
            detail_page_bundle_list.reverse()
            detail_page_bundle_list = self.purify(detail_page_bundle_list,'link')

            return (1, (target, detail_page_bundle_list, content_ruler, encode))
        else:
            return (-1, (target, None, None, None))
Exemplo n.º 25
0
    def build_base_dic(self,target,list,existsUrls,order):
        news = TBNews()
        article = TBArticle()

        picture_dao = PictureDao()
        result = []
        article_result = []

        """抓取正文"""
        for i in list:
            try:
                i['content_url'] = UrlHelper.unify(i['content_url']) #StringHelper.unescape(i['content_url'])

                if existsUrls.count(i['content_url']) < 1:  # getattr(i, 'url')

                    LogGo.info(">>> file id: " + str(i['fileid']))
                    LogGo.info(">>> url: " + str(i['content_url']))

                    try:
                        tup = ExtraJSON.wechat_extra_content(i['content_url'])  # getattr(i, 'url')
                    except Exception as e:
                        print(e)
                        print(">>>  ")
                        print(">>> extra content error.")
                        print(">>>  ")
                        LogGo.info("extra content error.")
                        LogGo.info("possible a deleted msg")
                        # LogGo.info("url: " + i['content_url'])
                        continue

                    raw_content = tup[1]
                    content = tup[2]

                    """字典的 键 对应数据库中的字段名 值 对应要存储的值"""
                    dic = dict()
                    article_dic = dict()

                    order = order + 5
                    dic[news.order_code.key] = order  # """排序代码"""
                    dic[news.create_date.key] = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')  # """此条记录创建时间"""
                    dic[news.valid.key] = 1
                    dic[news.text_not_format.key] = content #"""去除标签的正文内容"""
                    # dic[news.text_blob.key] = raw_content #"""原始带标签字段"""
                    dic[news.subscribe_time.key] = i['datetime']  # getattr(i, 'publicTime') """文章发表日期"""
                    dic[news.author.key] = i['author']  # getattr(i, 'author')"""文章所属机构"""
                    dic[news.title.key] = i['title']  # getattr(i, 'title')"""文章标题"""
                    dic[news.subject.key] = i['digest'] # """摘要"""
                    dic[news.status.key] = 2

                    picture_id = picture_dao.save_data(i['cover'])
                    dic[news.main_pic_id.key] = picture_id #"""列表图片 id"""

                    article_dic[article.fingerprint.key] = md5(i['content_url'])#"""由地址生成的指纹"""
                    article_dic[article.target_id.key] = target.id
                    article_dic[article.company.key] = target.data_key  # getattr(i, 'author') """文章所属机构"""
                    article_dic[article.content_url.key] = i['content_url']  # getattr(i, 'url')"""正文链接"""
                    article_dic[article.scrabble_type.key] = 'wechat'  # """文章类型 微信固定值为  wechat  """
                    article_dic[article.is_scrabbled.key] = 1  # """在数据库中作为 这是一条抓取到的数据 的标记"""

                    result.append(dic)
                    article_result.append(article_dic)
            except Exception as e:
                import traceback
                msg = traceback.format_exc()
                print(msg)
                LogGo.warning(repr(e))
                continue

        return result, article_result
Exemplo n.º 26
0
    def scan_detail(self, target, detail_page_bundle, order, content_ruler,
                    encode):
        news = TBNews()
        article = TBArticle()

        # picture_dao = PictureDao()

        result_dic = dict()

        try:
            info = self.ready_info(detail_page_bundle['title'],
                                   detail_page_bundle['url'])
            LogGo.info(info)

            try:
                # tup = ExtraJSON.wechat_extra_content(detail_page_bundle['url'])
                tup = self.jsons.wechat_extra_content(
                    detail_page_bundle['url'])
            except HttpConnectionFailedException as e:
                LogGo.warning(repr(e))
                return (-3, None)
            except AttributeError:
                LogGo.warning(
                    "Maybe a deleted msg, complete the code to detect this error"
                )
                return (-2, None)
            except Exception:
                LogGo.warning("Error when get detail message!")
                return (-2, None)

            raw_content = tup[1]
            content = tup[2]
            picture = tup[3]
            """字典的 键 对应数据库中的字段名 值 对应要存储的值"""
            news_dic = dict()
            article_dic = dict()

            ############################## NEWS ###############################
            """列表图片 id"""
            # if picture is not None:
            #     picture_id = picture_dao.save_data(picture)
            #     news_dic[news.main_pic_id.key] = picture_id

            news_dic[news.text_not_format.key] = content  # """去除标签的正文内容"""
            # dic[news.text_blob.key] = raw_content#"""原始带标签字段"""
            news_dic[news.subscribe_time.key] = detail_page_bundle[
                'publicTime']  # """文章发表日期"""
            news_dic[news.create_date.key] = datetime.datetime.now().strftime(
                '%Y-%m-%d %H:%M:%S')  # """此条记录创建时间"""
            news_dic[news.subject.key] = detail_page_bundle[
                'summary']  # """摘要"""
            news_dic[news.valid.key] = 1
            news_dic[news.author.key] = detail_page_bundle['author']
            news_dic[news.title.key] = detail_page_bundle[
                'title']  # """文章标题"""
            news_dic[news.status.key] = 2
            order += 5
            news_dic[news.order_code.key] = order  # """排序代码"""

            ############################## ARTICLE ###############################

            article_dic[article.content_url.key] = detail_page_bundle[
                'url']  # getattr(i, 'url')"""正文链接"""
            article_dic[article.fingerprint.key] = md5(
                detail_page_bundle['url'])  # """由地址生成的指纹"""
            article_dic[article.company.key] = target.data_key  # """文章所属机构"""
            article_dic[article.target_id.key] = target.id
            article_dic[article.raw_click_count.key] = detail_page_bundle[
                'clicksCount']  # getattr(i, 'clicksCount')#"""阅读量"""
            article_dic[article.vote_up_count.key] = detail_page_bundle[
                'likeCount']  # getattr(i, 'likeCount')"""点赞数"""
            article_dic[article.scrabble_type.
                        key] = 'wechat'  # """文章类型 微信固定值为  wechat  """
            article_dic[
                article.is_scrabbled.key] = 1  # """在数据库中作为 这是一条抓取到的数据 的标记"""

            article_dic[article.publishStatus.key] = 1
            # article_dic[article.messageType.key] = random.randint(0, 1)

            ############################## DIC ###############################

            result_dic.update(news_dic)
            result_dic.update(article_dic)
        except Exception:
            import traceback
            msg = traceback.format_exc()
            LogGo.warning(msg)

            return (-1, None)

        return (1, result_dic)
Exemplo n.º 27
0
    def scan(self, target, order):
        result = []

        type = self.td(target)
        url = target.extra0  #'http://ent.people.com.cn/GB/81374/index1.html'

        cap = None
        ruler = None

        if self.td(target) == 'i':
            cap = ['var tvInfoJs=', '']
            url = self.iqiyi_base.format(url)
            ruler = 'keywords:contentKeyword;latestOrder:latestOrder;name:name;playCount:playCount;score:score;videoCount:videoCount'

        elif type == 'l':
            ruler = 'score:plist_score;comments:pcommon_count;bullets:pdm_count;like:up;hate:down;playCount:plist_play_count'
            url = self.letv_base.format(url, target.extra1)

        elif type == 't':
            cap = ["tlux.dispatch('$cover',", ");"]
            ruler = 'score:score->score;playCount:view_all_count;videoCount:episode_all;latestOrder:episode_updatedd'
            url = self.qq_base.format(url)

        elif type == 'm':
            url = self.mgtv_base.format(url)
            cap = ['"data":', ',"msg"']
            ruler = 'playCount:all;like:like;hate:unlike'

        elif type == 'y':
            ruler = 'playCount:li [总播放数];comments:li [评论];like:li [顶];score:span class=star-num'
            if not s.is_url(url):
                if not url.startswith('id'):
                    url = self.youku_prefix.format(url)
                url = self.youku_base.format(url)

        elif type == 's':
            url = self.sohu_base.format(url)

        elif type == 'c':
            url = self.cntv_base.format(url)
            ruler = 'playCount:^label [播放次数]'

        try:
            encode = ExtraHtml.get_page_encode(url)

            if type == 'y' or type == 'c':
                result = self.looper_html(url, ruler, encode, target)
            else:
                raw = RequestHelper.get(url, encode=encode)

                if type == 's':
                    result = self.finder_sohu(raw)
                else:
                    result = self.looper_js(raw, ruler, cap)
        except AttributeError as e:
            pass
        except Exception as e:
            import traceback
            msg = traceback.format_exc()
            print(msg)
            LogGo.warning(repr(e))

        if len(result) > 0:
            result = self.build_base_dic(target, result, order)

        return result[0]
Exemplo n.º 28
0
    def build_count_dic(self, pro_list):
        # pc = TBProgramPlayCount
        result = []

        try:

            LogGo.info(">>> count: " + str(len(pro_list)))

            for programs in pro_list:
                """字典的 键 对应数据库中的字段名 值 对应要存储的值"""
                dic = dict()

                total = 0

                for program in programs:
                    try:
                        dic[TBProgramPlayCount.program.key] = program[
                            TBSoap.program.key]
                        plantform = program[TBSoap.plantform.key]
                        count = program[TBSoap.play_count.key]

                        total += count

                        if self.td(plantform) == 'i':
                            dic[TBProgramPlayCount.count1.key] = count
                        elif self.td(plantform) == 'l':
                            dic[TBProgramPlayCount.count2.key] = count
                        elif self.td(plantform) == 't':
                            dic[TBProgramPlayCount.count3.key] = count
                        elif self.td(plantform) == 'm':
                            dic[TBProgramPlayCount.count4.key] = count
                        elif self.td(plantform) == 'y':
                            dic[TBProgramPlayCount.count5.key] = count
                        elif self.td(plantform) == 's':
                            dic[TBProgramPlayCount.count6.key] = count
                    except Exception as e:
                        import traceback
                        msg = traceback.format_exc()
                        LogGo.info(msg)

                dic[TBProgramPlayCount.total_count.key] = total
                dic[TBProgramPlayCount.
                    create_time.key] = datetime.datetime.now().strftime(
                        '%Y-%m-%d')  # """此条记录创建时间"""

                result.append(dic)
        except BaseDateLackException as e:
            msg = "Lake improtant data(" + str(e) + ')'
            LogGo.warning(msg)
        except DataFormatException as e:
            pass
            # msg = "Date format error: " + i['link'] + '\r\n' + str(e)
            # LogGo.warning(msg)
        except KeyError as e:
            import traceback
            msg = traceback.format_exc()
            LogGo.warning(msg)
        except Exception as e:
            import traceback
            msg = traceback.format_exc()
            LogGo.warning(msg)

        return result
Exemplo n.º 29
0
    def build_base_dic(self, target, result, order):
        soap = TBSoap()
        program_dao = ProgramDao()
        soap_result = []

        try:

            name = ''
            if Configs.show_utf:
                try:
                    name = target.data_key
                except:
                    name = '<<error>>'

            LogGo.info(">>> name: " + str(name) + "(" +
                       str(result['playCount']) + ")")
            """字典的 键 对应数据库中的字段名 值 对应要存储的值"""
            dic = dict()

            try:
                dic[soap.play_count.key] = result['playCount']  #瞬时播放量
            except KeyError as e:
                raise BaseDateLackException(str(e))

            try:
                dic[soap.keywords.key] = result['keywords']  # 关键字
            except:
                pass

            try:
                dic[soap.bullet_count.key] = result['bullets']  # 弹幕量
            except:
                pass

            try:
                dic[soap.hate_count.key] = result['hate']  # 怒踩量
            except:
                pass

            try:
                dic[soap.like_count.key] = result['like']  # 点赞量
            except:
                pass

            try:
                dic[soap.latest_order.key] = result['latestOrder']  # 最新剧集
            except:
                pass

            try:
                dic[soap.name.key] = result['name']  # 剧名
            except:
                pass

            try:
                dic[soap.name.key] = program_dao.get_title_by_id(
                    target.program_id)
            except:
                pass

            try:
                dic[soap.score.key] = result['score']  # 分数
            except:
                pass

            try:
                dic[soap.video_count.key] = result['videoCount']  # 视频数量
            except:
                pass

            try:
                # pass
                dic[soap.program.key] = target.program_id  # program
                dic[soap.target.key] = target.id  # program
            except:
                pass

            dic[soap.plantform.key] = target.soap_type

            order += 1
            dic[soap.order_code.key] = order  # """排序代码"""
            dic[soap.create_date.key] = datetime.datetime.now().strftime(
                '%Y-%m-%d %H:%M:%S')  # """此条记录创建时间"""
            dic[soap.valid.key] = 1

            soap_result.append(dic)
        except BaseDateLackException as e:
            msg = "Lake improtant data(" + str(e) + ')'
            LogGo.warning(msg)
        except DataFormatException as e:
            pass
            # msg = "Date format error: " + i['link'] + '\r\n' + str(e)
            # LogGo.warning(msg)
        except KeyError as e:
            import traceback
            msg = traceback.format_exc()
            LogGo.warning(msg)
        except Exception as e:
            import traceback
            msg = traceback.format_exc()
            LogGo.warning(msg)

        return soap_result
Exemplo n.º 30
0
    def build_single_page_dic(self, target, detail_page_bundle, order, content_ruler, encode):
        news = TBNews()
        article = TBArticle()

        result_dic = dict()

        try:
            LogGo.info(WeiboRuler.url_status + detail_page_bundle['id'])

            # blob = i['text'].encode("UTF-8")

            """字典的 键 对应数据库中的字段名 值 对应要存储的值"""
            news_dic = dict()
            article_dic = dict()

            """排序代码"""
            order += 2
            news_dic[news.order_code.key] = order
            # dic[news.text_not_format.key] = i['text']#"""去除标签的正文内容"""
            # dic[news.text_blob.key] = blob #"""原始带标签字段"""

            sub_tim = detail_page_bundle['created_at']
            if sub_tim is not None:
                news_dic[news.subscribe_time.key] = sub_tim  # getattr(i, 'publicTime') """文章发表日期"""
            else:
                LogGo.warning("no subscribe time!")

            news_dic[news.create_date.key] = DateGo.get_current_date()  # """此条记录创建时间"""
            news_dic[news.status.key] = 1  # """状态"""
            news_dic[news.valid.key] = 1

            news_dic[news.title.key] = detail_page_bundle['text']
            news_dic[news.text_not_format.key] = detail_page_bundle['text']
            news_dic[news.text_blob.key] = detail_page_bundle['text']

            # title = None
            # try:
            #     title = i['page_info']
            #     title = title['content1']
            # except Exception as e:
            #     pass

            # if title is None:
            #     dic[news.title.key] = i['text']  # getattr(i, 'title') """文章标题"""
            # else:
            #     dic[news.title.key] = title # """文章标题"""

            """文章所属机构"""
            try:
                user = detail_page_bundle['user']
                screen_name = user['screen_name']
                article_dic[article.company.key] = screen_name  # getattr
            except Exception as e:
                pass

            article_dic[article.vote_up_count.key] = detail_page_bundle['attitudes_count']  # getattr(i, 'likeCount') """点赞数"""
            article_dic[article.scrabble_type.key] = 'weibo' #"""文章类型"""
            article_dic[article.is_scrabbled.key] = 1 #"""在数据库中作为 这是一条抓取到的数据 的标记"""
            article_dic[article.identifier.key] = detail_page_bundle['id'] #"""数据在母体中的 id"""
            article_dic[article.target_id.key] = target.id
            article_dic[article.content_url.key] = WeiboRuler.url_status + detail_page_bundle['id']  # getattr(i, 'url') """正文链接"""

            article_dic[article.publishStatus.key] = 1
            # article_dic[article.messageType.key] = random.randint(0, 1)

            """如果是回复 或者 引用 会有被引用的微博,记录那个微博的 id"""
            try:
                retweeted_status = detail_page_bundle['retweeted_status']
                ret_id = retweeted_status['id']

                article_dic[article.identifier_re.key] = ret_id
            except Exception as e:
                pass

            """阅读量"""
            # dic['click_count'] = i['clicksCount'] #getattr(i, 'clicksCount')
            """转发数"""
            """评论量"""

            # """图片组"""
            # try:
            #     pics = i['pics']
            #     if len(pics) > 0:
            #         group_id = PictureDao.save_group_data(pics)
            #         if group_id is not None:
            #             dic['group_picture_id'] = group_id
            # except Exception as e:
            #     print(e)
            #     LogGo.warning(dic['content_url'])
            #     LogGo.warning(e)

            result_dic.update(article_dic)
            result_dic.update(news_dic)
        except Exception as e:
            import traceback
            msg = traceback.format_exc()
            LogGo.warning(msg)

            return None

        return result_dic