Exemplo n.º 1
0
    def store_count(self, list):
        """Persist play-count records, tallying new saves and existing hits.

        Every entry of ``list`` is passed to ``self.check_for_exists``; a
        result of ``1`` is only counted as an update (nothing is written),
        any other result saves the entry through ``self.pc.save`` under a
        freshly generated UUID.  A failure on one entry is logged with its
        traceback and does not stop the remaining entries.

        :param list: sized iterable of records (the name shadows the builtin
            but is part of the signature callers see).
        :return: ``True`` when at least one entry was saved or counted as an
            update, ``False`` otherwise, including for empty input.
        """
        total = len(list)
        if total <= 0:
            LogGo.info("0 Element!")
            return False

        saved = 0
        updated = 0
        for program in list:
            try:
                if self.check_for_exists(program) == 1:
                    updated += 1
                    continue

                record_id = genUUID()
                self.pc.save(program, record_id)
                saved += 1
            except Exception:
                import traceback
                LogGo.warning(traceback.format_exc())

        LogGo.info('Total :{} ({} Saved, {} Updated)'.format(
            total, saved, updated))

        return not (saved == 0 and updated == 0)
Exemplo n.º 2
0
    def sogou_transfor(self):
        """Placeholder for the Sogou transfer task.

        Only logs that the task was invoked.

        :return: always ``0``.
        """
        LogGo.info('搜狗转移')

        # The former implementation is disabled.  It collected news/article
        # lists through SougouTransforRuler.ExtraList, stored them with
        # self.store (result 1 on success, -1 on failure) and sent a mission
        # report through SMTPServer.build_mission_report.
        return 0
Exemplo n.º 3
0
    def gs_detail(self, target, detail_page_bundle, content_ruler, encode):
        """Scan one gsdata detail page and report the outcome as a tuple.

        :param target: scraping target; ``data_key`` is logged.
        :param detail_page_bundle: passed through to
            ``GsdataRuler.scan_detail``.
        :param content_ruler: passed through to ``GsdataRuler.scan_detail``.
        :param encode: passed through to ``GsdataRuler.scan_detail``.
        :return: ``(1, result)`` when the scan reports code ``1`` with a
            non-``None`` result; ``(-1, message)`` when an exception was
            raised, where ``message`` is the exception's first argument (or
            its ``repr`` when it has no arguments); ``(0, None)`` otherwise.
            When the scan reports code ``3`` the call arguments are appended
            to ``self.temp_list``.
        """
        LogGo.info("On gs detail: " + str(target.data_key))

        try:
            order = self.wechat_order

            gs = GsdataRuler()
            code, detail_page_result = gs.scan_detail(
                target, detail_page_bundle, order, content_ruler, encode)

            if code == 1:
                if detail_page_result is not None:
                    return (1, detail_page_result)
            elif code == 3:
                # NOTE(review): newrank_detail queues on code -3; confirm
                # that 3 really is the code GsdataRuler.scan_detail returns.
                self.temp_list.append(
                    (target, detail_page_bundle, content_ruler, encode))

        except Exception as e:
            LogGo.warning(repr(e))
            # An exception raised without arguments has empty ``args``;
            # indexing it blindly would raise IndexError from this handler.
            return (-1, e.args[0] if e.args else repr(e))

        return (0, None)
Exemplo n.º 4
0
    def gensto(self, dao, list):
        """Save every item of ``list`` through ``dao.save_or_update``.

        Each item is stored under a freshly generated UUID.  A failure on one
        item is logged (``repr`` of the exception) and the remaining items
        are still processed.

        :param dao: object exposing ``save_or_update(item, id)``.
        :param list: sized iterable of items (the name shadows the builtin
            but is part of the signature callers see).
        :return: ``True`` when at least one item was saved, ``False``
            otherwise, including for empty input.
        """
        count = len(list)
        if count <= 0:
            LogGo.info("0 Element!")
            return False

        element = 0
        for item in list:
            try:
                record_id = genUUID()
                dao.save_or_update(item, record_id)
                element += 1
            except Exception as e:
                LogGo.warning(repr(e))

        LogGo.info('Total :' + str(count) + ' / ' + str(element) +
                   ' elements Saved!')

        return element != 0
Exemplo n.º 5
0
    def start_mormal_mission(self):
        """Start the producer/consumer workers, then poll forever.

        Builds the target producer from ``get_target_list()`` and the pool
        and queue sizes in ``self.config``, builds the target consumer and
        the upload consumer, and starts them (upload consumer first, target
        producer last).  It then logs
        ``target_producer.is_all_target_transported()`` every five seconds.

        NOTE(review): the ``while True`` loop below has no ``break``, so the
        final log line and ``SMTPServer.launch_mission_report()`` are never
        reached as written.  Confirm whether the loop was meant to stop once
        all targets are transported.
        """
        # NOTE(review): declared global but neither read nor assigned here.
        global all_target_transported

        target_list = get_target_list()

        self.target_producer = target_producer(target_list,
                                               self.config.target_pool_size,
                                               self.config.target_queue_size)
        self.target_consumer = target_consumer()
        self.upload_consumer = upload_consumer(self.config.uploader_queue_size)

        # Consumers are started before the producer.
        self.upload_consumer.start()
        self.target_consumer.start()
        self.target_producer.start()

        # self.target_producer.pool.close()
        # self.target_producer.pool.join()

        # The status check is called on the ``target_producer`` name itself,
        # not on ``self.target_producer``.
        while True:  # LogGo.debug(">>> target queue unfinishd count: " + str(self.target_producer.target_queue.queue.unfinished_tasks))
            time.sleep(5)
            LogGo.debug("target_transported_over: " +
                        str(target_producer.is_all_target_transported()))

        # self.target_consumer.queue.queue.join()
        # time.sleep(6000)

        # Unreachable while the loop above never exits.
        LogGo.info('Loop Done! task count: ' + str(len(target_list)))
        SMTPServer.launch_mission_report()
Exemplo n.º 6
0
    def newrank_detail(self, target, detail_page_bundle, content_ruler,
                       encode):
        """Scan one newrank detail page and report the outcome as a tuple.

        :param target: scraping target; ``data_key`` is logged.
        :param detail_page_bundle: passed through to
            ``NewrankRuler.scan_detail``.
        :param content_ruler: passed through to ``NewrankRuler.scan_detail``.
        :param encode: passed through to ``NewrankRuler.scan_detail``.
        :return: ``(1, result)`` when the scan reports code ``1`` with a
            non-``None`` result, ``(-1, None)`` when any exception was
            raised, ``(0, None)`` otherwise.  When the scan reports code
            ``-3`` the call arguments are appended to ``self.temp_list``.
        """
        LogGo.info("On newrank detail: " + str(target.data_key))

        try:
            current_order = self.wechat_order
            ruler = NewrankRuler()
            code, payload = ruler.scan_detail(target, detail_page_bundle,
                                              current_order, content_ruler,
                                              encode)

            if code == 1 and payload is not None:
                return (1, payload)

            if code == -3:
                self.temp_list.append(
                    (target, detail_page_bundle, content_ruler, encode))
        except Exception:
            LogGo.warning('error in newrank detail')
            return (-1, None)

        return (0, None)
Exemplo n.º 7
0
    def check_data_base(self):
        """Check the database structure for every known table model.

        Instantiates each table class, hands the instances to a
        ``StructureGuard`` built from ``Configs()`` and logs the returned
        report when it is not empty.
        """
        table_classes = (
            TBNews,
            TBArticle,
            TBProgram,
            TBWenzhangInfo,
            TBDictionaryType,
            TBDictionary,
            TBNewsGroup,
            TBNlpFilter,
            TBScrapingTarget,
            TBSoap,
            TBSoapTarget,
            TBGlobalTarget,
            TBMR,
            TBSpecialTarget,
            TBProgramType,
            TBSoapBlackList,
            TBHeavyText,
        )
        tables = [table_class() for table_class in table_classes]

        report = StructureGuard(Configs()).check(tables)

        if len(report) > 0:
            LogGo.info(str(report))
Exemplo n.º 8
0
    def genup(self, dao, list):
        """Apply every ``(update, where)`` pair of ``list`` via ``dao.update``.

        A failure on one pair is logged (``repr`` of the exception) and the
        remaining pairs are still processed.

        :param dao: object exposing ``update(update, where)``.
        :param list: sized iterable of ``(update, where)`` pairs (the name
            shadows the builtin but is part of the signature callers see).
        :return: ``True`` when at least one pair was applied, ``False``
            otherwise, including for empty input.
        """
        count = len(list)
        if count <= 0:
            LogGo.info("0 Element!")
            return False

        element = 0
        for (update, where) in list:
            try:
                dao.update(update, where)
                element += 1
            except Exception as e:
                LogGo.warning(repr(e))

        LogGo.info('Total :' + str(count) + ' / ' + str(element) +
                   ' elements Updated!')

        return element != 0
Exemplo n.º 9
0
    def store_program_type(self, pair):
        """Save each ``(program, type)`` entry through ``self.program_type``.

        Entries that cannot be unpacked or saved are logged with their
        traceback and skipped.

        :param pair: sized iterable of two-item entries.
        :return: ``True`` when at least one entry was saved, ``False``
            otherwise, including for empty input.
        """
        total = len(pair)
        if total <= 0:
            LogGo.info("0 Element!")
            return False

        saved = 0
        for entry in pair:
            try:
                program, program_type = entry
                self.program_type.save_by_program_type(program, program_type)
                saved += 1
            except Exception:
                import traceback
                LogGo.warning(traceback.format_exc())

        LogGo.info('Total :{} / {} elements Saved!'.format(total, saved))

        return saved != 0
Exemplo n.º 10
0
    def store_soap_target(self, targets: [], banned: bool = False):
        """Save every target to the soap-target or the banned-program store.

        :param targets: sized iterable of targets to save.
        :param banned: when truthy the targets go to ``self.banned_program``,
            otherwise to ``self.soap_target``.
        :return: ``True`` when at least one target was saved, ``False``
            otherwise.
        """
        saved = 0

        for target in targets:
            try:
                store = self.banned_program if banned else self.soap_target
                store.save(target)
                saved += 1
            except Exception:
                import traceback
                LogGo.warning(traceback.format_exc())

        # NOTE(review): this message prints saved / total, whereas the
        # sibling store methods print total / saved.
        LogGo.info('Total :{} / {} elements Saved!'.format(
            saved, len(targets)))

        return saved != 0
Exemplo n.º 11
0
    def store_program(self, ids, programs):
        """Save each program under the id at the same position in ``ids``.

        :param ids: sized iterable of ids.
        :param programs: sized iterable of programs; must have the same
            length as ``ids``.
        :return: ``False`` when the lengths differ, the input is empty or
            nothing could be saved; ``True`` when at least one program was
            saved.  Individual failures are logged with their traceback.
        """
        if len(ids) != len(programs):
            LogGo.error("ids count unmatch programs count")
            return False

        total = len(ids)
        if total <= 0:
            LogGo.info("0 Element!")
            return False

        saved = 0
        for program_id, program in zip(ids, programs):
            try:
                self.program.save(program, program_id)
                saved += 1
            except Exception:
                import traceback
                LogGo.warning(traceback.format_exc())

        LogGo.info('Total :{} / {} elements Saved!'.format(total, saved))

        return saved != 0
Exemplo n.º 12
0
    def ExtraList(self, target, existsUrls, order):
        """Collect the message list of a WeChat target and build its rows.

        The first page is fetched from ``target.extra0`` and parsed with
        ``ExtraJSON.extraWechatList``; further pages are pulled through
        ``self.loopToFail`` until it returns an empty page, reports a
        continue flag other than ``1`` or raises.  The collected entries are
        reversed and handed to ``self.build_base_dic``.

        :param target: scraping target; ``extra0`` is used as the URL.
        :param existsUrls: passed through to ``self.build_base_dic``.
        :param order: passed through to ``self.build_base_dic``.
        :return: whatever ``self.build_base_dic`` returns, or an empty list
            when the first page cannot be parsed.
            NOTE(review): the two return shapes may differ; confirm callers
            handle both.
        """
        url = str(target.extra0)

        # Fetch the first list page.
        raw = WechatRuler.req._get(url)

        try:
            first_page = ExtraJSON.extraWechatList(raw, 'msgList',
                                                   WechatRuler.keys)
            entries = first_page[0]
            cursor = str(first_page[1])
        except Exception as err:
            print(err)
            print("ERROR")
            return []

        keep_going = True
        while keep_going:
            try:
                progress = '>>> scaning id: ' + cursor
                print(progress)
                LogGo.info(progress)

                page = self.loopToFail(url, cursor)
                batch = page[0]
                cursor = str(page[1])
                continue_flag = page[2]

                if len(batch) > 0:
                    for entry in batch:
                        entries.append(entry)
                    # Stop as soon as the flag is anything other than 1.
                    keep_going = not (continue_flag != 1)
                else:
                    keep_going = False
            except Exception as err:
                print(err)
                keep_going = False

        print('>>> list scaning completed')
        print('>>>')

        entries.reverse()

        print('>>> Start Build SQL')
        built = self.build_base_dic(target, entries, existsUrls, order)
        print('>>> Build SQL Success')
        print('>>>')

        return built
Exemplo n.º 13
0
    def start(self):
        """Repeatedly submit every target to the worker pool.

        Each round hands all of ``self.targets`` to ``self.pool.apply_async``
        (running ``target_producer.worker`` on each one) and then sleeps for
        ``Configs().work_interval`` before the next round.  The loop has no
        exit condition.

        :return: never returns normally.
        """
        while True:
            LogGo.info("Start target pool")
            for target in self.targets:
                self.pool.apply_async(target_producer.worker, (target, ))
            time.sleep(Configs().work_interval)
Exemplo n.º 14
0
    def weibo_list(self, target):
        """Scan the weibo list of ``target``.

        :param target: scraping target; ``data_key`` is logged.
        :return: the ``(code, value)`` pair from ``WeiboRuler.scan_list``,
            or ``(0, None)`` when anything raised (the error is logged).
        """
        LogGo.info("On weibo list: " + str(target.data_key))

        try:
            known = self.exists_identifier
            code, value = WeiboRuler().scan_list(target, known)
        except Exception as err:
            LogGo.warning(repr(err))
            return (0, None)

        return (code, value)
Exemplo n.º 15
0
    def newrank_list(self, target):
        """Scan the newrank list of ``target``.

        :param target: scraping target; ``data_key`` is logged.
        :return: the ``(code, value)`` pair from ``NewrankRuler.scan_list``,
            or ``(0, None)`` when anything raised (the error is logged).
        """
        LogGo.info("On newrank list: " + str(target.data_key))

        outcome = (0, None)
        try:
            known_titles = self.exists_title
            code, value = NewrankRuler().scan_list(target, known_titles)
            outcome = (code, value)
        except Exception as err:
            LogGo.warning(repr(err))

        return outcome
Exemplo n.º 16
0
    def gs_list(self, target):
        """Scan the gsdata list of ``target``.

        :param target: scraping target; ``data_key`` is logged.
        :return: the ``(code, value)`` pair from ``GsdataRuler.scan_list``,
            or ``(0, None)`` when anything raised (the error is logged).
        """
        LogGo.info("On gs list: " + str(target.data_key))

        try:
            known_signatures = self.exists_signature
            code, value = GsdataRuler().scan_list(target, known_signatures)
        except Exception as err:
            LogGo.warning(repr(err))
            return (0, None)

        return (code, value)
Exemplo n.º 17
0
    def store(self, news_list: [], article_list: [], heavy_list: [] = None):
        """Save news/article pairs, optionally with their heavy text.

        Each pair is stored under one freshly generated UUID through
        ``self.news.save`` and ``self.article.save``; when ``heavy_list`` is
        given, the matching heavy entry is stored with
        ``self.heavy.save_with_news_id`` under the same id.  A failure on one
        row is logged and the remaining rows are still processed.

        :param news_list: sized iterable of news rows.
        :param article_list: sized iterable of article rows; must have the
            same length as ``news_list``.
        :param heavy_list: optional iterable of heavy-text rows, paired by
            position.
        :return: ``False`` when the lengths differ, the input is empty or
            nothing could be saved; ``True`` when at least one row was saved.
        """
        if len(article_list) != len(news_list):
            LogGo.error("news count unmatch article count")
            return False

        count = len(news_list)
        if count <= 0:
            LogGo.info("0 Element!")
            return False

        with_heavy = heavy_list is not None
        # Without heavy rows, pad with None so one loop serves both cases.
        heavy_rows = heavy_list if with_heavy else [None] * count

        saved = 0
        for news, article, heavy in zip(news_list, article_list, heavy_rows):
            try:
                record_id = genUUID()

                self.news.save(news, record_id)
                self.article.save(article, record_id)
                if with_heavy:
                    self.heavy.save_with_news_id(heavy, record_id)
                saved += 1
            except Exception as err:
                LogGo.warning(repr(err))

        LogGo.info('Total :{} / {} elements Saved!'.format(count, saved))

        return saved != 0
Exemplo n.º 18
0
    def scrape_list(self, target):
        """Scan the web list page of ``target``.

        :param target: scraping target; ``data_key`` is logged.
        :return: the ``(code, value)`` pair from ``UlWebRuler.scan_list``,
            or ``(0, None)`` when anything raised.  A
            ``WebTargetOutOfDateException`` is logged by its first argument,
            any other exception by its ``repr``.
        """
        LogGo.info("On scrape list: " + str(target.data_key))

        try:
            exists = self.exists_url

            ulweb = UlWebRuler()
            code, value = ulweb.scan_list(target, exists)

            return (code, value)
        except WebTargetOutOfDateException as e:
            # ``args`` is empty when the exception was raised without a
            # message; indexing it blindly would raise IndexError out of
            # this handler and escape the method.
            LogGo.warning(e.args[0] if e.args else repr(e))
        except Exception as e:
            LogGo.warning(repr(e))

        return (0, None)
Exemplo n.º 19
0
    def start(self):
        """Entry point of the Sogou crawler (currently only logs).

        :return: ``0`` normally, ``-1`` when logging the start message
            raised; in that case the traceback is printed and the exception
            is logged as a warning.
        """
        try:
            LogGo.info('搜狗爬虫')

            # Updatemp.loot()
        except Exception as err:
            import traceback
            print(traceback.format_exc())
            LogGo.warning(err)
            return -1

        return 0
Exemplo n.º 20
0
    def weibo_detail(self, target, detail_page_bundle, content_ruler, encode):
        """Scan one weibo detail page and report the outcome as a tuple.

        :param target: scraping target; ``data_key`` is logged.
        :param detail_page_bundle: passed through to
            ``WeiboRuler.scan_detail``.
        :param content_ruler: passed through to ``WeiboRuler.scan_detail``.
        :param encode: passed through to ``WeiboRuler.scan_detail``.
        :return: ``(1, result)`` when the scan returns a non-``None``
            result; ``(-1, message)`` when an exception was raised, where
            ``message`` is the exception's first argument (or its ``repr``
            when it has no arguments); ``(0, None)`` otherwise.
        """
        LogGo.info("On weibo detail: " + str(target.data_key))

        try:
            order = self.weibo_order

            weibo = WeiboRuler()
            detail_page_result_dic = weibo.scan_detail(target,
                                                       detail_page_bundle,
                                                       order, content_ruler,
                                                       encode)

            if detail_page_result_dic is not None:
                return (1, detail_page_result_dic)
        except Exception as e:
            LogGo.warning(repr(e))
            # An exception raised without arguments has empty ``args``;
            # indexing it blindly would raise IndexError from this handler.
            return (-1, e.args[0] if e.args else repr(e))

        return (0, None)
Exemplo n.º 21
0
    def store_soap(self, soap):
        """Save one soap record under a freshly generated UUID.

        :param soap: sized record to save; an empty record is rejected.
        :return: ``True`` when the record was saved, ``False`` when it was
            empty or saving raised (the traceback is logged).
        """
        if len(soap) < 1:
            LogGo.info("no data to save!")
            return False

        try:
            record_id = genUUID()
            self.soap.save(soap, record_id)
        except Exception:
            import traceback
            LogGo.warning(traceback.format_exc())
            return False

        # Only report success when the save went through; previously this
        # was logged even after a failed save.
        LogGo.info('Soap Saved')

        return True
Exemplo n.º 22
0
    def extraAnyList(rawData, startStr, keys):
        """Collect the elements ``ExtraJSON.anyList`` finds in ``rawData``.

        Scanning starts at the position of ``startStr``.  Each call to
        ``ExtraJSON.anyList`` returns a sequence whose item ``1`` is the
        index for the next call (``0`` ends the scan) and whose item ``2``
        is the extracted element.  An exception during the scan is logged
        and ends it, keeping what was collected so far.

        :param rawData: data to scan; must provide ``index``.
        :param startStr: marker locating where scanning starts.
        :param keys: passed through to ``ExtraJSON.anyList``.
        :return: list of extracted elements, most recently found first.
        :raises: whatever ``rawData.index(startStr)`` raises when the marker
            is missing (``ValueError`` for ``str``); that lookup is outside
            the ``try``.
        """
        index = rawData.index(startStr)

        found = []

        while True:
            try:
                step = ExtraJSON.anyList(rawData, keys, index)

                if step[1] == 0:
                    break

                found.append(step[2])

                index = step[1]
            except Exception as e:
                LogGo.info(e)
                break

        # Elements were appended in discovery order (O(1) each); a single
        # reverse gives the newest-first order that the former repeated
        # ``insert(0, ...)`` produced at O(n) per element.
        found.reverse()

        return found
Exemplo n.º 23
0
    def send_request(self, result_dic):
        """Post one result row to the configured endpoint and log the call.

        The row is posted as JSON, wrapped in a single-item ``rowList``.
        For the log, a copy of the row with its two text fields replaced by
        a dummy value is serialised instead of the real content.  Any
        exception is reported through ``E.out_err()``.

        :param result_dic: dict holding the row to post.
        """
        payload = {
            'date': DateGo.get_current_date(),
            'targetId': 'No Target ID!',
            'rowList': [result_dic],
        }

        try:
            LogGo.info("Ready to Post!")

            response = RequestHelper.post(Configs.fish_data_post_url,
                                          json=payload)

            # Replace the bulky text fields before logging the payload.
            preview = result_dic.copy()
            for bulky_key in ('text_not_format_clob', 'text_blob'):
                preview[bulky_key] = 'DUMMY CONTENT'
            payload['rowList'] = preview

            LogGo.info("POST CONTENT: " + json.dumps(payload))
            LogGo.info("POST RESPONSE: " + str(response))
        except Exception:
            E.out_err()
Exemplo n.º 24
0
    def worker(target):
        """Run one list scan for ``target`` and queue its result.

        After a random pause, the time since the last access is compared
        with the target's frequency.  A target that is not yet due is
        recorded with ``ScrabingTarget.set_elog`` and skipped.  Otherwise
        the scan matching ``target.type`` is run and its result is passed to
        ``target_producer.send_to_queue``.  Unknown types are ignored.

        :param target: scraping target; ``last_access_date``, ``frequency``,
            ``type``, ``soap_type``, ``data_key``, ``extra0`` and ``id`` are
            read.
        :return: ``None``.
        """
        # Random pause of 1-4 seconds before each scan.
        time.sleep(random.randint(1, 4))

        last_access = target.last_access_date
        frequency = target.frequency

        if frequency is None:
            frequency = 0
        if last_access is None:
            last_access = DateGo.date_befor_days(365, True)

        distance = DateGo.distance(last_access)

        if distance < frequency:
            ScrabingTarget.set_elog(target.id, "frequency")
            LogGo.info("type: " + str(target.type) + str(target.soap_type) +
                       " name: " + str(target.data_key) +
                       " skipped, last access data: " + str(last_access) +
                       "\r\n")
            # Fix: the target used to be reported as skipped and then
            # scanned anyway; stop here so the skip really happens.
            return

        if target.type in ('ulweb', 'jsweb'):
            msg = '>>> Ulweb(' + target.data_key + ')' + ' last acc: ' + str(
                last_access)
            LogGo.info(msg)

            result = WebStrategy().scrape_list(target)
            target_producer.send_to_queue(result)

        # The branch for type 'wechat' (official WeChat history messages via
        # WechatStrategy.ScanWechatTarget) is disabled.

        elif target.type == 'newrank':
            msg = '>>> Newrank(' + target.extra0 + ')' + ' last acc: ' + str(
                last_access)
            LogGo.info(msg)

            result = WechatStrategy().newrank_list(target)
            target_producer.send_to_queue(result)

        elif target.type == 'gsdata':
            msg = '>>> Gsdata(' + target.extra0 + ')' + ' last acc: ' + str(
                last_access)
            LogGo.info(msg)

            result = WechatStrategy().gs_list(target)
            target_producer.send_to_queue(result)

        elif target.type == 'weibo':
            msg = '>>> Weibo(' + target.extra0 + ')' + ' last acc: ' + str(
                last_access)
            LogGo.info(msg)

            result = WeiboStrategy().weibo_list(target)
            target_producer.send_to_queue(result)
Exemplo n.º 25
0
    def build_base_dic(self,target,list,existsUrls,order):
        """Build news and article row dicts for the WeChat entries in ``list``.

        For every entry whose unified ``content_url`` is not already in
        ``existsUrls``, the article content is fetched through
        ``ExtraJSON.wechat_extra_content``, the cover is passed to
        ``PictureDao.save_data`` and two dicts keyed by table column names
        are produced.  Entries that fail are logged and skipped.

        :param target: scraping target; ``id`` and ``data_key`` are read.
        :param list: iterable of entry dicts; the keys ``content_url``,
            ``fileid``, ``datetime``, ``author``, ``title``, ``digest`` and
            ``cover`` are read, and ``content_url`` is rewritten in place.
        :param existsUrls: collection of known URLs; only ``count`` is
            called on it.
        :param order: starting order code; raised by 5 before each stored
            entry.
        :return: tuple ``(result, article_result)`` of two lists of dicts
            that grow in step (news rows, article rows).
        """
        news = TBNews()
        article = TBArticle()

        picture_dao = PictureDao()
        result = []
        article_result = []

        # The bare string below is a leftover section label:
        # "fetch the article body".
        """抓取正文"""
        for i in list:
            try:
                # Normalise the URL before comparing it with the known ones.
                i['content_url'] = UrlHelper.unify(i['content_url']) # formerly StringHelper.unescape(i['content_url'])

                # Only entries whose URL is not known yet are processed.
                if existsUrls.count(i['content_url']) < 1:  # getattr(i, 'url')

                    LogGo.info(">>> file id: " + str(i['fileid']))
                    LogGo.info(">>> url: " + str(i['content_url']))

                    try:
                        tup = ExtraJSON.wechat_extra_content(i['content_url'])  # getattr(i, 'url')
                    except Exception as e:
                        # The content could not be extracted (the log text
                        # suggests a deleted message); skip this entry.
                        print(e)
                        print(">>>  ")
                        print(">>> extra content error.")
                        print(">>>  ")
                        LogGo.info("extra content error.")
                        LogGo.info("possible a deleted msg")
                        # LogGo.info("url: " + i['content_url'])
                        continue

                    # presumably tup[1] is the raw markup and tup[2] the
                    # tag-free text - TODO confirm against ExtraJSON.
                    raw_content = tup[1]
                    content = tup[2]

                    # The bare string below is a leftover label: "dict keys
                    # are database column names, values are what is stored".
                    """字典的 键 对应数据库中的字段名 值 对应要存储的值"""
                    dic = dict()
                    article_dic = dict()

                    order = order + 5
                    dic[news.order_code.key] = order  # order code
                    dic[news.create_date.key] = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')  # creation time of this record
                    dic[news.valid.key] = 1
                    dic[news.text_not_format.key] = content # body text with tags removed
                    # dic[news.text_blob.key] = raw_content # original text including tags
                    dic[news.subscribe_time.key] = i['datetime']  # publication date of the article
                    dic[news.author.key] = i['author']  # organisation the article belongs to
                    dic[news.title.key] = i['title']  # article title
                    dic[news.subject.key] = i['digest'] # summary
                    dic[news.status.key] = 2

                    picture_id = picture_dao.save_data(i['cover'])
                    dic[news.main_pic_id.key] = picture_id # id of the list picture

                    article_dic[article.fingerprint.key] = md5(i['content_url']) # fingerprint derived from the URL
                    article_dic[article.target_id.key] = target.id
                    article_dic[article.company.key] = target.data_key  # organisation the article belongs to
                    article_dic[article.content_url.key] = i['content_url']  # link to the article body
                    article_dic[article.scrabble_type.key] = 'wechat'  # article type; fixed to 'wechat' here
                    article_dic[article.is_scrabbled.key] = 1  # marks the row as scraped data

                    result.append(dic)
                    article_result.append(article_dic)
            except Exception as e:
                # Any other failure on this entry: print the traceback, log
                # the exception and move on to the next entry.
                import traceback
                msg = traceback.format_exc()
                print(msg)
                LogGo.warning(repr(e))
                continue

        return result, article_result
Exemplo n.º 26
0
    def build_count_dic(self, pro_list):
        """Build one play-count row per group of program records.

        For each group, the per-platform play counts are copied into the
        ``count1``..``count6`` columns according to the code returned by
        ``self.td`` for the record's platform, the counts are summed into
        the total column and the creation date is stamped.  A failing record
        is logged and skipped; a failure outside the per-record handling
        stops the build and returns the rows collected so far.

        :param pro_list: sized iterable of groups, each an iterable of
            records indexable by the ``TBSoap`` column keys.
        :return: list of row dicts keyed by ``TBProgramPlayCount`` column
            keys.
        """
        rows = []

        # Platform code returned by self.td -> name of the count column.
        column_by_code = (
            ('i', 'count1'),
            ('l', 'count2'),
            ('t', 'count3'),
            ('m', 'count4'),
            ('y', 'count5'),
            ('s', 'count6'),
        )

        try:
            LogGo.info(">>> count: " + str(len(pro_list)))

            for programs in pro_list:
                # Keys are database column names, values are stored as-is.
                row = dict()
                total = 0

                for program in programs:
                    try:
                        row[TBProgramPlayCount.program.key] = program[
                            TBSoap.program.key]
                        plantform = program[TBSoap.plantform.key]
                        count = program[TBSoap.play_count.key]

                        total += count

                        for code, column_name in column_by_code:
                            if self.td(plantform) == code:
                                column = getattr(TBProgramPlayCount,
                                                 column_name)
                                row[column.key] = count
                                break
                    except Exception:
                        import traceback
                        LogGo.info(traceback.format_exc())

                row[TBProgramPlayCount.total_count.key] = total
                # Creation date of this record.
                row[TBProgramPlayCount.create_time.key] = (
                    datetime.datetime.now().strftime('%Y-%m-%d'))

                rows.append(row)
        except BaseDateLackException as err:
            LogGo.warning("Lake improtant data(" + str(err) + ')')
        except DataFormatException:
            pass
        except Exception:
            import traceback
            LogGo.warning(traceback.format_exc())

        return rows
Exemplo n.º 27
0
    def build_base_dic(self, target, result, order):
        """Build the row dict for one soap scrape result.

        ``result['playCount']`` is required; the other values are copied
        only when they can be read.  The name taken from ``result`` is
        overwritten by the title ``ProgramDao.get_title_by_id`` returns for
        ``target.program_id`` when that lookup succeeds.

        :param target: scraping target; ``data_key``, ``program_id``,
            ``id`` and ``soap_type`` are read.
        :param result: mapping of scraped values.
        :param order: current order code; the row stores ``order + 1``.
        :return: a one-element list holding the row dict, or an empty list
            when building failed (the failure is logged).
        """
        soap = TBSoap()
        program_dao = ProgramDao()
        soap_result = []

        # (TBSoap column attribute, key in ``result``) for optional values
        # copied verbatim.  Split in two so the title lookup keeps its
        # original position between ``name`` and ``score``.
        optional_before_title = (
            ('keywords', 'keywords'),       # keywords
            ('bullet_count', 'bullets'),    # bullet-comment count
            ('hate_count', 'hate'),         # dislike count
            ('like_count', 'like'),         # like count
            ('latest_order', 'latestOrder'),  # latest episode
            ('name', 'name'),               # series name
        )
        optional_after_title = (
            ('score', 'score'),             # score
            ('video_count', 'videoCount'),  # number of videos
        )

        try:
            name = ''
            if Configs.show_utf:
                try:
                    name = target.data_key
                except Exception:
                    name = '<<error>>'

            LogGo.info(">>> name: " + str(name) + "(" +
                       str(result['playCount']) + ")")

            # Keys are database column names, values are stored as-is.
            dic = dict()

            def copy_optional(column, key):
                # Best effort: an optional value that cannot be read is
                # simply left out of the row.
                try:
                    dic[getattr(soap, column).key] = result[key]
                except Exception:
                    pass

            try:
                dic[soap.play_count.key] = result['playCount']  # play count
            except KeyError as e:
                raise BaseDateLackException(str(e)) from e

            for column, key in optional_before_title:
                copy_optional(column, key)

            # Prefer the stored program title over the scraped name.
            try:
                dic[soap.name.key] = program_dao.get_title_by_id(
                    target.program_id)
            except Exception:
                pass

            for column, key in optional_after_title:
                copy_optional(column, key)

            try:
                dic[soap.program.key] = target.program_id
                dic[soap.target.key] = target.id
            except Exception:
                pass

            dic[soap.plantform.key] = target.soap_type

            order += 1
            dic[soap.order_code.key] = order  # order code
            dic[soap.create_date.key] = datetime.datetime.now().strftime(
                '%Y-%m-%d %H:%M:%S')  # creation time of this record
            dic[soap.valid.key] = 1

            soap_result.append(dic)
        except BaseDateLackException as e:
            msg = "Lake improtant data(" + str(e) + ')'
            LogGo.warning(msg)
        except DataFormatException:
            pass
        except Exception:
            import traceback
            LogGo.warning(traceback.format_exc())

        return soap_result
Exemplo n.º 28
0
    def build_single_page_dic(self, target, detail_page_bundle, order, content_ruler, encode):
        """Build the merged news/article row for one weibo status.

        :param target: scraping target; ``id`` is read.
        :param detail_page_bundle: mapping describing the status; the keys
            ``id``, ``created_at``, ``text`` and ``attitudes_count`` are
            required, ``user`` and ``retweeted_status`` are used when
            present.
        :param order: current order code; the row stores ``order + 2``.
        :param content_ruler: not used by this method.
        :param encode: not used by this method.
        :return: one dict holding the article columns updated with the news
            columns, or ``None`` when anything failed (the traceback is
            logged).
        """
        news = TBNews()
        article = TBArticle()

        result_dic = dict()

        try:
            status_url = WeiboRuler.url_status + detail_page_bundle['id']
            LogGo.info(status_url)

            # Keys are database column names, values are stored as-is.
            news_row = dict()
            article_row = dict()

            # Order code.
            order += 2
            news_row[news.order_code.key] = order

            # Publication time of the status.
            published = detail_page_bundle['created_at']
            if published is None:
                LogGo.warning("no subscribe time!")
            else:
                news_row[news.subscribe_time.key] = published

            news_row[news.create_date.key] = DateGo.get_current_date()  # creation time of this record
            news_row[news.status.key] = 1  # status
            news_row[news.valid.key] = 1

            # The status text serves as title, plain text and blob alike.
            news_row[news.title.key] = detail_page_bundle['text']
            news_row[news.text_not_format.key] = detail_page_bundle['text']
            news_row[news.text_blob.key] = detail_page_bundle['text']

            # Organisation the article belongs to (optional).
            try:
                author = detail_page_bundle['user']
                article_row[article.company.key] = author['screen_name']
            except Exception:
                pass

            article_row[article.vote_up_count.key] = detail_page_bundle['attitudes_count']  # like count
            article_row[article.scrabble_type.key] = 'weibo'  # article type
            article_row[article.is_scrabbled.key] = 1  # marks the row as scraped data
            article_row[article.identifier.key] = detail_page_bundle['id']  # id of the status at its source
            article_row[article.target_id.key] = target.id
            article_row[article.content_url.key] = status_url  # link to the status

            article_row[article.publishStatus.key] = 1

            # A reply or quote carries the referenced status; keep its id.
            try:
                referenced = detail_page_bundle['retweeted_status']
                article_row[article.identifier_re.key] = referenced['id']
            except Exception:
                pass

            # News columns are applied last, so they win on a key clash.
            result_dic.update(article_row)
            result_dic.update(news_row)
        except Exception:
            import traceback
            LogGo.warning(traceback.format_exc())

            return None

        return result_dic
Exemplo n.º 29
0
    def scan_detail(self, target, detail_page_bundle, order, content_ruler,
                    encode):
        """Assemble the storage record for a single WeChat article.

        The article body is fetched through
        ``self.jsons.wechat_extra_content`` and combined with the values
        already present in ``detail_page_bundle``. Values are keyed by the
        ``key`` of the matching TBNews / TBArticle column.

        :param target: scrape target; its ``data_key`` and ``id`` are stored.
        :param detail_page_bundle: mapping describing the article. The keys
            ``title``, ``url``, ``publicTime``, ``summary``, ``author``,
            ``clicksCount`` and ``likeCount`` are read.
        :param order: base order code; ``order + 5`` is what gets stored.
        :param content_ruler: not used by this method.
        :param encode: not used by this method.
        :return: a ``(status, record)`` tuple:
            ``(1, dict)`` on success;
            ``(-3, None)`` if the fetch raised HttpConnectionFailedException;
            ``(-2, None)`` if the fetch raised any other exception;
            ``(-1, None)`` if any other step raised (traceback is logged).
        """
        news_cols = TBNews()
        article_cols = TBArticle()

        record = dict()

        try:
            bundle = detail_page_bundle
            banner = self.ready_info(bundle['title'], bundle['url'])
            LogGo.info(banner)

            # Fetch failures are reported through dedicated status codes
            # instead of falling through to the generic handler below.
            try:
                fetched = self.jsons.wechat_extra_content(bundle['url'])
            except HttpConnectionFailedException as conn_err:
                LogGo.warning(repr(conn_err))
                return (-3, None)
            except AttributeError:
                LogGo.warning(
                    "Maybe a deleted msg, complete the code to detect this error"
                )
                return (-2, None)
            except Exception:
                LogGo.warning("Error when get detail message!")
                return (-2, None)

            # Slots 1 (tagged raw content) and 3 (picture) are read but not
            # persisted; reading them still makes a short result fail here.
            _raw_markup, plain_text, _picture = (fetched[1], fetched[2],
                                                 fetched[3])

            # ---------------------------- NEWS -----------------------------
            # Dictionary keys are database column names, values are what
            # gets stored.
            created_at = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            news_fields = {
                # body text with markup stripped
                news_cols.text_not_format.key: plain_text,
                # article publication date
                news_cols.subscribe_time.key: bundle['publicTime'],
                # creation time of this record
                news_cols.create_date.key: created_at,
                # summary
                news_cols.subject.key: bundle['summary'],
                news_cols.valid.key: 1,
                news_cols.author.key: bundle['author'],
                # article title
                news_cols.title.key: bundle['title'],
                news_cols.status.key: 2,
                # sort code
                news_cols.order_code.key: order + 5,
            }

            # --------------------------- ARTICLE ---------------------------
            article_fields = {
                # link to the article body
                article_cols.content_url.key: bundle['url'],
                # fingerprint generated from the url
                article_cols.fingerprint.key: md5(bundle['url']),
                # organisation the article belongs to
                article_cols.company.key: target.data_key,
                article_cols.target_id.key: target.id,
                # read count
                article_cols.raw_click_count.key: bundle['clicksCount'],
                # like count
                article_cols.vote_up_count.key: bundle['likeCount'],
                # article type, fixed to 'wechat' here
                article_cols.scrabble_type.key: 'wechat',
                # database flag: this row is scraped data
                article_cols.is_scrabbled.key: 1,
                article_cols.publishStatus.key: 1,
            }

            # ---------------------------- MERGE ----------------------------
            # News fields go in first, so article values win on a key clash.
            for fields in (news_fields, article_fields):
                record.update(fields)
        except Exception:
            import traceback
            LogGo.warning(traceback.format_exc())

            return (-1, None)

        return (1, record)
Exemplo n.º 30
0
    LogGo.init(Configs())
    RequestHelper.init(Configs())
    SMTPServer.init(Configs())
    Download(Configs())

    RequestHelperClassVer.init(Configs())
    ProxyHelper.init(Configs())
    MysqlHelper.init(Configs())
    BaseStrategy.init()


""" 程序入口 """
if __name__ == "__main__":
    base_init()

    LogGo.info('PID: ' + str(os.getpid()))
    FileHelper.record_pid(str(os.getpid()))
    LogGo.info("-- Xenoblade Online --")
    print(" ")
    print("---------Xenoblade v0.2(Super Beta Version)-----------")
    print(" ")
    print("dependencies:")
    print("APScheduler==3.3.1\r\n"
          "beautifulsoup4==4.5.3\r\n"
          "bosonnlp==0.8.0\r\n"
          "click==6.7\r\n"
          "crypto==1.4.1\r\n"
          "demjson==2.2.4\r\n"
          "Flask==0.12\r\n"
          "HTMLParser==0.0.2\r\n"
          "httplib2==0.10.3\r\n"