Example #1
    def load_definitions(self, filename):
        print '\nLoading page definitions... (in Mongo)'

        mongo = MongoDB()

        if self.progress:
            num_lines = sum(1 for line in open(filename) if '<page' in line)

        elementnr = 1
        pipe = mongo.pipeline(transaction=False)

        for pageid, title, markup in wikidumps.extract_pages(filename):
            elementnr += 1
            if self.progress and elementnr % 10 == 0:
                cli_progress(elementnr, num_lines)
            try:
                if markup is not None:
                    definition = generate_markup_definition(markup)
                else:
                    definition = ''
                pipe.set(self.ns.page_definition(str(pageid)), definition)
                if len(pipe) >= 100:
                    pipe.execute()
            except Exception, e:
                print "Error loading on element: ", elementnr, str(e)
                continue
Example #2
    def __init__(self, source_table, aim_table, key_map, unique_key=None):
        '''
        @summary: initialize
        ---------
        @param source_table: source table
        @param aim_table:    target table
        @param key_map:      mapping from target-table keys to source-table keys
        eg: key_map = {
            'aim_key1' : 'str_source_key2',          # target key = value of the source key, as str
            'aim_key2' : 'int_source_key3',          # target key = value of the source key, as int
            'aim_key3' : 'date_source_key4',         # target key = value of the source key, as date
            'aim_key4' : 'vint_id',                  # target key = literal value, as int
            'aim_key5' : 'vstr_name',                # target key = literal value, as str
            'aim_key6' : 'sint_select id from xxx'   # target key = result of the SQL query, as int
            'aim_key7' : 'sstr_select name from xxx' # target key = result of the SQL query, as str
        }

        @param unique_key:    unique key; the target database deduplicates on it
        ---------
        @result:
        '''

        super(ExportData, self).__init__()

        self._source_table = source_table
        self._aim_table = aim_table
        self._key_map = key_map
        self._unique_key = unique_key

        self._mongodb = MongoDB()

        self._is_oracle = False
        self._export_count = 0
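
The key_map convention above packs both the value source and the target column type into a prefix: str_/int_/date_ copy the value of a source key, vint_/vstr_ use a literal value, and sint_/sstr_ take the result of a SQL query. A minimal, hypothetical instantiation following that convention (the table names and keys below are invented for illustration):

example_key_map = {
    'title': 'str_title',                 # copy the source 'title' field as a str
    'publish_time': 'date_release_time',  # copy the source 'release_time' field as a date
    'site_id': 'vint_12',                 # literal int value 12
    'category_id': "sint_select id from t_category where name = 'news'",  # value from a SQL query
}

export = ExportData(source_table='news_content_info',
                    aim_table='tab_news_info',
                    key_map=example_key_map,
                    unique_key='url')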
Example #3
    def __init__(self, tab_urls):
        super(Collector, self).__init__()
        self._lock = threading.RLock()

        self._db = MongoDB()
        self._thread_stop = False
        self._urls = []
        self._null_times = 0
        self._read_pos = -1
        self._write_pos = -1
        self._tab_urls = tab_urls
        self._depth = int(
            tools.get_conf_value('config.conf', "collector", "depth"))
        self._max_size = int(
            tools.get_conf_value('config.conf', "collector", "max_size"))
        self._interval = int(
            tools.get_conf_value('config.conf', "collector", "sleep_time"))
        self._allowed_null_times = int(
            tools.get_conf_value('config.conf', "collector",
                                 'allowed_null_times'))
        self._url_count = int(
            tools.get_conf_value('config.conf', "collector", "url_count"))

        # On startup, reset tasks that were left in DOING status back to TODO
        self._db.update(self._tab_urls, {'status': Constance.DOING},
                        {'status': Constance.TODO})

        self._finished_callback = None
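
The Collector reads all of its tuning values from config.conf via tools.get_conf_value(file, section, key). A plausible sketch of that helper, assuming config.conf is an ordinary INI file (this helper and the sample values are assumptions, not the project's actual implementation):

try:
    import configparser                      # Python 3
except ImportError:
    import ConfigParser as configparser      # Python 2

def get_conf_value(config_file, section, key):
    # read a single value from an INI-style config file
    parser = configparser.ConfigParser()
    parser.read(config_file)
    return parser.get(section, key)

# config.conf would then contain something like:
# [collector]
# depth = 3
# max_size = 1000
# sleep_time = 2
# allowed_null_times = 5
# url_count = 100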
Example #5
    def export_to_oracle(self,
                         source_table='',
                         aim_table='',
                         key_map='',
                         unique_key=None,
                         unique_key_mapping_source_key=None,
                         update_read_status=True,
                         condition={'read_status': 0},
                         datas=[],
                         callback='',
                         sync_to_es=False):
        if aim_table:
            if self._aim_table != aim_table:
                self._is_set_unique_key = False
                self._es = ES() if sync_to_es else ''
                self._mongodb = MongoDB() if source_table else ''

            self._source_table = source_table
            self._aim_table = aim_table
            self._key_map = key_map
            self._unique_key = unique_key
            self._export_count = 0
            self._update_count = 0
            self._unique_key_mapping_source_key = unique_key_mapping_source_key
            self._update_read_status = update_read_status if not datas else False
            self._condition = condition
            self._datas = datas
            self._callback = callback
            self._sync_to_es = sync_to_es
            self._es = None

        self._aim_db = OracleDB()
        self._is_oracle = True

        return self.__export()
Example #6
def calc_words_dist():
    mongo = MongoDB(host='localhost', port=27017, db_name='mery')
    mongo.load_database()
    articles = mongo.load_collection('articles')
    word_distributions = {}
    categories = [u"ファッション", u"メイク・コスメ", u"ヘアスタイル", u"ネイル", u"美容",
                  u"グルメ", u"旅行・おでかけ", u"恋愛", u"ライフスタイル", ]
    total_words_dist = WordDistribution()
    for category in categories[:]:
        word_distributions[category] = WordDistribution()
        category_articles = articles.find({'category': category})
        print "====== {}: {} ======".format(category.encode('utf-8'), category_articles.count())
        for article in category_articles:
            sp = SentenceParser(article["title"])
            sp.parse()
            nouns = sp.extract_nouns()
            word_distributions[category].update_dist(nouns)  # update word frequencies for this category's documents
            total_words_dist.update_dist(nouns)  # update word frequencies across all documents
        word_distributions[category].calc_total_words_dist()

    dump_object(word_distributions, 'mery_category_word_dist.pkl')
    dump_object(total_words_dist, 'mery_total_word_dist.pkl')

    # remove words whose frequency is in the top 0.1%
    total_words_dist.calc_total_words_dist()
    top_n_percent_words = total_words_dist.extract_top_n_percent_words(n=0.1)
    print "=== removed words ==="
    for word in top_n_percent_words:
        print word.encode('utf-8')
    # print the results
    for category in categories[:]:
        word_distributions[category].remove_words(top_n_percent_words)  # drop the most frequent words
        print "====  {}  =====".format(category.encode('utf-8'))
        for word, count in sorted(word_distributions[category].total_words_dist.items(), key=lambda x: x[1], reverse=True)[:20]:
            print word.encode('utf-8'), count
Example #7
    def __init__(self,
                 source_table='',
                 aim_table='',
                 key_map='',
                 unique_key=None,
                 unique_key_mapping_source_key=None,
                 update_read_status=True,
                 condition={'read_status': 0},
                 datas=[],
                 callback=''):
        '''
        @summary: initialize
        ---------
        @param source_table: source table (in MongoDB)
        @param aim_table:    target table
        @param key_map:      mapping from target-table keys to source-table keys
        eg: key_map = {
            'aim_key1' : 'str_source_key2',          # target key = value of the source key, as str
            'aim_key2' : 'int_source_key3',          # target key = value of the source key, as int
            'aim_key3' : 'date_source_key4',         # target key = value of the source key, as date
            'aim_key4' : 'vint_id',                  # target key = literal value, as int
            'aim_key5' : 'vstr_name',                # target key = literal value, as str
            'aim_key6' : 'vdate_name',               # target key = literal value, as date
            'aim_key7' : 'sint_select id from xxx'   # target key = result of the SQL query, as int
            'aim_key8' : 'sstr_select name from xxx' # target key = result of the SQL query, as str
            'aim_key9' : 'clob_key8'                 # target key = value of the source key, as clob
            'aim_key10' : 'clob_key8'                # target key = value of the source key, as str
        }

        @param unique_key:    unique key; the target database deduplicates on it
        @param unique_key_mapping_source_key: the source-table key that maps to the target table's unique key; when set, rows that already exist in the target table are updated
         eg: unique_key_mapping_source_key = {
            'url':'str_url'                         # target key = value of the source key, as str
         }
        @param condition:    filter for the data to export; defaults to read_status = 0
        @param datas:   data to export, as [{...},{...}] or {}, used to load a JSON array straight into the target table; when empty, data is exported from MongoDB
        @param callback: callback run once per exported batch, as callback(execute_type, sql); execute_type is the execution type (ExportData.INSERT, ExportData.UPDATE, ExportData.EXCEPTION) and sql is the executed statement
        ---------
        @result:
        '''

        super(ExportData, self).__init__()

        self._source_table = source_table
        self._aim_table = aim_table
        self._key_map = key_map
        self._unique_key = unique_key
        self._update_read_status = update_read_status
        self._condition = condition

        self._mongodb = MongoDB() if self._source_table else ''
        self._datas = datas

        self._is_oracle = False
        self._is_set_unique_key = False
        self._export_count = 0
        self._update_count = 0
        self._unique_key_mapping_source_key = unique_key_mapping_source_key
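
One way to read the prefix convention documented above is to split each mapping value into a type tag and a payload (a source key, a literal, or a SQL statement). A small illustrative parser, not the project's implementation:

def split_mapping(value):
    # 'str_source_key2' -> ('str', 'source_key2'); 'vint_401' -> ('vint', '401')
    prefix, _, payload = value.partition('_')
    return prefix, payload

print(split_mapping('date_release_time'))        # ('date', 'release_time')
print(split_mapping('int__id'))                  # ('int', '_id')
print(split_mapping('sint_select id from xxx'))  # ('sint', 'select id from xxx')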
Example #8
    def __init__(self,
                 tab_urls,
                 tab_site='',
                 tab_content='',
                 parser_count=None,
                 parser_params={},
                 begin_callback=None,
                 end_callback=None,
                 content_unique_key='url',
                 delete_tab_urls=False):
        '''
        @summary:
        ---------
        @param tab_urls: name of the url table
        @param tab_site: name of the site table
        @param parser_count: number of parser threads; falls back to the config file when empty
        @param parser_params : parameters passed to the parsers
        @param begin_callback:  callback invoked when the spider starts
        @param end_callback:    callback invoked when the spider finishes
        ---------
        @result:
        '''
        super(Spider, self).__init__()

        self._tab_urls = tab_urls

        self._db = MongoDB()
        if delete_tab_urls: self._db.delete(tab_urls)

        self._db.set_unique_key(tab_urls, 'url')
        if tab_site: self._db.set_unique_key(tab_site, 'site_id')
        if tab_content:
            self._db.set_unique_key(tab_content, content_unique_key)

        # create indexes to speed up queries
        self._db.set_ensure_index(tab_urls, 'depth')
        self._db.set_ensure_index(tab_urls, 'status')
        if tab_site: self._db.set_ensure_index(tab_site, 'read_status')
        if tab_content: self._db.set_ensure_index(tab_content, 'read_status')

        self._collector = Collector(tab_urls)
        self._parsers = []

        self._parser_params = parser_params

        self._begin_callback = begin_callback
        self._end_callabck = end_callback

        self._parser_count = int(
            tools.get_conf_value(
                'config.conf', 'parser',
                'parser_count')) if not parser_count else parser_count
        self._spider_site_name = tools.get_conf_value(
            'config.conf', "spider_site", "spider_site_name").split(',')
        self._except_site_name = tools.get_conf_value(
            'config.conf', "spider_site", "except_site_name").split(',')
Example #9
    def __init__(self,
                 tab_urls,
                 tab_site,
                 tab_content,
                 parser_count=None,
                 search_keyword1=[],
                 search_keyword2=[],
                 search_keyword3=[],
                 begin_callback=None,
                 end_callback=None,
                 content_unique_key=None):
        '''
        @summary:
        ---------
        @param tab_urls: name of the url table
        @param tab_site: name of the site table
        @param parser_count: number of parser threads; falls back to the config file when empty
        @param search_keyword1: search keywords (list); every keyword must be present
        @param search_keyword2: search keywords (list); at least one must be present
        @param search_keyword3: search keywords (list); none may be present
        @param begin_callback:  callback invoked when the spider starts
        @param end_callback:    callback invoked when the spider finishes
        ---------
        @result:
        '''
        super(Spider, self).__init__()

        self._tab_urls = tab_urls

        self._db = MongoDB()
        self._db.set_unique_key(tab_urls, 'url')
        self._db.set_unique_key(tab_site, 'site_id')
        self._db.set_unique_key(
            tab_content,
            'url' if not content_unique_key else content_unique_key)

        self._collector = Collector(tab_urls)
        self._parsers = []

        self._search_keyword1 = search_keyword1
        self._search_keyword2 = search_keyword2
        self._search_keyword3 = search_keyword3

        self._begin_callback = begin_callback
        self._end_callabck = end_callback

        self._parser_count = int(
            tools.get_conf_value(
                'config.conf', 'parser',
                'parser_count')) if not parser_count else parser_count
        self._spider_site_name = tools.get_conf_value(
            'config.conf', "spider_site", "spider_site_name").split(',')
        self._except_site_name = tools.get_conf_value(
            'config.conf', "spider_site", "except_site_name").split(',')
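
The three keyword lists have different semantics: every keyword in search_keyword1 must appear, at least one keyword in search_keyword2 must appear, and no keyword in search_keyword3 may appear. A small sketch of that rule (illustrative only; the project's parsers do their own matching):

def keyword_match(text, must_all, any_of, none_of):
    return (all(k in text for k in must_all)
            and (not any_of or any(k in text for k in any_of))
            and not any(k in text for k in none_of))

print(keyword_match('python mongodb spider', ['python'], ['mongodb', 'redis'], ['java']))  # True
print(keyword_match('python mongodb spider', ['python'], ['mongodb'], ['spider']))         # False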
Example #10
    def __init__(self,
                 tab_list,
                 tab_unique_key_list,
                 tab_ensure_index_list,
                 parser_count=None,
                 site_parsers=None,
                 parser_params={},
                 begin_callback=None,
                 end_callback=None,
                 delete_tab_urls=False):
        '''
        @summary:
        ---------
        @param tab_list: list of table names; the first entry is the url table
        @param tab_unique_key_list: unique key for each table in tab_list
        @param tab_ensure_index_list: index fields for each table in tab_list
        @param parser_count: number of parser threads; falls back to the config file when empty
        @param parser_params : parameters passed to the parsers
        @param begin_callback:  callback invoked when the spider starts
        @param end_callback:    callback invoked when the spider finishes
        ---------
        @result:
        '''
        super(Spider, self).__init__()
        self._db = MongoDB()

        self._tab_urls = tab_list[0]
        if delete_tab_urls: self._db.delete(self._tab_urls)

        self._site_parsers = site_parsers

        for tab_index in range(len(tab_list)):
            self._db.set_unique_key(tab_list[tab_index],
                                    tab_unique_key_list[tab_index])
            # create indexes to speed up queries
            for ensure_index in tab_ensure_index_list[tab_index]:
                self._db.set_ensure_index(tab_list[tab_index], ensure_index)

        self._collector = Collector(self._tab_urls, self._site_parsers)
        self._parsers = []

        self._parser_params = parser_params

        self._begin_callback = begin_callback
        self._end_callabck = end_callback

        self._parser_count = int(
            tools.get_conf_value(
                'config.conf', 'parser',
                'parser_count')) if not parser_count else parser_count
        self._spider_site_name = tools.get_conf_value(
            'config.conf', "spider_site", "spider_site_name").split(',')
        self._except_site_name = tools.get_conf_value(
            'config.conf', "spider_site", "except_site_name").split(',')
Example #11
def main():
    db = MongoDB()

    def begin_callback():
        log.info('\n********** wp begin **********')
        db.delete('WP_urls', {})

    def end_callback():
        # mark the keyword task as done
        log.info('\n********** wp end **********')
        export_data.main()

    # configure the spider
    spider = Spider(tab_urls = 'WP_urls', tab_site = 'WP_site_info', tab_content = 'WP_content_info',
                    parser_count = 20, begin_callback = begin_callback, end_callback = end_callback,
                    content_unique_key = 'title')

    # add parsers
    spider.add_parser(dongmanla_parser)
    # spider.add_parser(zx_novel_parser)
    # spider.add_parser(jisu_cartoon_parser)
    # spider.add_parser(ximalaya_parser)


    spider.start()
Example #12
def main():
    db = MongoDB()
    while True:

        def begin_callback():
            log.info('\n********** proxies begin **********')
            db.delete('proxies_urls')

        def end_callback():
            log.info('\n********** proxies end **********')

            # update task status to done

            # export data
            # export_data = ExportData(source_table = '', aim_table = '', key_map = '', unique_key = '')
            # export_data.export_to_oracle()

        # configure the spider
        spider = Spider(tab_urls='proxies_urls',
                        tab_site='proxies_site_info',
                        tab_content='proxies_content_info',
                        parser_count=1,
                        begin_callback=begin_callback,
                        end_callback=end_callback,
                        parser_params={},
                        content_unique_key='ip')

        # add the parser
        spider.add_parser(gaoni_parser)

        spider.start()

        # time.sleep(60)
        break
Example #13
class KeyWordsController(object):
    _mongodb = MongoDB()

    def __init__(self):
        pass

    def get_keywords_by_sex(self):
        filter = WordsFilter()
        # male tweets with stop words removed
        male_total = filter.get_filter_result(
            self._mongodb.get_all_tweets_with_sex_flag(1))
        # female tweets with stop words removed
        female_total = filter.get_filter_result(
            self._mongodb.get_all_tweets_with_sex_flag(2))

        # male keywords
        maleKeyWordsList = filter.get_keywords_with_tag(male_total, 200)
        # female keywords
        femaleKeyWordsList = filter.get_keywords_with_tag(female_total, 200)

        with codecs.open(
                "C:\Users\chenyx\Desktop\NewDataSet\male_keyword_count.txt",
                "w", "utf-8") as f:
            for item in maleKeyWordsList:
                string = "%s\r\n" % (item)
                f.writelines(string)

        with codecs.open(
                "C:\Users\chenyx\Desktop\NewDataSet\_female_keyword_count.txt",
                "w", "utf-8") as f:
            for item in femaleKeyWordsList:
                string = "%s\r\n" % (item)
                f.writelines(string)
Example #14
def main():
    db = MongoDB()

    def begin_callback():
        log.info('\n********** live_app begin **********')
        db.delete('LiveApp_urls', {})
        db.update('LiveApp_anchor_info', {}, {"live_view": 0})
        db.update('LiveApp_anchor_info', {}, {"watched_count": 0})
        db.update('LiveApp_anchor_info', {}, {'read_status': 0})

    def end_callback():
        # mark the keyword task as done
        log.info('\n********** live_app end **********')
        export_data.main()

    # configure the spider
    spider = Spider(tab_urls='LiveApp_urls',
                    tab_site='LiveApp_site_info',
                    tab_content='LiveApp_anchor_info',
                    parser_count=1,
                    begin_callback=begin_callback,
                    end_callback=end_callback,
                    content_unique_key='room_id')

    # add parsers
    spider.add_parser(inke_parser)
    spider.add_parser(huajiao_parser)
    spider.add_parser(momo_parser)

    spider.start()
Example #15
 def __init__(self):
     super().__init__()
     self.base_url = 'http://www.sosi55.com'
     self.rule = {
         'page_url': '/guochantaotu/list_22_%page.html',
         'page_rule': {
             "list": '.yuanma_downlist_box .pic a'
         },
         'post_rule': {
             'title': '.single h1'
         },
         'base_url': self.base_url,
     }
     self.charset = 'gbk'
     self.table = 'ii_sousi'
     self.cc = OpenCC('t2s')
     self.db = MongoDB(os.environ.get('MONGO'), 'sousi')
Example #16
def crawl_article(dicts):
    """
    :param dicts: 
    """
    for article_dict in dicts:
        sess = requests.Session()
        headers = get_header()
        url = article_dict.get('url')
        print("Start crawling: %s" % url)
        res = sess.get(url, headers=headers)
        selector = etree.HTML(res.text)
        rich_media = selector.xpath(
            "//div[@class='rich_media_inner']/div[@id='page-content']/div[1]/div[2]")[0]
        author = selector.xpath("//div[@id='meta_content']/span[@class='rich_media_meta rich_media_meta_text']")[
            0].xpath(
            "string(.)")
        __biz = url2dict(url).get('__biz', '')
        # body text
        content = rich_media.xpath("string(.)")
        # image urls
        picture_urls = selector.xpath("//img/@data-src")
        # video urls
        video_urls = selector.xpath("//iframe[@class='video_iframe']/@data-src")
        json_info = get_article_info(url)
        # default to 0 so the article dict below never references an undefined name
        like_num, read_num = 0, 0
        if json_info is not None:
            like_num = json_info.get('data', {}).get('zannums', 0)
            read_num = json_info.get('data', {}).get('readnums', 0)
        mongodb = MongoDB()

        article_item = {'title': article_dict.get('title', ""), 'author': author,
                        'summary': article_dict.get('summary', ""),
                        'cover': article_dict.get('cover', ""), 'content': content, 'like_num': like_num,
                        'read_num': read_num,
                        'comment': "", 'url': url, 'receive_time': article_dict.get('receive_time', ""),
                        'account': article_dict.get('account', ""), '__biz': __biz}
        mongodb.add("wechat_article", article_item)
        try:
            download_pictures(dict_info=article_item, picture_urls=picture_urls)
            _thread.start_new_thread(download_videos, (article_item, '', video_urls))
        except:
            print("Failed to download media content")
        sleep(60)
    pass
Example #17
def download(title=None):
    if title is not None:
        zhuishu = ZhuiShuSpider()
        # babadushu = BaBaDuShuSpider()
        # liewen = LieWenSpider()
        mongodb = MongoDB('zhuishu')
        # pass in the spider implementation for whichever site you want to crawl; here the ZhuiShu (追书网) instance is used
        novel = spider.Spider(zhuishu, mongodb)
        novel.search(title)
    else:
        print('Please enter the title of the novel to download')
Example #18
def main():
    db = OracleDB()
    mongodb = MongoDB()

    sql = 'select t.KEYWORD, t.monitor_type from TAB_MVMS_SEARCH_INFO t where sysdate >= t.monitor_start_time and sysdate <= t.monitor_end_time  and search_type = 702'
    result_list = db.find(sql, fetch_one=False)
    if not result_list:
        log.debug('No tasks, exiting')
        return

    parser_params = {'result_list': result_list}
    # parser_params = []
    # for i in result:
    #     parser_params.extend(str(i[0]).split(','))

    def begin_callback():
        log.info('\n********** WWA_weibo_user begin **********')
        mongodb.delete('WWA_weibo_user_urls')

    def end_callback():
        # export data
        key_map = {
            'id': 'int__id',
            'name': 'str_name',
            'sex': 'int_sex',
            'summary': 'str_summary',
            'fans_count': 'int_fans_count',
            'blog_verified': 'str_blog_verified',
            'is_verified': 'int_is_verified',
            'account_url': 'str_url',
            'follow_count': 'int_follow_count',
            'image_url': 'str_image_url',
            'monitor_status': 'vint_401',
            'SEARCH_TYPE' : 'vint_702',
            'region' : 'str_area'
        }

        export = ExportData('WWA_weibo_user_info', 'tab_mvms_weibo_info', key_map, 'account_url')
        export.export_to_oracle()
        log.info('\n********** WWA_weibo_user end **********')

    # configure the spider
    spider = Spider(tab_urls = 'WWA_weibo_user_urls', tab_site = 'WWA_site_info', tab_content = 'WWA_weibo_user_info',
                    parser_count = 1, begin_callback = begin_callback, end_callback = end_callback,
                    parser_params = parser_params)

    # add the parser
    spider.add_parser(weibo_user_parser)
    spider.start()
Example #19
class ImplicateController(object):
    _mongodb = MongoDB()

    def __init__(self):

        pass

    def cal_implicate_degree(self):
        peopleList = self._mongodb.get_information_set_with_sex_flag(-1)
        collection = self._mongodb.getDB()["Implicate"]

        for person in peopleList:
            print "current " + person["_id"]
            # initialize counters
            total, higherThan50, lessThan50 = 0, 0, 0
            # fetch this person's tweets
            tweetsList = self._mongodb.get_tweets_by_id(person["_id"])
            # skip people with no tweets
            if len(tweetsList) == 0:
                continue

            for tweet in tweetsList:
                degree = items.cal_implicit_degree(tweet["Content"])
                if degree > 50:
                    higherThan50 += 1
                else:
                    lessThan50 += 1
                total += degree
            avg = total / float(len(tweetsList))
            sex = ""
            try:
                if person["Gender"]:
                    sex = person["Gender"]
                else:
                    sex = "空"
            except:
                print "no gender tag"

            data = {
                "_id": person["_id"],
                "implicit_degree": avg,
                "tweets_count": len(tweetsList),
                "high_50": higherThan50,
                "less_50": lessThan50,
                "sex": sex
            }
            collection.insert(data)
Example #20
class UtMongoDB(unittest.TestCase):
    '''Unit Test driver for db.mongodb.MongoDB'''

    db = MongoDB('esm')

    def tearDown(self):
        "Delete seed data from testing database"
        try:
            connection = Connection(host="localhost", port=27017)
        except ConnectionFailure, e:
            sys.stderr.write("Could not connect to MongoDB: %s" % e)
            sys.exit(1)

        db_handler = connection["esm"]
        assert db_handler.connection == connection
        db_handler['justniffer_events'].drop()
        connection.end_request()
Example #21
def main():
    db = MongoDB()

    def begin_callback():
        log.info('\n********** template begin **********')
        db.delete('op_urls', {})
        db.delete('op_content_info', {})

    def end_callback():
        log.info('\n********** template end **********')

        # update task status to done

        # export data
        # export_data = ExportData(source_table = '', aim_table = '', key_map = '', unique_key = '')
        # export_data.export_to_oracle()

    # configure the spider
    spider = Spider(tab_urls='op_urls',
                    tab_site='op_site_info',
                    tab_content='op_content_info',
                    parser_count=20,
                    begin_callback=begin_callback,
                    end_callback=end_callback,
                    parser_params={})

    # add parsers
    spider.add_parser(luzhou_parser)
    spider.add_parser(longmatan_parser)
    spider.add_parser(naxi_parser)
    spider.add_parser(luxian_parser)
    spider.add_parser(hejiang_parser)
    spider.add_parser(gulin_parser)
    spider.add_parser(luzhouzhiye_parser)
    spider.add_parser(sichuanhuagong_parser)
    spider.add_parser(luzhougaozhong_parser)
    spider.add_parser(xuyong_parser)
    spider.add_parser(jiangyang_parser)
    spider.add_parser(luzhoutianli_parser)
    spider.add_parser(sichuanluxian_parser)
    spider.add_parser(sichuan_police_parser)
    spider.add_parser(sichuanyikeda_parser)
    spider.add_parser(luzhoubaidu_parser)
    spider.start()
Example #22
def main():
    db = MongoDB()
    oracle = OracleDB()

    def begin_callback():
        #db.update('WWA_app_urls',{'depth':0}, {'status':0})
        db.delete('WWA_search_app_urls')
        log.info('\n********** wwa begin **********')

    def end_callback():
        log.info('\n********** wwa end **********')
        export_data.main()

    keywords = []

    result_list = oracle.find(
        'select keyword from TAB_MVMS_SEARCH_INFO where  MONITOR_START_TIME <= sysdate AND MONITOR_END_TIME >= sysdate and search_type=703'
    )
    if not result_list:
        log.debug('No tasks, exiting')
        return

    keywords = []
    for result in result_list:
        keywords.extend(result[0].split(','))

    parser_params = {'keywords': keywords}

    # configure the spider
    spider = Spider(tab_urls='WWA_search_app_urls',
                    tab_site='WWA_search_app_site_info',
                    tab_content='WWA_search_app_content_info',
                    content_unique_key='title',
                    begin_callback=begin_callback,
                    end_callback=end_callback,
                    parser_params=parser_params)

    # add parsers
    spider.add_parser(yingyongbao_parser)
    spider.add_parser(android_market_parser)
    spider.add_parser(baidu_mobile_assistant_parser)
    spider.add_parser(mobile360_assistant_parser)
    spider.start()
Example #23
class RecipeRepository():
    recipe_collection = MongoDB.get_collection('HelloFresh', 'Recipes')
    '''
    Provides access to the recipes collection
    '''
    @classmethod
    async def create_recipe(cls, new_recipe: RecipeRequest) -> str:
        recipe_doc = new_recipe.dict(exclude={'_id'})
        result = await cls.recipe_collection.insert_one(recipe_doc)
        if not result.acknowledged:
            # TODO:  define custom exception
            raise Exception('Database did not acknowledge')
        return str(result.inserted_id)

    @classmethod
    async def read_recipe_by_id(cls, recipe_id) -> RecipeDB:
        recipe_document = await cls.recipe_collection.find_one({"_id": recipe_id})
        if recipe_document is None:
            raise ValueError('Invalid Recipe ID')
        return RecipeDB.parse_obj(recipe_document)

    @classmethod
    async def update_recipe(cls, recipe_id: str, new_recipe: RecipeRequest) -> str:
        recipe_doc = new_recipe.dict(exclude={'_id', 'id'}, by_alias=True) 
        result = await cls.recipe_collection.replace_one({"_id": ObjectId(recipe_id)}, recipe_doc)
        if not result.acknowledged:
            # TODO:  define custom exception
            raise Exception('Database did not acknowledge')
        elif result.modified_count != 1:
            raise Exception('Document was not updated')
        return recipe_id

    @classmethod
    async def delete_recipe(cls, recipe_id: str):
        result = await cls.recipe_collection.delete_one({"_id": ObjectId(recipe_id)})
        if not result.acknowledged:
            # TODO:  define custom exception
            raise Exception('Database did not acknowledge')
        elif result.deleted_count != 1:
            raise Exception('Document was not deleted')
        return
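
A hypothetical call site for the repository above, assuming RecipeRequest is a Pydantic model defined elsewhere in the project (the field names passed to it here are invented for illustration):

import asyncio

async def demo():
    # create a recipe, then remove it again
    recipe_id = await RecipeRepository.create_recipe(
        RecipeRequest(name='Pancakes', servings=2))
    print('created', recipe_id)
    await RecipeRepository.delete_recipe(recipe_id)

asyncio.run(demo())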
Example #24
class UtMongoQuery(unittest.TestCase):
    '''complex query test driver'''
    
    logger = Logger().getLogger("test.UtMongoQuery")
    db = MongoDB('esm')
    
    
    def setUp(self):
#        self.logger.debug('UtMongoQuery.setUp()')
        current_path = os.getcwd()
        self.runBash(current_path+'/import.sh')

    def tearDown(self):
#        self.logger.debug('UtMongoQuery.tearDown()')
        self.db.removeAll('events')
    
    def runBash(self,file_name):
        '''run a shell script.'''
        try:
            subprocess.call([file_name], shell=True)
        except OSError, e:
            self.logger.exception("bash script " + file_name + " execution failed: " + str(e))
Example #25
def main():
    db = MongoDB()
    db.set_unique_key('WWA_app_vioation_content_info', 'url')
    db.set_ensure_index('WWA_app_vioation_content_info', 'read_status')

    def begin_callback():
        log.info('\n********** WWA_APP begin **********')
        db.delete('WWA_app_urls', {})

    def end_callback():
        export_data.main()
        log.info('\n********** WWA_APP end **********')


    # configure the spider
    spider = Spider(tab_urls = 'WWA_app_urls', tab_site = 'WWA_app_site_info', tab_content = 'WWA_app_content_info', parser_count = 1, begin_callback = begin_callback, end_callback = end_callback, parser_params = {})

    # add parsers
    spider.add_parser(headline_parser)
    spider.add_parser(kuaibao_parser)

    spider.start()
Example #26
def main():
    db = OracleDB()
    mongodb = MongoDB()

    sql = 'select t.ID, t.monitor_type from TAB_MVMS_WEIBO_INFO t where monitor_status = 402'
    result_list = db.find(sql, fetch_one=False)
    if not result_list:
        log.debug('No tasks, exiting')
        return

    parser_params = result_list

    # for i in result:
    #     parser_params.extend(str(i[0]).split(','))

    def begin_callback():
        log.info('\n********** WWA_weibo_info begin **********')
        mongodb.delete('WWA_weibo_info_urls')

    def end_callback():
        # export data
        key_map = {
            'id': 'int__id',
            'release_time': 'date_release_time',
            'come_from': 'str_come_from',
            'content': 'clob_content',
            'image_url': 'str_image_url',
            'video_url': 'str_video_url',
            'transpond_count': 'int_transpond_count',
            'praise_count': 'int_praise_count',
            'check_status': 'vint_301',
            'weibo_id': 'int_weibo_id',
            'article_url': 'str_url',
            'violate_status': 'int_violate_id',
            'sensitive_id': 'int_sensitive_id',
            'record_time': 'date_record_time',
            'SEXY_IMAGE_STATUS': 'str_sexy_image_status'
        }

        export = ExportData('WWA_weibo_info_info',
                            'tab_mvms_weibo_article_info',
                            key_map,
                            unique_key='ARTICLE_url',
                            condition={
                                'read_status': 0,
                                "image_pron_status": 2
                            })
        export.export_to_oracle()
        log.info('\n********** WWA_weibo_info end **********')

    # configure the spider
    spider = Spider(tab_urls='WWA_weibo_info_urls',
                    tab_site='WWA_site_info',
                    tab_content='WWA_weibo_info_info',
                    parser_count=1,
                    begin_callback=begin_callback,
                    end_callback=end_callback,
                    parser_params=parser_params)

    # add the parser
    spider.add_parser(weibo_info_parser)
    spider.start()
Example #27
 def begin_callback():
     log.info('\n********** WWA_wechat_article begin **********')
     db = MongoDB()
     db.delete('WWA_wechat_article_url', {})
Example #28
class Spider(threading.Thread):
    def __init__(self,
                 tab_list,
                 tab_unique_key_list,
                 tab_ensure_index_list,
                 parser_count=None,
                 site_parsers=None,
                 parser_params={},
                 begin_callback=None,
                 end_callback=None):
        '''
        @summary:
        ---------
        @param tab_list: list of table names; the first entry is the url table
        @param tab_unique_key_list: unique key for each table in tab_list
        @param tab_ensure_index_list: index fields for each table in tab_list
        @param parser_count: number of parser threads; falls back to the config file when empty
        @param parser_params : parameters passed to the parsers
        @param begin_callback:  callback invoked when the spider starts
        @param end_callback:    callback invoked when the spider finishes
        ---------
        @result:
        '''
        super(Spider, self).__init__()

        self._tab_urls = tab_list[0]
        self._site_parsers = site_parsers

        self._db = MongoDB()
        for tab_index in range(len(tab_list)):
            self._db.set_unique_key(tab_list[tab_index],
                                    tab_unique_key_list[tab_index])
            # create indexes to speed up queries
            for ensure_index in tab_ensure_index_list[tab_index]:
                self._db.set_ensure_index(tab_list[tab_index], ensure_index)

        self._collector = Collector(self._tab_urls, self._site_parsers)
        self._parsers = []

        self._parser_params = parser_params

        self._begin_callback = begin_callback
        self._end_callabck = end_callback

        self._parser_count = int(
            tools.get_conf_value(
                'config.conf', 'parser',
                'parser_count')) if not parser_count else parser_count
        self._spider_site_name = tools.get_conf_value(
            'config.conf', "spider_site", "spider_site_name").split(',')
        self._except_site_name = tools.get_conf_value(
            'config.conf', "spider_site", "except_site_name").split(',')

    def add_parser(self, parser):
        if self._spider_site_name[0] == 'all':
            for except_site_name in self._except_site_name:
                if parser.NAME != except_site_name.strip():
                    self._parsers.append(parser)
        else:
            for spider_site_name in self._spider_site_name:
                if parser.NAME == spider_site_name.strip():
                    self._parsers.append(parser)

    def run(self):
        self.__start()

    def __start(self):
        if self._begin_callback:
            self._begin_callback()

        if not self._parsers:
            if self._end_callabck:
                self._end_callabck()
            return

        # run each parser's add_site_info and add_root_url
        # print(self._parser_params)
        for parser in self._parsers:
            parser.add_site_info()
            parser.add_root_url(self._parser_params)
        print('Finished adding root urls')

        # start the collector
        self._collector.add_finished_callback(self._end_callabck)
        self._collector.start()

        # start the parser controls
        while self._parser_count:
            parser_control = PaserControl(self._collector, self._tab_urls)

            for parser in self._parsers:
                parser_control.add_parser(parser)

            parser_control.start()
            self._parser_count -= 1
Example #29
class Collector(threading.Thread):
    def __init__(self, tab_urls):
        super(Collector, self).__init__()
        self._lock = threading.RLock()

        self._db = MongoDB()
        self._thread_stop = False
        self._urls = []
        self._null_times = 0
        self._read_pos = -1
        self._write_pos = -1
        self._tab_urls = tab_urls
        self._depth = int(
            tools.get_conf_value('config.conf', "collector", "depth"))
        self._max_size = int(
            tools.get_conf_value('config.conf', "collector", "max_size"))
        self._interval = int(
            tools.get_conf_value('config.conf', "collector", "sleep_time"))
        self._allowed_null_times = int(
            tools.get_conf_value('config.conf', "collector",
                                 'allowed_null_times'))
        self._url_count = int(
            tools.get_conf_value('config.conf', "collector", "url_count"))

        # On startup, reset tasks that were left in DOING status back to TODO
        self._db.update(self._tab_urls, {'status': Constance.DOING},
                        {'status': Constance.TODO})

        self._finished_callback = None

    def run(self):
        while not self._thread_stop:
            self.__input_data()
            time.sleep(self._interval)

    def stop(self):
        self._thread_stop = True

        if self._finished_callback:
            self._finished_callback()

    @tools.log_function_time
    def __input_data(self):
        log.debug('read_pos %d, write_pos %d buffer size %d' %
                  (self._read_pos, self._write_pos, self.get_max_read_size()))
        log.debug('buffer can write size = %d' % self.get_max_write_size())
        if self.get_max_write_size() == 0:
            log.debug("collector is full, size = %d" % self.get_max_read_size())
            return

        url_count = self._url_count if self._url_count <= self.get_max_write_size(
        ) else self.get_max_write_size()

        urls_list = []
        if self._depth:
            urls_list = self._db.find(self._tab_urls, {
                "status": Constance.TODO,
                "depth": {
                    "$lte": self._depth
                }
            },
                                      limit=url_count)
        else:
            urls_list = self._db.find(self._tab_urls,
                                      {"status": Constance.TODO},
                                      limit=url_count)

        # mark the fetched urls as DOING
        for url in urls_list:
            self._db.update(self._tab_urls, url, {'status': Constance.DOING})

        # buffer the urls
        self.put_urls(urls_list)

        if self.is_all_have_done():
            self.stop()

    def is_finished(self):
        return self._thread_stop

    def add_finished_callback(self, callback):
        self._finished_callback = callback

    # no urls left to process
    def is_all_have_done(self):
        if self.get_max_read_size() == 0:
            self._null_times += 1
            if self._null_times >= self._allowed_null_times:
                # check whether the database still has urls in DOING status
                urls_doing = self._db.find(self._tab_urls,
                                           {'status': Constance.DOING})
                if urls_doing:
                    self._null_times = 0
                    return False
                else:
                    return True
        else:
            self._null_times = 0
            return False

    def get_max_write_size(self):
        size = 0
        if self._read_pos == self._write_pos:
            size = self._max_size
        elif self._read_pos < self._write_pos:
            size = self._max_size - (self._write_pos - self._read_pos)
        else:
            size = self._read_pos - self._write_pos

        return size - 1

    def get_max_read_size(self):
        return self._max_size - 1 - self.get_max_write_size()

    @tools.log_function_time
    def put_urls(self, urls_list):
        if urls_list == []:
            return

        # append the urls to _urls
        url_count = len((urls_list))
        end_pos = url_count + self._write_pos + 1
        # check whether the write overflows the buffer; any overflow wraps around to the start
        # overflowing part
        overflow_end_pos = end_pos - self._max_size
        # part that fits without wrapping
        in_pos = end_pos if end_pos <= self._max_size else self._max_size

        # number of urls that fit without wrapping
        urls_listCutPos = in_pos - self._write_pos - 1

        self._lock.acquire()  # acquire the lock

        self._urls[self._write_pos + 1:in_pos] = urls_list[:urls_listCutPos]
        if overflow_end_pos > 0:
            self._urls[:overflow_end_pos] = urls_list[urls_listCutPos:]

        self._lock.release()

        self._write_pos += url_count
        self._write_pos %= self._max_size  # note: -1 % 1000 = 999 would leave the writable size at 0; put_urls returning early for an empty urls_list avoids that case

    @tools.log_function_time
    def get_urls(self, count):
        self._lock.acquire()  # acquire the lock
        urls = []

        count = count if count <= self.get_max_read_size(
        ) else self.get_max_read_size()
        end_pos = self._read_pos + count + 1
        if end_pos > self._max_size:
            urls.extend(self._urls[self._read_pos + 1:])
            urls.extend(self._urls[:end_pos % self._max_size])
        else:
            urls.extend(self._urls[self._read_pos + 1:end_pos])

        if urls:
            self._read_pos += len(urls)
            self._read_pos %= self._max_size

        self._lock.release()

        return urls
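
get_max_write_size and get_max_read_size above implement the usual one-slot-free circular buffer bookkeeping: read_pos == write_pos means the buffer is empty, and one slot is always left unused so the full and empty states stay distinguishable. A standalone rendering of that arithmetic:

max_size = 8

def writable(read_pos, write_pos):
    if read_pos == write_pos:
        size = max_size
    elif read_pos < write_pos:
        size = max_size - (write_pos - read_pos)
    else:
        size = read_pos - write_pos
    return size - 1          # one slot always stays free

def readable(read_pos, write_pos):
    return max_size - 1 - writable(read_pos, write_pos)

print(writable(-1, -1), readable(-1, -1))  # 7 0  -> empty buffer, room for 7 urls
print(writable(-1, 2), readable(-1, 2))    # 4 3  -> 3 urls queued, room for 4 more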
Example #30
class Spider(threading.Thread):
    def __init__(self,
                 tab_urls,
                 tab_site,
                 tab_content,
                 parser_count=None,
                 parser_params={},
                 begin_callback=None,
                 end_callback=None,
                 content_unique_key='url'):
        '''
        @summary:
        ---------
        @param tab_urls: name of the url table
        @param tab_site: name of the site table
        @param parser_count: number of parser threads; falls back to the config file when empty
        @param parser_params : parameters passed to the parsers
        @param begin_callback:  callback invoked when the spider starts
        @param end_callback:    callback invoked when the spider finishes
        ---------
        @result:
        '''
        super(Spider, self).__init__()

        self._tab_urls = tab_urls

        self._db = MongoDB()
        self._db.set_unique_key(tab_urls, 'url')
        self._db.set_unique_key(tab_site, 'site_id')
        self._db.set_unique_key(tab_content, content_unique_key)

        # create indexes to speed up queries
        self._db.set_ensure_index(tab_urls, 'depth')
        self._db.set_ensure_index(tab_urls, 'status')
        self._db.set_ensure_index(tab_site, 'read_status')
        self._db.set_ensure_index(tab_content, 'read_status')

        self._collector = Collector(tab_urls)
        self._parsers = []

        self._parser_params = parser_params

        self._begin_callback = begin_callback
        self._end_callabck = end_callback

        self._parser_count = int(
            tools.get_conf_value(
                'config.conf', 'parser',
                'parser_count')) if not parser_count else parser_count
        self._spider_site_name = tools.get_conf_value(
            'config.conf', "spider_site", "spider_site_name").split(',')
        self._except_site_name = tools.get_conf_value(
            'config.conf', "spider_site", "except_site_name").split(',')

    def add_parser(self, parser):
        if self._spider_site_name[0] == 'all':
            for except_site_name in self._except_site_name:
                if parser.NAME != except_site_name.strip():
                    self._parsers.append(parser)
        else:
            for spider_site_name in self._spider_site_name:
                if parser.NAME == spider_site_name.strip():
                    self._parsers.append(parser)

    def run(self):
        self.__start()

    def __start(self):
        if self._begin_callback:
            self._begin_callback()

        if not self._parsers:
            if self._end_callabck:
                self._end_callabck()
            return

        # start the collector
        self._collector.add_finished_callback(self._end_callabck)
        self._collector.start()
        # run each parser's add_site_info and add_root_url
        #print(self._parser_params)
        for parser in self._parsers:
            threading.Thread(target=parser.add_site_info).start()
            threading.Thread(target=parser.add_root_url,
                             args=(self._parser_params, )).start()
        # start the parser controls
        while self._parser_count:
            parser_control = PaserControl(self._collector, self._tab_urls)

            for parser in self._parsers:
                parser_control.add_parser(parser)

            parser_control.start()
            self._parser_count -= 1
Example #31
import base.constance as Constance
import utils.tools as tools
from utils.log import log
from db.mongodb import MongoDB

db = MongoDB()


def remove_table(tab_list):
    for tab in tab_list:
        db.delete(tab)


def reset_table(tab_list):
    for tab in tab_list:
        db.update(tab, {'status': 3}, {'status': 0})


def add_url(table,
            site_id='',
            url='',
            depth=0,
            remark='',
            status=Constance.TODO,
            title='',
            origin='',
            domain='',
            retrieval_layer=0,
            image_url='',
            release_time=''):
    url_dict = {
Example #32
 def begin_callback():
     log.info('\n********** WWA_wechat_account begin **********')
     db = MongoDB()
     db.delete('WWA_wechat_account_url', {})