예제 #1
0
    def __op_new(self, data_loaded, connection):
        """Handle a new item: extract it right away or park it in the queue.

        The item named by ``data_loaded['id']`` is loaded from
        ``config.path_extractor_inbox`` and
        ``db.get_url_with_same_layout_hash`` is queried with its layout hash.
        When the largest value in the returned mapping exceeds
        ``config.extractor_same_layout_number``, the matching key is turned
        into an extractor and the item is extracted immediately. Otherwise
        the item is stored in the pending queue with
        ``config.const_RULE_UNKNOW`` as its extractor.

        Args:
            data_loaded: mapping with an ``'id'`` entry convertible to int.
            connection: not used by this method.
        """
        item_id = int(data_loaded['id'])
        item = UrlItem.load_with_content(
            id=item_id, file_path=config.path_extractor_inbox)

        # NOTE(review): maps is presumably {extractor name: count} -- the code
        # only relies on keys being accepted by tool.str2extractor and values
        # being comparable with the configured threshold.
        count, maps = db.get_url_with_same_layout_hash(item['layout_hash'])
        log.info(str(maps))
        log.info(count)
        if maps:
            import operator

            # items() instead of the Python 2-only iteritems(): identical
            # result on Python 2, and no AttributeError on Python 3.
            tar_ext = max(maps.items(), key=operator.itemgetter(1))
            # Largest value divided by the number of entries in maps.
            log.info(float(tar_ext[1]) / len(maps))
            if tar_ext[1] > config.extractor_same_layout_number:
                extractor = tool.str2extractor(tar_ext[0])
                self.__extract(item, extractor)
                return

        # No sufficiently common extractor: queue the item for a later pass.
        extractor = config.const_RULE_UNKNOW

        self.__ext_queue[item_id] = {
            "title": item['title'],
            "url": item['url'],
            "filename": item.filename(),
            "decision": item['is_target'],
            "extractor": extractor
        }

        log.info("[%s]: # %s " % (item_id, extractor))
예제 #2
0
    def __op_new(self, data_loaded, connection):
        """Process a newly received item id.

        Loads the item from ``config.path_extractor_inbox``, asks the
        database for the mapping associated with the item's layout hash, and
        picks the entry with the largest value. If that value is greater than
        ``config.extractor_same_layout_number`` the item is extracted with
        the extractor built from the entry's key and the method returns.
        In every other case the item is recorded in the pending queue with
        ``config.const_RULE_UNKNOW``.

        Args:
            data_loaded: mapping with an ``'id'`` entry convertible to int.
            connection: not used by this method.
        """
        item_id = int(data_loaded['id'])
        item = UrlItem.load_with_content(id=item_id,
                                         file_path=config.path_extractor_inbox)

        count, maps = db.get_url_with_same_layout_hash(item['layout_hash'])
        log.info(str(maps))
        log.info(count)
        if maps:
            import operator

            # dict.iteritems() exists only on Python 2; items() yields the
            # same (key, value) pairs on both Python 2 and Python 3.
            best_key, best_value = max(maps.items(),
                                       key=operator.itemgetter(1))
            # Largest value divided by the number of entries in maps.
            log.info(float(best_value) / len(maps))
            if best_value > config.extractor_same_layout_number:
                extractor = tool.str2extractor(best_key)
                self.__extract(item, extractor)
                return

        # Fall through: nothing qualified, so remember the item for later.
        extractor = config.const_RULE_UNKNOW

        self.__ext_queue[item_id] = {
            "title": item['title'],
            "url": item['url'],
            "filename": item.filename(),
            "decision": item['is_target'],
            "extractor": extractor
        }

        log.info("[%s]: # %s " % (item_id, extractor))
예제 #3
0
 def __op_refresh(self, data_loaded, connection):
     """Re-check every queued item and extract those that now qualify.

     For each key in the pending queue the item is reloaded from
     ``config.path_extractor_inbox`` and the layout-hash mapping is queried
     again. When the largest value in that mapping exceeds
     ``config.extractor_same_layout_number`` the item is extracted and its
     queue entry is removed after the loop.

     Args:
         data_loaded: not used by this method.
         connection: not used by this method.
     """
     # Loop-invariant, so import once instead of on every iteration.
     import operator

     finished_keys = []
     # Only keys are needed; iterating the dict directly also avoids the
     # Python 2-only iteritems().
     for key in self.__ext_queue:
         item_id = int(key)
         item = UrlItem.load_with_content(
             id=item_id, file_path=config.path_extractor_inbox)
         count, maps = db.get_url_with_same_layout_hash(item['layout_hash'])
         log.info(str(maps))
         log.info(count)
         if not maps:
             continue
         tar_ext = max(maps.items(), key=operator.itemgetter(1))
         # Largest value divided by the number of entries in maps.
         log.info(float(tar_ext[1]) / len(maps))
         if tar_ext[1] > config.extractor_same_layout_number:
             extractor = tool.str2extractor(tar_ext[0])
             self.__extract(item, extractor)
             # Remember the key exactly as stored so the deletion below
             # cannot miss when the key is not already an int.
             finished_keys.append(key)
     # Delete after the loop: the dict must not change size while iterated.
     for key in finished_keys:
         del self.__ext_queue[key]
예제 #4
0
 def load(id=None, url=None):
     """Build a UrlItem from the database row found by id or by url.

     Args:
         id: identifier passed to ``db.get_url_by_id``; used whenever it is
             not None, even if ``url`` is also given.
         url: value passed to ``db.get_url_by_url`` when ``id`` is None.

     Returns:
         A populated UrlItem, or None when the database lookup returns None.

     Raises:
         Exception: if both ``id`` and ``url`` are None.
     """
     item = UrlItem()
     if id is None and url is None:
         raise Exception("must provide id or url")

     row = db.get_url_by_id(id) if id is not None else db.get_url_by_url(url)
     if row is None:
         return None

     # Columns are copied in a fixed order; 'extractor' is the only one that
     # goes through a conversion on the way.
     for column in ('id', 'url', 'is_target', 'content_hash', 'layout_hash'):
         item[column] = row[column]
     item['extractor'] = tool.str2extractor(row['extractor'])
     for column in ('last_access_ts', 'last_extract_ts',
                    'title', 'content_type'):
         item[column] = row[column]
     return item
예제 #5
0
 def __op_refresh(self, data_loaded, connection):
     """Re-check queued items; drop those extracted or missing on disk.

     For each key in the pending queue:

     * if ``<config.path_extractor_inbox>/<id>.html`` is not a file, the
       entry is scheduled for removal without further work;
     * otherwise the item is reloaded, the layout-hash mapping is queried,
       and when its largest value exceeds
       ``config.extractor_same_layout_number`` the item is extracted and
       the entry is scheduled for removal.

     Scheduled entries are deleted once the loop has finished.

     Args:
         data_loaded: not used by this method.
         connection: not used by this method.
     """
     # Loop-invariant, so import once instead of on every iteration.
     import operator

     stale_keys = []
     # Only keys are needed; iterating the dict directly also avoids the
     # Python 2-only iteritems().
     for key in self.__ext_queue:
         item_id = int(key)
         if not os.path.isfile(
                 config.path_extractor_inbox+"/"+str(item_id)+".html"):
             stale_keys.append(key)
             continue
         item = UrlItem.load_with_content(
             id=item_id, file_path=config.path_extractor_inbox)
         count, maps = db.get_url_with_same_layout_hash(item['layout_hash'])
         log.info(str(maps))
         log.info(count)
         if not maps:
             continue
         tar_ext = max(maps.items(), key=operator.itemgetter(1))
         # Largest value divided by the number of entries in maps.
         log.info(float(tar_ext[1]) / len(maps))
         if tar_ext[1] > config.extractor_same_layout_number:
             extractor = tool.str2extractor(tar_ext[0])
             self.__extract(item, extractor)
             # Keep the key exactly as stored so the deletion below cannot
             # miss when the key is not already an int.
             stale_keys.append(key)
     # Delete after the loop: the dict must not change size while iterated.
     for key in stale_keys:
         del self.__ext_queue[key]