def __op_new(self, data_loaded, connection):
    """Handle a newly announced item: extract it now or park it in the queue.

    The item is loaded (with content) from ``config.path_extractor_inbox``
    and the database is asked for URLs sharing its layout hash.  If the
    largest value in the returned mapping exceeds
    ``config.extractor_same_layout_number``, the matching extractor is
    applied immediately and nothing is queued.  Otherwise the item is
    stored in the pending queue tagged with ``config.const_RULE_UNKNOW``.

    :param data_loaded: mapping whose ``'id'`` entry is convertible to int.
    :param connection: not used by this handler.
    """
    new_id = int(data_loaded['id'])
    url_item = UrlItem.load_with_content(
        id=new_id, file_path=config.path_extractor_inbox)

    # NOTE(review): the mapping presumably goes extractor string -> number
    # of URLs using it for this layout hash -- confirm against db.
    total, by_extractor = db.get_url_with_same_layout_hash(
        url_item['layout_hash'])
    log.info(str(by_extractor))
    log.info(total)

    if len(by_extractor) > 0:
        import operator
        top_name, top_hits = max(by_extractor.iteritems(),
                                 key=operator.itemgetter(1))
        log.info(float(top_hits) / len(by_extractor))
        if top_hits > config.extractor_same_layout_number:
            # Enough same-layout evidence: extract straight away.
            self.__extract(url_item, tool.str2extractor(top_name))
            return

    # No sufficiently dominant extractor: keep the item pending.
    rule = config.const_RULE_UNKNOW
    pending = {
        "title": url_item['title'],
        "url": url_item['url'],
        "filename": url_item.filename(),
        "decision": url_item['is_target'],
        "extractor": rule
    }
    self.__ext_queue[new_id] = pending
    log.info("[%s]: # %s " % (new_id, rule))
def __op_new(self, data_loaded, connection):
    """Process a new item id, extracting immediately when possible.

    Loads the item from the extractor inbox, queries the database for
    URLs with the same layout hash and picks the mapping entry with the
    highest value.  When that value is greater than
    ``config.extractor_same_layout_number`` the item is extracted with
    the corresponding extractor; in every other case the item is added
    to the pending queue under ``config.const_RULE_UNKNOW``.

    :param data_loaded: mapping providing the item id under ``'id'``.
    :param connection: unused here.
    """
    ident = int(data_loaded['id'])
    loaded = UrlItem.load_with_content(
        id=ident,
        file_path=config.path_extractor_inbox,
    )
    hits, per_extractor = db.get_url_with_same_layout_hash(
        loaded['layout_hash'])
    log.info(str(per_extractor))
    log.info(hits)

    if len(per_extractor) > 0:
        # Entry with the largest value; index 0 is the extractor string,
        # index 1 the value it is ranked by.
        leader = max(per_extractor.iteritems(), key=lambda pair: pair[1])
        log.info(float(leader[1]) / len(per_extractor))
        if leader[1] > config.extractor_same_layout_number:
            chosen = tool.str2extractor(leader[0])
            self.__extract(loaded, chosen)
            return

    # Fall through: remember the item so a later refresh can retry it.
    fallback = config.const_RULE_UNKNOW
    self.__ext_queue[ident] = {
        "title": loaded['title'],
        "url": loaded['url'],
        "filename": loaded.filename(),
        "decision": loaded['is_target'],
        "extractor": fallback
    }
    log.info("[%s]: # %s " % (ident, fallback))
def __op_refresh(self, data_loaded, connection):
    """Re-evaluate every pending item and extract those that now qualify.

    For each queued id the item is reloaded from the extractor inbox and
    the same-layout lookup is repeated.  Items whose top mapping value
    exceeds ``config.extractor_same_layout_number`` are extracted and
    then removed from the queue.

    :param data_loaded: not used by this handler.
    :param connection: not used by this handler.
    """
    import operator

    # Removal is deferred so the queue is not mutated while iterating it.
    resolved = []
    for queued_key, _entry in self.__ext_queue.iteritems():
        queued_id = int(queued_key)
        url_item = UrlItem.load_with_content(
            id=queued_id, file_path=config.path_extractor_inbox)
        total, by_extractor = db.get_url_with_same_layout_hash(
            url_item['layout_hash'])
        log.info(str(by_extractor))
        log.info(total)

        if len(by_extractor) == 0:
            continue

        top_name, top_hits = max(by_extractor.iteritems(),
                                 key=operator.itemgetter(1))
        log.info(float(top_hits) / len(by_extractor))
        if top_hits > config.extractor_same_layout_number:
            self.__extract(url_item, tool.str2extractor(top_name))
            resolved.append(queued_id)

    # Drop everything that was extracted in this pass.
    for done_id in resolved:
        del self.__ext_queue[done_id]
def load(id=None, url=None):
    """Load a ``UrlItem`` from the database by id or by url.

    When both arguments are given, ``id`` takes precedence and ``url`` is
    ignored.

    :param id: key passed to ``db.get_url_by_id``.
    :param url: URL passed to ``db.get_url_by_url``; only consulted when
        ``id`` is None.
    :returns: a populated ``UrlItem``, or ``None`` when the database
        lookup returns ``None``.
    :raises ValueError: if neither ``id`` nor ``url`` is provided.
        ``ValueError`` derives from ``Exception``, so callers catching
        ``Exception`` keep working.
    """
    # Validate and query first, so no item is built for bad arguments or
    # for a lookup that finds nothing.
    if id is not None:
        res = db.get_url_by_id(id)
    elif url is not None:
        res = db.get_url_by_url(url)
    else:
        raise ValueError("must provide id or url")

    if res is None:
        return None

    # Fields are copied in this order; only 'extractor' needs converting
    # from its stored string form.
    fields = (
        'id', 'url', 'is_target', 'content_hash', 'layout_hash',
        'extractor', 'last_access_ts', 'last_extract_ts', 'title',
        'content_type',
    )
    r = UrlItem()
    for field in fields:
        value = res[field]
        if field == 'extractor':
            value = tool.str2extractor(value)
        r[field] = value
    return r
def __op_refresh(self, data_loaded, connection):
    """Retry extraction for all pending items and prune the queue.

    Each queued id is handled as follows:

    * if ``<inbox>/<id>.html`` is no longer a file, the entry is dropped
      without further processing;
    * otherwise the item is reloaded, the same-layout lookup is repeated
      and, when the top mapping value exceeds
      ``config.extractor_same_layout_number``, the item is extracted and
      the entry is dropped.

    :param data_loaded: not used by this handler.
    :param connection: not used by this handler.
    """
    import operator

    # Ids are collected first and removed after the loop so the queue is
    # not modified while it is being iterated.
    to_remove = []
    for queued_key, _entry in self.__ext_queue.iteritems():
        queued_id = int(queued_key)

        html_path = config.path_extractor_inbox + "/" + str(queued_id) + ".html"
        if not os.path.isfile(html_path):
            # Source file is gone; nothing left to extract for this id.
            to_remove.append(queued_id)
            continue

        url_item = UrlItem.load_with_content(
            id=queued_id, file_path=config.path_extractor_inbox)
        total, by_extractor = db.get_url_with_same_layout_hash(
            url_item['layout_hash'])
        log.info(str(by_extractor))
        log.info(total)

        if len(by_extractor) == 0:
            continue

        top_name, top_hits = max(by_extractor.iteritems(),
                                 key=operator.itemgetter(1))
        log.info(float(top_hits) / len(by_extractor))
        if top_hits > config.extractor_same_layout_number:
            self.__extract(url_item, tool.str2extractor(top_name))
            to_remove.append(queued_id)

    for stale_id in to_remove:
        del self.__ext_queue[stale_id]