def feature_extraction(csvfile, datapath, resultcsv, fe_path=config.path_fe_space):
    fe = FeatureExtract(fe_path)
    fe.print_featuremap()
    # feature extraction: write a header row, then one feature row per labelled page
    out = open(resultcsv, "w")
    line = "%s,%s,%s" % ('id', 'label', fe.str_featuremap_line())
    out.write(line + "\n")
    out.flush()
    with open(csvfile, 'rb') as infile:  # renamed to avoid shadowing the csvfile argument
        reader = csv.reader(infile, delimiter=',')
        header = True
        for row in reader:
            if not header:
                item = UrlItem()
                with open(datapath + "/" + row[0], "r") as page:
                    ct = page.read()
                item['url'] = row[1]
                item['content'] = ct
                item['title'] = item.get_short_title()
                f = fe.extract_item(item)
                line = "%s,%s,%s" % (row[0], row[2], FeatureExtract.str_feature(f))
                print line
                out.write(line + "\n")
                out.flush()
            else:
                header = False
    out.close()

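# A minimal driver sketch for feature_extraction, not from the source. The CSV
# layout is an assumption read off the row indices above: column 0 is the
# cached page's filename, column 1 its url, column 2 the label. All three
# paths are hypothetical.
if __name__ == '__main__':
    feature_extraction("labels.csv", "pages", "features.csv")
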
def parse_item(response):
    # responses here are all either updated or new; the db is not changed
    # TODO: check whether this url is already in URL_LIB
    item = UrlItem.load_with_content(url=response.url, response=response)
    logging.info("PC get page [%s]:- %s" % (item['id'], item['url']))
    yield item

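# parse_item reads like a Scrapy crawl callback. A minimal wiring sketch,
# assuming Scrapy >= 1.0; the spider name and start url are illustrative,
# not from the source.
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

class PagesCrawler(CrawlSpider):
    name = "pages"
    start_urls = ["http://example.com"]
    # route every followed link through parse_item above
    rules = (Rule(LinkExtractor(), callback=parse_item, follow=True),)
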
def __op_new(self, data_loaded, connection):
    item_id = int(data_loaded['id'])
    item = UrlItem.load_with_content(id=item_id,
                                     file_path=config.path_extractor_inbox)
    # count pages that share this page's layout hash
    count, maps = db.get_url_with_same_layout_hash(item['layout_hash'])
    log.info(str(maps))
    log.info(count)
    if len(maps) > 0:
        import operator
        # pick the extractor used most often for this layout
        tar_ext = max(maps.iteritems(), key=operator.itemgetter(1))
        log.info(float(tar_ext[1]) / len(maps))
        if tar_ext[1] > config.extractor_same_layout_number:
            extractor = tool.str2extractor(tar_ext[0])
            self.__extract(item, extractor)
            return
    # no dominant extractor for this layout: queue the item for a human
    extractor = config.const_RULE_UNKNOW
    self.__ext_queue[item_id] = {
        "title": item['title'],
        "url": item['url'],
        "filename": item.filename(),
        "decision": item['is_target'],
        "extractor": extractor
    }
    log.info("[%s]: # %s " % (item_id, extractor))

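# Self-contained sketch of the layout vote above: maps holds
# {extractor_name: count} for pages sharing one layout hash, and the
# most-used extractor is reused only when its count clears the threshold.
# The names and numbers are made up for illustration.
import operator

maps = {"rule_seminar_v1": 7, "rule_seminar_v2": 2}
threshold = 5  # stands in for config.extractor_same_layout_number

tar_ext = max(maps.iteritems(), key=operator.itemgetter(1))
if tar_ext[1] > threshold:
    print "reuse %s (top count / distinct extractors = %.2f)" % (
        tar_ext[0], float(tar_ext[1]) / len(maps))
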
def __op_rejudge_done(self, data_loaded, connection):
    item_id = int(data_loaded['id'])
    decision = int(data_loaded['decision'])
    item = UrlItem.load(id=item_id)
    del self.__ext_queue[item_id]
    self.__send_back_to_judge(item, decision)
    tool.send_msg(connection, "0")

def __op_add_extractor(self, data_loaded, connection):
    item_id = int(data_loaded['id'])
    extractor = data_loaded['extractor']
    item = UrlItem.load_with_content(item_id,
                                     file_path=config.path_extractor_inbox)
    self.__extract(item, extractor)
    del self.__ext_queue[item['id']]
    tool.send_msg(connection, "0")

def __op_test_rule(self, data_loaded, connection):
    item_id = int(data_loaded['id'])
    rule = data_loaded['rule']
    attrid = int(data_loaded['attrid'])
    item = UrlItem.load_with_content(id=item_id,
                                     file_path=config.path_extractor_inbox)
    # run the candidate rule against the page and send the result back
    tool.send_msg(
        connection,
        self.__ie.extract_attr(item, rule_id_or_dict=rule, attr_id=attrid))

def __op_preview(self, data_loaded, connection):
    log.info(data_loaded['extractor'])
    if data_loaded['extractor'] == config.const_RULE_UNKNOW:
        # no extractor chosen yet: preview empty values for every attribute
        result = {x: "" for x in xrange(1, self.__ie.num_attr() + 1)}
    else:
        item_id = int(data_loaded['id'])
        item = UrlItem.load_with_content(item_id,
                                         file_path=config.path_extractor_inbox)
        extractor = data_loaded['extractor']
        result = self.__ie.extract(item, extractor)
    preview = list()
    for att, value in result.iteritems():  # renamed to avoid shadowing the built-in str
        preview.insert(att, dict(name=self.__ie.name(att), value=value))
    log.info(preview)
    tool.send_msg(connection, pickle.dumps(preview, -1))

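# Client-side sketch for the preview reply. This assumes tool has a recv_msg
# counterpart to tool.send_msg that returns the framed byte string; recv_msg
# is an assumption, it is not shown in the source.
import pickle

def read_preview(connection):
    raw = tool.recv_msg(connection)  # hypothetical receive helper
    for ent in pickle.loads(raw):
        print "%s: %s" % (ent['name'], ent['value'])
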
def __op_new(self, data_loaded, connection):
    item_id = int(data_loaded['id'])
    item = UrlItem.load_with_content(id=item_id,
                                     file_path=config.path_judge_inbox)
    feature = self.__fe.extract_item(item)
    if 'decision' not in data_loaded:
        decision, confidence = self.__auto_judge(feature)
        log.info("[%s]: [%s] # %s # %s%%" % (item_id, FeatureExtract.str_feature(feature), decision, confidence))
    else:
        decision, confidence = data_loaded['decision'], 100
        log.info("[%s]: back from Extractor # %s # %s%%" % (item_id, decision, confidence))
        self.__relearn_clf(feature, decision)
    if confidence > config.const_CONFIDENCE_THRESHOLD:
        # pretty sure: save the decision to the db and pass the item on
        item['is_target'] = decision
        item.save()
        if int(item['is_target']) in [config.const_IS_TARGET_MULTIPLE,
                                      config.const_IS_TARGET_SIGNLE]:
            # item is a target page
            self.__send_to_extractor(item)
        else:
            # item is not a target page
            os.remove(config.path_judge_inbox + "/%s" % item.filename())
    else:
        # not sure: queue the item for a human judgement
        item['is_target'] = config.const_IS_TARGET_UNKNOW
        item.save()
        self.__judge_queue[item_id] = {
            "title": item['title'],
            "url": item['url'],
            "filename": item.filename(),
            "confidence": round(confidence, 2),
            "decision": decision,
            "feature": feature
        }

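# __auto_judge is never shown in the source. A plausible sketch, assuming a
# scikit-learn-style classifier held in self.__clf: predict the class and
# report the winning class probability as a 0-100 confidence, which is what
# the comparison against config.const_CONFIDENCE_THRESHOLD above expects.
def __auto_judge(self, feature):
    probs = self.__clf.predict_proba([feature])[0]  # one row of class probabilities
    best = probs.argmax()
    return self.__clf.classes_[best], probs[best] * 100
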
def __op_done(self, data_loaded, connection):
    item_id = int(data_loaded['id'])
    decision = int(data_loaded['decision'])
    item = UrlItem.load(id=item_id)
    item['is_target'] = decision
    item.save()
    if int(item['is_target']) in [config.const_IS_TARGET_MULTIPLE,
                                  config.const_IS_TARGET_SIGNLE]:
        # item is a target page
        self.__send_to_extractor(item)
    else:
        # item is not a target page
        os.remove(config.path_judge_inbox + "/%s" % item.filename())
    # fold the human decision back into the classifier
    self.__relearn_clf(self.__judge_queue[item_id]['feature'], decision)
    del self.__judge_queue[item_id]
    tool.send_msg(connection, "0")

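# __relearn_clf is not shown either. A minimal sketch, assuming an incremental
# scikit-learn model such as SGDClassifier; self.__all_labels (the full label
# set, which partial_fit requires on its first call) is a hypothetical
# attribute, not from the source.
def __relearn_clf(self, feature, decision):
    self.__clf.partial_fit([feature], [decision], classes=self.__all_labels)
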
def __op_refresh(self, data_loaded, connection):
    delete_ids = []
    for key, ent in self.__ext_queue.iteritems():
        item_id = int(key)
        if not os.path.isfile(
                config.path_extractor_inbox + "/" + str(item_id) + ".html"):
            # source file is gone: drop the stale queue entry
            delete_ids.append(item_id)
            continue
        item = UrlItem.load_with_content(id=item_id,
                                         file_path=config.path_extractor_inbox)
        count, maps = db.get_url_with_same_layout_hash(item['layout_hash'])
        log.info(str(maps))
        log.info(count)
        if len(maps) > 0:
            import operator
            tar_ext = max(maps.iteritems(), key=operator.itemgetter(1))
            log.info(float(tar_ext[1]) / len(maps))
            if tar_ext[1] > config.extractor_same_layout_number:
                extractor = tool.str2extractor(tar_ext[0])
                self.__extract(item, extractor)
                delete_ids.append(item_id)
    # clear processed entries
    for ent_id in delete_ids:
        del self.__ext_queue[ent_id]

def __refresh_list(self):
    delete_ids = []
    for key, ent in self.__judge_queue.iteritems():
        decision, confidence = self.__auto_judge(ent['feature'])
        if confidence > config.const_CONFIDENCE_THRESHOLD:
            # pretty sure: save the decision to the db and pass the item on
            item = UrlItem.load(id=key)
            item['is_target'] = decision
            item.save()
            delete_ids.append(key)
            if int(item['is_target']) in [config.const_IS_TARGET_MULTIPLE,
                                          config.const_IS_TARGET_SIGNLE]:
                self.__send_to_extractor(item)
            else:
                os.remove(config.path_judge_inbox + "/%s" % ent['filename'])
        else:
            self.__judge_queue[key]['confidence'] = confidence
            self.__judge_queue[key]['decision'] = decision
    # clear processed entries
    for ent_id in delete_ids:
        del self.__judge_queue[ent_id]

__author__ = 'LeoDong'
import requests
from util import config
from bs4 import BeautifulSoup
from extractor.InfoExtractor import InfoExtractor
from SAECrawlers.items import UrlItem
import json

# scratch test: run a hand-written rule against one cached page
ie = InfoExtractor(config.path_extract_onto + "/seminar.xml",
                   config.path_extract_onto)
item = UrlItem.load_with_content(id=1, file_path=config.path_judge_inbox)
# print item.get_part('soup').prettify()
# print item.get_part('content')
rule = {
    "on": "content",
    "scope": {
        "sel": "section#visible-body .logo",
        "target": "text"
    },
    "description": "url",
    "actions": [2],
    "substring": {
        "after": "H",
        "before": ""
    },
}
print ie.extract_attr(item, rule_id_or_dict=rule)
# print json.dumps(ie.map(1), indent=2)

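# Reading the rule dict above: "scope.sel" looks like a CSS selector applied
# to the part named by "on", "target" picks what to return from the match, and
# "substring.after"/"substring.before" look like trim markers on the result.
# A second rule under that assumed schema, with illustrative values:
rule_title = {
    "on": "content",
    "scope": {"sel": "head title", "target": "text"},
    "description": "page title",
    "actions": [],
    "substring": {"after": "", "before": ""},
}
print ie.extract_attr(item, rule_id_or_dict=rule_title)
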
def parse(self, response):
    item = UrlItem.load_with_content(url=response.url, response=response)
    logging.debug("Updater get page [%s]:- %s" % (item['id'], item['url']))
    yield item