def __init__(self, workdir, language, sourcefile, testfile):
    self._path = path.dirname(path.abspath(__file__))
    self.workdir = workdir
    self.language = language
    self.sourcefile = sourcefile
    self.testfile = testfile
    self.tmpltdir = path.join(self._path, "templates")
    self.judger = Judger()
def lambda_handler(event, context):
    print(event)
    env = event.get('env')
    twitter = Twitter(env=env)
    trends = twitter.get_trends(id=WOEID)
    judger = Judger()
    for trend in trends:
        should_tweet = judger.judge_whether_tweet(trend)
        if should_tweet:
            twitter.post_tweet(trend.get('name'))
            time.sleep(SLEEP_TIME)
def play(policy_number):
    player1 = HumanPlayer()
    player2 = Player(epsilon=0, symbol=-1)
    player2.load_policy(policy_number)
    while True:
        judger = Judger(player1, player2)
        winner = judger.play()
        if winner == player2.symbol:
            print("You lose!")
        elif winner == player1.symbol:
            print("You win!")
        else:
            print("It is a tie!")
def train(epochs, print_every_n=500):
    file = open('app/saves/metrics_all.csv', "w")
    with file:
        writer = csv.writer(file)
        writer.writerow(['win_rate1', 'win_rate2', 'draw_rate'])
    file = open('app/saves/metrics_first.csv', "w")
    with file:
        writer = csv.writer(file)
        writer.writerow(['td_error'])
    file = open('app/saves/metrics_second.csv', "w")
    with file:
        writer = csv.writer(file)
        writer.writerow(['td_error'])
    epsilon = 1
    epsilon_decay = 0.999
    epsilon_min = 0.01
    player1 = Player(epsilon=epsilon, symbol=1)
    player2 = Player(epsilon=epsilon, symbol=-1)
    judger = Judger(player1, player2)
    player1_win = 0.0
    player2_win = 0.0
    for i in range(1, epochs + 1):
        winner = judger.play(train=True, print_state=False)
        if winner == 1:
            player1_win += 1
        if winner == -1:
            player2_win += 1
        win_rate1 = player1_win / i
        win_rate2 = player2_win / i
        draw_rate = (i - (player1_win + player2_win)) / i
        metrics_file = open('app/saves/metrics_all.csv', "a")
        with metrics_file:
            writer = csv.writer(metrics_file)
            writer.writerow([win_rate1, win_rate2, draw_rate])
        if i % print_every_n == 0:
            print('Epoch %d, player 1 winrate: %.02f, player 2 winrate: %.02f, draw rate: %.02f'
                  % (i, win_rate1, win_rate2, draw_rate))
            player1.save_policy(i)
            player2.save_policy(i)
        epsilon = max(epsilon_min, epsilon * epsilon_decay)
        player1.set_epsilon(epsilon)
        player2.set_epsilon(epsilon)
def judge():
    data = request.json
    logging.info(f"receive {data}")
    submit_id = data['submit_id']
    problem_id = data['problem_id']
    logging.info(f"run problem id: {problem_id}")
    source = data['source']
    judge_dir = os.path.join(TMP_DIR, str(submit_id))   # temp directory for running
    data_dir = os.path.join(BASE_DIR, str(problem_id))  # standard input output files, read only
    if os.path.exists(judge_dir):
        shutil.rmtree(judge_dir)
    os.makedirs(judge_dir)
    with open(os.path.join(judge_dir, data['src']), mode='w+', encoding='utf-8') as f:
        f.write(source)
    compiler = Compiler(data['compile_command'], judge_dir)
    spj = False
    if os.path.exists(os.path.join(data_dir, "spj")) or \
            os.path.exists(os.path.join(data_dir, "spj.py")):
        spj = True
    judger = Judger(data['max_cpu_time'],
                    data['max_memory'],
                    data['run_command'],
                    data.get('seccomp_rule'),
                    judge_dir,
                    1 if data.get('memory_limit_check_only') else 0,
                    data_dir,
                    submit_id,
                    spj)
    judge_pool.apply_async(run, (judger, compiler), callback=callback)
    return "success"
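# A hedged sketch of the JSON body this endpoint expects, inferred only from the
# keys the handler above reads; every value below is a made-up placeholder, not
# taken from the original project.
example_payload = {
    "submit_id": 1001,
    "problem_id": 42,
    "src": "main.c",                          # file name the source is written to inside judge_dir
    "source": "#include <stdio.h>\nint main(void){return 0;}",
    "compile_command": "gcc main.c -o main",
    "run_command": "./main",
    "seccomp_rule": "c_cpp",                  # optional, read via data.get()
    "max_cpu_time": 1000,
    "max_memory": 256 * 1024 * 1024,
    "memory_limit_check_only": 0,             # optional flag
}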
def compete(player1, turns, policy_number):
    player2 = Player(epsilon=0, symbol=-1)
    player2.load_policy(policy_number)
    judger = Judger(player1, player2)
    player1_win = 0.0
    player2_win = 0.0
    for _ in range(turns):
        winner = judger.play()
        if winner == 1:
            player1_win += 1
        if winner == -1:
            player2_win += 1
    draw_rate = (turns - (player1_win + player2_win)) / turns
    print('%d turns, player 1 winrate: %.02f, player 2 winrate: %.02f, draw rate: %.02f'
          % (turns, player1_win / turns, player2_win / turns, draw_rate))
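# A sketch (not from the original project) of how the play/train/compete helpers
# above might be wired together; the entry point, epoch count, and policy
# numbering are assumptions made purely for illustration.
if __name__ == '__main__':
    train(epochs=10000, print_every_n=500)   # self-play training; checkpoints saved every 500 epochs
    trained = Player(epsilon=0, symbol=1)    # greedy (non-exploring) player for evaluation
    trained.load_policy(10000)               # assumes a policy checkpoint was saved at epoch 10000
    compete(trained, turns=500, policy_number=10000)
    play(policy_number=10000)                # interactive game against the trained policy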
class Checker:
    def __init__(self, workdir, language, sourcefile, testfile):
        self._path = path.dirname(path.abspath(__file__))
        self.workdir = workdir
        self.language = language
        self.sourcefile = sourcefile
        self.testfile = testfile
        self.tmpltdir = path.join(self._path, "templates")
        self.judger = Judger()

    @property
    def testcases(self):
        testpath = path.join(self.workdir, self.testfile)
        with open(testpath, "r") as f:
            return json.load(f)

    @property
    def languages(self):
        return listdir(self.tmpltdir)

    def _loadModule(self):
        tmpltpath = path.join(self.tmpltdir, self.language)
        loader = SourceFileLoader(self.language, tmpltpath)
        module = loader.load_module()
        module.workdir = self.workdir
        return module

    def check(self):
        if self.language not in self.languages:
            raise ValueError("Language %s is not supported" % self.language)
        module = self._loadModule()
        codepath = path.join(self.workdir, self.sourcefile)
        if zipfile.is_zipfile(codepath):
            with zipfile.ZipFile(codepath, "r") as zpf:
                zpf.extractall(self.workdir)
                self.sourcefile = zpf.namelist()
        self.judger.judge(module, self.sourcefile, self.testcases, timeout=3)
        return self.judger.result

    def _export_result(self, results):
        with open(path.join(self.workdir, "result.json"), "w") as f:
            json.dump(results, f)
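# Minimal usage sketch for the Checker class above; the work directory, language
# template name, and file names are illustrative assumptions, not values from
# the original project.
checker = Checker(workdir="/tmp/job-1",
                  language="python3",        # must match a template file under templates/
                  sourcefile="solution.py",  # may also be a zip archive of sources
                  testfile="tests.json")     # JSON test cases loaded by the testcases property
result = checker.check()                     # runs the test cases through Judger.judge and returns judger.result
print(result)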
def init_game(self):
    ''' Initialize the game of Limit Texas Hold'em

    This version supports two-player limit texas hold'em

    Returns:
        (tuple): Tuple containing:

            (dict): The first state of the game
            (int): Current player's id
    '''
    # Initialize a dealer that can deal cards
    self.dealer = Dealer()

    # Initialize two players to play the game
    self.players = [
        Player(i, self.init_chips) for i in range(self.num_players)
    ]

    # Initialize a judger class which will decide who wins in the end
    self.judger = Judger()

    # Deal cards to each player to prepare for the first round
    for i in range(self.num_players):
        self.players[i].hand.append(self.dealer.deal_card())

    # Initialize public cards
    self.public_cards = []

    # Randomly choose a big blind and a small blind
    s = np.random.randint(0, self.num_players)
    b = (s + 1) % self.num_players
    self.players[b].in_chips = self.big_blind
    self.players[s].in_chips = self.small_blind

    # The player next to the small blind plays first
    self.game_pointer = (b + 1) % self.num_players

    # Initialize a bidding round; in the first round, the big blind and the small blind need to
    # be passed to the round for processing.
    self.round = Round(self.num_players, self.big_blind)
    self.round.start_new_round(game_pointer=self.game_pointer,
                               raised=[p.in_chips for p in self.players])

    # Count the round. There are 4 rounds in each game.
    self.round_counter = 0

    # Save the history for stepping back to the last state.
    self.history = []
    self.action_history = []
    for i in range(2):
        self.action_history.append([])

    state = self.get_state(self.game_pointer)

    return state, self.game_pointer
def __init__(self, name, url_queue, url_list, url_in_queue, Flock, home_urls,
             tem_siteID=[0], continue_run=[True]):
    '''
    name
    url_queue      URLs assigned by the central server
    url_list       local check for duplicate URLs
    url_in_queue   newly parsed URLs; one UrlQueue is allocated per site
    Flock
    home_urls      used to test whether a URL belongs to the crawl set
    tem_conn       initial DNS cache
    is_new_task    passed by reference and modified by the communicator,
                   to decide whether tem_home_url / old_home_url (also passed
                   by reference) need to be updated
    continue_run[] flag indicating whether to keep running
    '''
    threading.Thread.__init__(self, name=name)
    # Local URL queue for duplicate checks: a URL that repeats locally is discarded;
    # otherwise it goes into a temporary queue and is later sent to the central
    # server for checking. Each site gets its own list object so URLs are told
    # apart per site.
    self.__url_list = url_list
    self.__url_queue = url_queue
    # By default each site is assigned one in-queue.
    # Local temporary queue: after a URL passes the duplicate check against url_list
    # it is added to in_queue, and once enough URLs accumulate they are handed to
    # the central server for management.
    # Queue()
    self.__url_in_queue = url_in_queue
    # ----------------------------------------------------------------
    self.__Flock = Flock
    self.__home_urls = home_urls
    # force a DNS refresh
    self.__tem_siteID = None
    # passed by reference so it can be compared against the current value
    self.__tem_siteID = tem_siteID
    # ----------------------------------------------------------------
    self.__Flock = Flock
    self.__htmlparser = HtmlParser()
    self.__picparser = PicParser()
    self.__judger = Judger(self.__home_urls)
    # init temporary home_url and siteID,
    # both used to determine whether to refresh the DNS cache
    self.__dbsource = DBSource()
    self.__collector = Collector(home_urls)
    # continue run
    self.__continue_run = continue_run
def test_tweet_is_correct():
    judger = Judger()
    # dummy blacklist class injection
    judger.blacklists = DummyBlacklists()
    for trend in sample_trends:
        should_tweet = judger.judge_whether_tweet(trend)
        if trend.get('name') == 'trend_A':
            assert should_tweet is False
        elif trend.get('name') == 'trend_B':
            assert should_tweet is False
        elif trend.get('name') == 'trend_C':
            assert should_tweet is True
        elif trend.get('name') == 'trend_D':
            assert should_tweet is True
        elif trend.get('name') == 'trend_E':
            assert should_tweet is True
        elif trend.get('name') == 'trend_F':
            assert should_tweet is False
        elif trend.get('name') == 'trend_blacklisted_A':
            assert should_tweet is False
import os
import sys
import time
import logging

import numpy as np

sys.path.append('..')
from judger import Judger

strtime = time.strftime("%Y-%m-%d-%H-%M", time.localtime())
log_name = "./" + strtime + "ensemble.txt"
logging.basicConfig(handlers=[logging.FileHandler(log_name, 'w+', 'utf-8')],
                    format='%(asctime)s: %(levelname)s: %(message)s')
logging.root.setLevel(level=logging.INFO)

accusation_path = 'accu.txt'
law_path = 'law.txt'
judger = Judger(accusation_path, law_path)

marked_labels_list = np.load('../accu/accu_labels.npy')
scores_path = '../accu/'
scores_names = [
    'accu_lstm89.1.npy', 'accu_gruaug88.3.npy', 'accu_gru87.9.npy',
    'accu_grubigaug87.3.npy', 'accu_rcnn87.npy', 'accu_rcnnaug86.84.npy',
    'accu_cnn86.77.npy', 'accu_fasttextaug76.14.npy'
]
scores_name_num = len(scores_names)


def sigmoid_(inputs):
    """
    Calculate the sigmoid for the given inputs (array)
    'tfidf__norm': ('l1', 'l2'),
    'clf__alpha': (0.00001, 0.000005, 0.000001),
    'clf__penalty': ('l2', 'elasticnet'),
    #'clf__n_iter': (10, 50, 80),
}
sgd_param_list = parse_params(SGD_parameters)

law_model = None
accu_model = None
time_model = None
'''
test_fold = np.zeros((train_docs_num + val_docs_num), dtype='int')
test_fold[:train_docs_num] = -1
ps = PredefinedSplit(test_fold=test_fold)
'''
judge = Judger("../baseline/accu.txt", "../baseline/law.txt")
parameters = SVC_parameters
param_list = parse_params(parameters)


def pipeline_train(pid):
    clf = None
    param = None
    if clf_name == 'SVC':
        param = svc_param_list[pid]
        clf = LinearSVC(**param['clf'])
    elif clf_name == 'SGD':
        param = sgd_param_list[pid]
        clf = SGDClassifier(**param['clf'])
    elif clf_name == 'LR':
        param = lr_param_list[pid]
class Evaluator(object):
    def __init__(self, predictor, input_path='./input', output='./out'):
        self.predictor = predictor
        self.input_path = input_path
        self.output_path = output
        self.judger = Judger('./data/accu.txt', './data/law.txt')
        self.cnt = 0

    def format_result(self, result):
        rex = {"accusation": [], "articles": [], "imprisonment": -3}
        res_acc = []
        for x in result["accusation"]:
            if not (x is None):
                res_acc.append(int(x))
        rex["accusation"] = res_acc
        if not (result["imprisonment"] is None):
            rex["imprisonment"] = int(result["imprisonment"])
        else:
            rex["imprisonment"] = -3
        res_art = []
        for x in result["articles"]:
            if not (x is None):
                res_art.append(int(x))
        rex["articles"] = res_art
        return rex

    def get_batch(self):
        v = self.predictor.batch_size
        if not (type(v) is int) or v <= 0:
            raise NotImplementedError
        return v

    def solve(self, fact):
        result = self.predictor.predict(fact)
        for a in range(0, len(result)):
            result[a] = self.format_result(result[a])
        return result

    def output_result(self, file_name):
        inf = open(os.path.join(self.input_path, file_name), "r")
        ouf = open(os.path.join(self.output_path, file_name), "w")
        fact = []
        for line in inf:
            fact.append(json.loads(line)["fact"])
            if len(fact) == self.get_batch():
                result = self.solve(fact)
                self.cnt += len(result)
                for x in result:
                    print(json.dumps(x), file=ouf)
                fact = []
        if len(fact) != 0:
            result = self.solve(fact)
            self.cnt += len(result)
            for x in result:
                print(json.dumps(x), file=ouf)
            fact = []
        ouf.close()

    def scoring(self, file_name):
        result = self.judger.test(self.input_path, self.output_path, file_name)
        return self.judger.get_score(result)
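# Sketch of driving the Evaluator above end to end. The stub predictor and the
# file name are placeholders; a real run supplies a model object exposing the
# batch_size attribute and predict() method that Evaluator relies on.
class StubPredictor(object):
    batch_size = 16                          # Evaluator.get_batch() requires a positive int

    def predict(self, facts):
        # one empty prediction per fact, in the schema format_result() expects
        return [{"accusation": [], "articles": [], "imprisonment": None} for _ in facts]

evaluator = Evaluator(StubPredictor(), input_path='./input', output='./out')
evaluator.output_result('data_test.json')    # assumes ./input/data_test.json exists (JSON lines with a "fact" field)
print(evaluator.scoring('data_test.json'))   # scores the written predictions with Judger.test / get_score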
class Reptile(threading.Thread):
    '''
    A single crawler thread.
    '''
    def __init__(self, name, url_queue, url_list, url_in_queue, Flock, home_urls,
                 tem_siteID=[0], continue_run=[True]):
        '''
        name
        url_queue      URLs assigned by the central server
        url_list       local check for duplicate URLs
        url_in_queue   newly parsed URLs; one UrlQueue is allocated per site
        Flock
        home_urls      used to test whether a URL belongs to the crawl set
        tem_conn       initial DNS cache
        is_new_task    passed by reference and modified by the communicator,
                       to decide whether tem_home_url / old_home_url (also passed
                       by reference) need to be updated
        continue_run[] flag indicating whether to keep running
        '''
        threading.Thread.__init__(self, name=name)
        # Local URL queue for duplicate checks: a URL that repeats locally is discarded;
        # otherwise it goes into a temporary queue and is later sent to the central
        # server for checking. Each site gets its own list object so URLs are told
        # apart per site.
        self.__url_list = url_list
        self.__url_queue = url_queue
        # By default each site is assigned one in-queue.
        # Local temporary queue: after a URL passes the duplicate check against url_list
        # it is added to in_queue, and once enough URLs accumulate they are handed to
        # the central server for management.
        # Queue()
        self.__url_in_queue = url_in_queue
        # ----------------------------------------------------------------
        self.__Flock = Flock
        self.__home_urls = home_urls
        # force a DNS refresh
        self.__tem_siteID = None
        # passed by reference so it can be compared against the current value
        self.__tem_siteID = tem_siteID
        # ----------------------------------------------------------------
        self.__Flock = Flock
        self.__htmlparser = HtmlParser()
        self.__picparser = PicParser()
        self.__judger = Judger(self.__home_urls)
        # init temporary home_url and siteID,
        # both used to determine whether to refresh the DNS cache
        self.__dbsource = DBSource()
        self.__collector = Collector(home_urls)
        # continue run
        self.__continue_run = continue_run

    # ------------------------------------------------------
    @dec
    def init(self, siteID):
        console('self.init()')
        self.siteID = -1
        self.__tem_siteID[0] = siteID
        self.__dbsource.init(siteID)
        self.__url_queue.init(siteID)
        netloc = self.transNetloc(self.__home_urls[siteID])
        print 'get netloc', netloc
        self.__conn = httplib.HTTPConnection(netloc, 80, timeout=10)

    @dec
    def conn(self):
        '''
        Includes the DNS refresh: siteID is passed in by reference and a change
        in it signals that the DNS cache has to be rebuilt.
        '''
        if self.siteID != self.__tem_siteID[0]:
            # refresh the DNS
            self.siteID = self.__tem_siteID[0]
            #netloc = (urlparse.urlsplit(self.__home_urls[self.__tem_siteID[0]])).netloc
            netloc = self.transNetloc(self.__home_urls[self.__tem_siteID[0]])
            print 'netloc', netloc
            self.__conn = httplib.HTTPConnection(netloc, 80, timeout=10)
        return self.__conn

    def transcode(self, source):
        '''
        Transcode: automatically convert the source to utf-8.
        '''
        res = chardet.detect(source)
        confidence = res['confidence']
        encoding = res['encoding']
        p = re.compile("&#(\S+);")
        source = p.sub("", source)
        print 'transcode', res
        if encoding == 'utf-8':
            return source
        if confidence < 0.6:
            return False
        else:
            return unicode(source, encoding, 'ignore')

    @dec
    def transPath(self, page_url, path):
        '''
        Convert an arbitrary link into a path.
        '''
        url = self.__judger.transToStdUrl(page_url, path)
        return urlparse.urlsplit(url).path

    @dec
    def transNetloc(self, url):
        '''
        Takes an absolute url.
        '''
        return urlparse.urlsplit(url).netloc

    # -------------------------------------------------------------
    @dec
    def run(self):
        '''
        Run the main loop.
        '''
        console('self.run()')
        self.conn()
        home_url = self.__home_urls[self.siteID]
        print 'home_url', home_url
        while True:
            # A flag passed in from outside decides whether to keep running,
            # which allows the thread to be interrupted or stopped.
            if not self.__continue_run[0]:
                return
            # [title, path]
            urlinfo = self.getAUrl()
            print 'get urlinfo ', urlinfo
            if not urlinfo:
                print "No Task\nqueue is empty!"
                return
            # global page info
            page_path = urlinfo[1]
            page_url = self.__judger.transToStdUrl(home_url, page_path)
            print 'page_path', page_path
            source = self.getPage(home_url, page_path)
            # check whether the source is html
            if not self.__htmlparser.init(source):
                # images and other files are handled separately and not parsed here
                continue
            # get the absolute address
            #url = self.__judger.transToStdUrl(home_url, page_path)
            # urls are always stored as absolute addresses
            # save html source
            print 'saveHtml' + '-' * 200
            self.saveHtml(page_url, urlinfo[0])
            imgsrcs = self.getImgSrcs()
            # save images
            self.saveImgList(page_url, imgsrcs)
            newurls = self.__htmlparser.getALinkText_List()
            self.addNewInQueue(page_url, newurls)

    @dec
    def requestSource(self, path):
        '''
        page_url: a sub page such as ./index.html
        url: an absolute url (including home_url), parsed internally
        '''
        conn = self.conn()
        conn.request("GET", path)
        #print self.__conn
        r1 = conn.getresponse()
        #print r1
        print r1.status
        data = r1.read()
        '''
        if r1.status != 'OK':
            print 'status is ', r1.status
            print 'status not OK'
            print r1.reason
            return False
        data = r1.read()
        if not len(data):
            print 'length of data is 0'
            return False
        '''
        return data

    @dec
    def getPage(self, page_url, url):
        '''
        Any url passed in is automatically converted to a path,
        then the low-level requestSource() is called.
        '''
        console('self.getPage()')
        path = self.transPath(page_url, url)
        data = self.requestSource(path)
        print 'page_url: url', page_url, url
        if len(data):
            data = self.transcode(data)
            #print 'data', data
            if not len(data):
                return False
            if not self.__collector.init(data):
                print 'collector.init',
                return False
            #self.__htmlparser.init(data)
            self.__htmlparser = self.__collector.htmlparser
        return data

    @dec
    def getImg(self, page_url, url):
        '''
        path: an image path such as './img/1.jpg'
        Returns [absolute url, source].
        '''
        url = self.transPath(page_url, url)
        return [url, self.requestSource(url)]

    @dec
    def getAUrl(self):
        return self.__url_queue.get(timeout=3)

    @dec
    def getUrls(self):
        '''
        Fetch the urls and judge them.
        '''
        return self.__htmlparser.getALink_list()

    @dec
    def getImgSrcs(self):
        '''
        parse html source and return src_list
        '''
        return self.__htmlparser.getPicSrcs_List()

    @dec
    def addNewQueue(self, path_list):
        '''
        New paths sent from the control server:
        url_list = [
            ['cau', 'path'],
        ]
        '''
        # control the refresh
        for url in path_list:
            self.__url_queue.put(url)

    @dec
    def addNewInQueue(self, page_url, url_list):
        '''
        The urls are raw urls and need no extra processing;
        add each new_url to the corresponding queue.
        '''
        for urlinfo in url_list:
            # convert to an absolute url
            url = self.__judger.transToStdUrl(page_url, urlinfo[1])
            siteID = self.__judger.judgeUrl(page_url, url)
            path = urlparse.urlsplit(url).path
            # check whether the url belongs to this platform
            if siteID != -1:
                if not self.__url_list.find(siteID, path):
                    # not duplicate in url_list
                    # cut down the number of urls
                    self.__url_in_queue.put(siteID, urlinfo[0], path)
        self.__url_in_queue.show()

    @dec
    def saveHtml(self, url, title):
        '''
        Save the source and the parsed source to the database.
        '''
        # get the absolute url
        assert self.siteID != -1
        #url = self.__judger.transToStdUrl(self.__home_urls[self.siteID], path)
        today = datetime.date.today()
        info = {
            'title': title,
            'url': url,
            'date': datetime.date.isoformat(today)
        }
        self.__dbsource.saveHtml(info, self.__collector.html,
                                 self.__collector.transXml_Str(url))

    def saveImg(self, url, source):
        imgsource = self.__picparser.getCompressedPic()
        size = imgsource['size']
        source = imgsource['source']
        #print 'source', source
        info = {
            'url': url,
            'width': size[0],
            'height': size[1]
        }
        self.__dbsource.saveImg(info, source)

    def saveImgList(self, page_url, srcs):
        '''
        Takes absolute srcs and stores each of them.
        '''
        for src in srcs:
            imgsource = self.getImg(page_url, src)
            url = imgsource[0]
            source = imgsource[1]
            self.__picparser.init(source)
            self.saveImg(url, imgsource)
def test_tweet_volume_larger_than_threshold():
    trend = {'name': 'dummy', 'tweet_volume': THRETHOLD + 1}
    judger = Judger()
    judger.blacklists = DummyBlacklists()
    assert judger.judge_whether_tweet(trend)
def test_tweet_volume_equals_threshold():
    trend = {'name': 'dummy', 'tweet_volume': THRETHOLD}
    judger = Judger()
    judger.blacklists = DummyBlacklists()
    assert not judger.judge_whether_tweet(trend)
    del judger
def judge(args, ip):
    with InitIsolateEnv() as box_id:
        compile_config = languages[args['language_name']]['compile']
        run_config = languages[args['language_name']]['run']
        src_name = compile_config['src_name']
        time_limit = args['time_limit'] / 1000.0
        if args['language_name'] == 'java':
            memory_limit = 512 * 1024
        else:
            memory_limit = args['memory_limit']
        test_case_id = args['test_case_id']
        submission_id = args['submission_id']
        logger.exception(test_case_id)
        path = os.path.join(JUDGE_DEFAULT_PATH, str(box_id))
        host_name = socket.gethostname()
        is_spj = True if 'spj_code' in args and args['spj_code'] else False

        # write source code into file
        try:
            src_path = os.path.join(path, 'box', src_name)
            f = open(src_path, "w")
            f.write(args['src_code'].encode("utf8"))
            f.close()
        except Exception as e:
            logger.exception(e)
            raise JudgeServerError('unable to write code to file')

        # write spj code into file
        if is_spj:
            spj_src_path = os.path.join(path, 'box', 'spj.c')
            f = open(spj_src_path, "w")
            f.write(args['spj_code'].encode("utf8"))
            f.close()

        update_submission_status(ip, submission_id, 'compiling')
        # compile
        compiler = Compiler(compile_config=compile_config, box_id=box_id)
        compiler.compile()

        # compile spj code
        if is_spj:
            spj_config = languages['c++']['compile']
            spj_config['src_name'] = 'spj.c'
            spj_config['exe_name'] = 'spj'
            spj_compiler = Compiler(compile_config=spj_config, box_id=box_id)
            spj_compiler.compile()

        update_submission_status(ip, submission_id, 'running & judging')
        # run
        judger = Judger(run_config=run_config,
                        max_cpu_time=time_limit,
                        max_memory=memory_limit,
                        test_case_id=test_case_id,
                        box_id=box_id,
                        server_ip=ip,
                        submission_id=submission_id,
                        is_spj=is_spj)
        result = judger.run()

        judge_result = {
            "status": RESULT["accepted"],
            "info": result,
            "time": None,
            "memory": None,
            "server": host_name
        }

        for item in judge_result["info"]:
            if item["status"] != RESULT['accepted']:
                judge_result["status"] = item["status"]
                break
        else:
            st = sorted(result, key=lambda k: k['info']['time'])
            judge_result["time"] = st[-1]['info']["time"] * 1000
            # TODO: not sure why, but the memory only matches reality after dividing by 10
            # 2017.04.06 update:
            # VSS - Virtual Set Size: virtual memory consumed (includes shared libraries)
            # RSS - Resident Set Size: physical memory actually used (includes shared libraries)
            # PSS - Proportional Set Size: physical memory used, with shared-library memory split proportionally
            # USS - Unique Set Size: physical memory used by the process alone (excludes shared libraries)
            # Empirically, rss / 10 is roughly uss.
            # Testing suggests POJ reports USS while HDU reports RSS.
            judge_result["memory"] = st[-1]['info']["max-rss"]

        judge_result["status"] = RE_RESULT[judge_result["status"]]
        for item in judge_result["info"]:
            item["status"] = RE_RESULT[item["status"]]
        return judge_result
def __init__(self, homeurls):
    self.htmlparser = HtmlParser()
    self.judger = Judger(homeurls)
# pdb.set_trace()
if set(one_tags) == set(predic_labels_names):
    all_qual_num = all_qual_num + 1
# pdb.set_trace()

result_file.write("true_count={},predict_count={},all_qual_num={}\n".format(
    true_tags_count, predic_tags_count, all_qual_num))
# pdb.set_trace()

outf_path = '../output/'
out_filename = "{}_output.json".format(task_type_name)
outf_file = os.path.join(outf_path, out_filename)
inf_path = os.path.join(labor_data_path, data_filename)
generate_pred_file(labor_tags_list, labor_preds, inf_path, outf_file)

# evaluate the results
judger_labor = Judger(tag_path=labor_tag_file)
reslt_labor = judger_labor.test(truth_path=inf_path, output_path=outf_file)
score_labor = judger_labor.gen_score(reslt_labor)
result_file.write('score_{}={}\n\n'.format(model_filename, score_labor))

exit()

# generate the prediction file for the divorce domain
print('predict_divorce...')
tags_list = []
with open('../../data/divorce/tags.txt', 'r', encoding='utf-8') as tagf:
    for line in tagf.readlines():
        tags_list.append(line.strip())
prd = Predictor('model_divorce/')
inf_path = '../../data/divorce/data_small_selected.json'
outf_path = '../../output/divorce_output.json'
def __init__(self, predictor, input_path='./input', output='./out'):
    self.predictor = predictor
    self.input_path = input_path
    self.output_path = output
    self.judger = Judger('./data/accu.txt', './data/law.txt')
    self.cnt = 0
def run(self):
    with open(self.__SampleListFile, 'w', encoding='utf-8') as fp:
        scaned_files, sampled_files, err_counters = 0, 0, [0, 0, 0, 0, 0, 0]
        for initial_path in self.__InitialPaths:
            for dir_path, dir_names, file_names in os.walk(initial_path):
                if False in [not match(excluded_path, dir_path)
                             for excluded_path in self.__ExcludedPaths]:  # skip excluded directories
                    dir_names[:] = []  # and skip their subdirectories as well
                    continue
                if not os.access(dir_path, os.X_OK | os.R_OK):  # the walk cannot always be stopped below some directories!
                    log.warning('[Permission Denied:] ' + dir_path)
                    continue
                for dir_name in dir_names:  # remove subdirectories we cannot enter from the scan list and log a warning
                    dir_fullname = os.path.join(dir_path, dir_name)
                    if not os.access(dir_fullname, os.X_OK | os.R_OK):
                        dir_names.remove(dir_name)
                        log.warning('[Permission denied:] ' + dir_fullname)
                if len(file_names) > self.__MaxFiles:  # a directory with this many files is very likely a data-file directory
                    log.warning('[Too Many Files]( ' + str(len(file_names)) + '), Ignoring:' + dir_path)
                    continue
                timer = time.time()
                for file_name in file_names:
                    try:
                        scaned_files += 1
                        if scaned_files % 1000 == 0:
                            log.info(
                                'Files scaned:[%d], error[%d], inactive[%d], small[%d], wrong-type[%d], non-text[%d], candidate[%d]\t%s'
                                % (scaned_files, err_counters[0], err_counters[1],
                                   err_counters[2], err_counters[3],
                                   err_counters[4] + err_counters[5], sampled_files, dir_path))
                        if time.time() - timer > self.__MaxSeconds:  # too slow to scan this folder
                            log.warning('[Too slow to scan, Ignoring:]( ' + dir_path)
                            break
                        time.sleep(self.__SleepSeconds)  # avoid hogging system resources
                        file_fullname = os.path.join(dir_path, file_name)
                        rc = Judger.filter(file_fullname)
                        if type(rc) is int:  # the file is not a candidate log, no need to sample it
                            err_counters[rc] += 1
                            continue
                        print(file_fullname, file=fp)
                        sampled_files += 1
                    except Exception as err:  # garbled directory/file names have caused charset errors when writing to fp
                        log.error(str(err))
        log.info(
            'Finish scan:[%d], error[%d], inactive[%d], small[%d], wrong-type[%d], non-text[%d], candidate[%d]'
            % (scaned_files, err_counters[0], err_counters[1], err_counters[2],
               err_counters[3], err_counters[4] + err_counters[5], sampled_files))
#accu = train_SVC(vec, accu_label)
print('law SVC')
sys.stdout.flush()
#law = train_SVC(vec, law_label)
print('time SVC')
sys.stdout.flush()
#time = train_SVC(vec, time_label)

#test
print('predict')
sys.stdout.flush()
predictor = PredictorLocal(tfidf, accu, law, time)
test_label, test_predict = predictor.predict_file(test_filename)

#metrics
judge = Judger("../baseline/accu.txt", "../baseline/law.txt")
result = judge.test2(test_label, test_predict)
print(result)
rst = judge.get_score(result)
print(rst)

rstr = "ACCU:(%.4f, %.4f, %.4f); LAW:(%.4f, %.4f, %.4f) TIME: %.4f" % \
    (rst[0][0], rst[0][1], rst[0][2], rst[1][0], rst[1][1], rst[1][2], rst[2])
sinfo = 'Prog:%s TrainFile:%s Seg:%s DIM:%s NGRAM:%d RESULT: %s' % (
    sys.argv[0], train_fname, seg_method, dim, ngram, rstr)
logger.info(sinfo)

print('begin test model:')
print('saving model')
joblib.dump(tfidf, 'predictor/model/tfidf.model')
class Collector:
    '''
    Extracts the relevant tag content from html, assembles it into a fixed
    format and stores it. Its main job is converting the page to xml.
    '''
    def __init__(self, homeurls):
        self.htmlparser = HtmlParser()
        self.judger = Judger(homeurls)

    def init(self, html):
        '''
        Explicitly refresh the cached content.
        '''
        self.html = html
        self.html.replace('"', "'")
        self.html.replace("'", "''")
        if not self.htmlparser.init(html):
            return False
        self.d = self.htmlparser.d
        self.d = pq(html)
        self.d('script').remove()
        self.d('SCRIPT').remove()
        self.d('style').remove()
        self.d('STYLE').remove()
        print '-' * 200
        print self.html
        return True

    def clear_other_node(self):
        '''
        Remove useless tags.
        '''
        self.d('head').remove()
        self.d('h1').remove()
        self.d('h2').remove()
        self.d('h3').remove()
        self.d('b').remove()
        self.d('a').remove()

    def getTitleText(self):
        '''
        Extract the title.
        '''
        return self.d('title').text()

    def getNodes(self, tag_name):
        '''
        get a list of certain tag nodes
        '''
        return self.d(tag_name)

    def __xmlAppendNodesTextList(self, xmlnode, tagname):
        '''
        Add a record under the xml node for every matching element.
        Note that links must be converted to absolute links beforehand, e.g.
            <b>
                <item>hello</item>
                <item>world</item>
            </b>
        '''
        html_node_text_list = self.d(tagname)
        print html_node_text_list
        childnode = self.dd.createElement(tagname)
        print childnode
        for i in range(len(html_node_text_list)):
            # add one <item> per element
            text_node = self.dd.createElement('item')
            text_node.setAttribute('text', html_node_text_list.eq(i).text())
            childnode.appendChild(text_node)
        xmlnode.appendChild(childnode)

    def transXml_Str(self, url):
        '''
        Return the xml source; the page is stored in this format.
        '''
        strr = '<html></html>'
        titleText = self.getTitleText()
        self.dd = dom.parseString(strr)
        html = self.dd.firstChild
        html.setAttribute('url', url)
        # create records for the following tags
        for tag in ['title', 'b', 'h1', 'h2', 'h3']:
            self.__xmlAppendNodesTextList(html, tag)
        # generate the <a> records
        aa = self.htmlparser.getALink_list()
        a = self.dd.createElement('a')
        for u in aa:
            #i=self.transurl.trans_d(i)
            # convert the url to a standard absolute address
            aindex = self.dd.createElement('item')
            aindex.setAttribute('title', u[0])
            #aindex.setAttribute('href',self.a_trav(aa[i]))
            aindex.setAttribute('href', self.judger.transToStdUrl(url, u[1]))
            a.appendChild(aindex)
        html.appendChild(a)
        # add the content
        #htmltext=self.d.html().decode('gbk','ignore').encode('utf-8')
        #ht=pq(htmltext)
        # bug note: the text may contain html special characters such as &# etc.,
        # which are handled separately during word segmentation
        content = self.d.text()
        cc = self.dd.createElement('content')
        ctext = self.dd.createTextNode(content)
        cc.appendChild(ctext)
        html.appendChild(cc)
        #print self.dd.toprettyxml()
        return html.toxml()
def evaluate():
    accu_pred, law_pred = [], []
    ground_truth = []
    count = 0
    for batch in batches_val:
        count += 1
        feed_dict = get_feed_dict(batch)
        law_score, law_pred_b, accu_pred_b, loss = sess.run(
            [train_model.law_score, train_model.law_prediction,
             train_model.prediction, train_model.loss],
            feed_dict=feed_dict)
        if count % 100 == 0:
            print('valid_step:', count, 'valid loss:', loss)
        # accu_pred += [[accu_class[j] for j in i] for i in utils.index_to_label(accu_pred_b, model_config.batch_size)][:len(batch)]
        accu_pred += [[j + 1 for j in i]
                      for i in utils.index_to_label(accu_pred_b, model_config.batch_size)][:len(batch)]
        law_pred += law_pred_b.tolist()
        ground_truth += list(zip(feed_dict[train_model.label].tolist(),
                                 feed_dict[train_model.law_label].tolist()))
        # if count % 10 == 0:
        #     break
        if count == val_step_per_epoch:
            break

    with open('data/valid_label.txt', 'w', encoding='utf-8') as f:
        for each in ground_truth:
            for i in range(len(each[0])):
                if each[0][i] == 1:
                    f.write(str(accu_class[i]))
            for i in range(len(each[1])):
                if each[1][i] == 1:
                    f.write(', ' + str(law_class[i]))
            f.write('\n')

    with open('data/data_valid_predict.json', 'w', encoding='utf-8') as f:
        for i in range(len(accu_pred)):
            rex = {"accusation": [], "articles": [], "imprisonment": 0}
            rex["accusation"] = accu_pred[i]
            for each in law_pred[i]:
                # each is the index of the predicted law in law_class
                if each > 0:
                    rex["articles"].append(file_order[law_class[int(each)]])
            print(json.dumps(rex, ensure_ascii=False), file=f)
            # print(rex)
            # f.write('{{"accusation": [0], "articles": {}, "imprisonment": 0}}'.format(law_pred[i]))

    J = Judger('data/accu.txt', 'data/law.txt')
    res = J.test('data/data_valid.json', 'data/data_valid_predict.json')
    total_score = 0
    scores = []
    for task_idx in range(2):
        TP_micro = 0
        FP_micro = 0
        FN_micro = 0
        f1 = []
        for class_idx in range(len(res[task_idx])):
            if res[task_idx][class_idx]["TP"] == 0:
                f1.append(0)
                continue
            TP_micro += res[task_idx][class_idx]["TP"]
            FP_micro += res[task_idx][class_idx]["FP"]
            FN_micro += res[task_idx][class_idx]["FN"]
            precision = res[task_idx][class_idx]["TP"] * 1.0 / (
                res[task_idx][class_idx]["TP"] + res[task_idx][class_idx]["FP"])
            recall = res[task_idx][class_idx]["TP"] * 1.0 / (
                res[task_idx][class_idx]["TP"] + res[task_idx][class_idx]["FN"])
            f1.append(2 * precision * recall / (precision + recall))
        precision_micro = TP_micro * 1.0 / (TP_micro + FP_micro + 1e-6)
        recall_micro = TP_micro * 1.0 / (TP_micro + FN_micro + 1e-6)
        F1_micro = 2 * precision_micro * recall_micro / (precision_micro + recall_micro + 1e-6)
        F1_macro = np.sum(f1) / len(f1)
        total_score += 100.0 * (F1_micro + F1_macro) / 2
        print('task id: {}, F1_micro: {}, F1_macro: {}, final score: {}'.format(
            task_idx + 1, F1_micro, F1_macro, 100.0 * (F1_micro + F1_macro) / 2))
        scores.append([F1_micro, F1_macro])

    total_score += res[2]['score'] / res[2]['cnt'] * 100
    print('task id: 3, score:{}'.format(res[2]['score'] / res[2]['cnt'] * 100))
    print('total score:', total_score)
    return total_score, scores
def get_batch(data_path, batch_id):
    """get a batch from data_path"""
    new_batch = np.load(data_path + str(batch_id) + '.npz')
    X_batch = new_batch['X']
    y_batch = new_batch['y']
    return [X_batch, y_batch]


import logging
import time
import sys
import os

import numpy as np

sys.path.append('..')
from judger import Judger

if __name__ == '__main__':
    accusation_path = '../cail_0518/accu.txt'
    law_path = '../cail_0518/law.txt'
    judger = Judger(accusation_path, law_path)
    marked_labels_list = list()
    a = []

    strtime = time.strftime("%Y-%m-%d-%H-%M", time.localtime())
    log_name = "../logs/" + strtime + ".txt"
    logging.basicConfig(handlers=[logging.FileHandler(log_name, 'w+', 'utf-8')],
                        format='%(asctime)s: %(levelname)s: %(message)s')
    logging.root.setLevel(level=logging.INFO)

    batchpath = '../data_old/predictbatch/accu/'
    tr_batches = os.listdir(batchpath)  # list of batch file names
    n_tr_batches = len(tr_batches)
    X = []
    y = []
    maxitem = 0
    allindex = 0
    count = 0
    threshold = []