def load_file(file):
    from lxml import etree
    from efficiency.log import show_time

    show_time('[Info] Loading file in')
    doc = etree.parse(file)
    show_time('[Info] Finished loading file in')
    return doc

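# Usage sketch: parse an XML dump and inspect its root element. The path
# below is a hypothetical placeholder, not a file shipped with this repo.
#   doc = load_file('data/pubmed_sample.xml')
#   print(doc.getroot().tag)  # e.g. 'PubmedArticleSet' for PubMed exports
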
def run_classifier(file):
    import os
    from efficiency.log import show_time

    # Copy the CSV into the classifier repo, then kick off training there.
    cmd = 'cp {} ../1909_prac_cls/data/pubmed/data.csv; ' \
          'cd ../1909_prac_cls/; ' \
          'python train.py -lower ' \
          '-data_dir data/pubmed -train_fname data.csv' \
          ''.format(file)
    os.system(cmd)
    show_time('[Info] Finished classification')

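# Usage sketch (assumes the sibling checkout ../1909_prac_cls exists with
# the train.py entry point invoked above):
#   run_classifier('articles_classified1.csv')
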
def save_csv(txt_native, txt_non_native, file='articles_classified.csv'):
    import csv
    from efficiency.log import show_time

    # newline='' keeps csv.writer from emitting blank rows on Windows.
    with open(file, 'w', newline='') as f:
        writeout = [('native', line) for line in txt_native]
        writeout += [('non_native', line) for line in txt_non_native]

        writer = csv.writer(f)
        writer.writerows(writeout)
    show_time('[Info] Saved {}+{} sentences to {}'.format(
        len(txt_native), len(txt_non_native), file))

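# Usage sketch with toy sentences (the file name is illustrative):
#   save_csv(['Results are shown in Table 2.'],
#            ['The result is shown in the Table 2.'],
#            file='toy_classified.csv')
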
def set_seed(seed=0, verbose=False):
    import random
    import os

    if seed is None:
        from efficiency.log import show_time
        seed = int(show_time())

    if verbose:
        print("[Info] seed set to: {}".format(seed))

    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)

    try:
        import numpy as np
        np.random.seed(seed)
    except ImportError:
        pass

    try:
        import torch
        torch.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False
    except ImportError:
        pass

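# Usage sketch: call once at program start, before models or data loaders
# are created, so Python, NumPy and PyTorch all draw from the same
# reproducible state; seed=None falls back to a time-derived seed.
#   set_seed(42, verbose=True)
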
def set_seed(seed):
    # Lightweight variant of set_seed above; assumes numpy and torch are
    # installed, so the imports are unguarded here.
    import random
    import numpy as np
    import torch
    from efficiency.log import show_time

    if not seed:
        seed = int(show_time())
    print("[Info] seed set to: {}".format(seed))
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True

def get_html(url, use_proxy=False):
    import time
    import requests
    from efficiency.log import show_time

    # `interval`, `user_agents`, `proxy_pool`, `sleeper` and `save_json`
    # are module-level helpers defined elsewhere in this file.
    time.sleep(interval)
    headers = {'User-Agent': next(user_agents)}

    if use_proxy:
        proxy = next(proxy_pool)
        try:
            r = requests.get(url, proxies={
                "http": proxy,
                "https": proxy,
            }, headers=headers)
        except Exception:
            # Retire the failing proxy and retry with the next one.
            proxy_pool.add_bad_proxy(proxy)
            return get_html(url, use_proxy=use_proxy)
    else:
        try:
            r = requests.get(url, headers=headers)
        except requests.RequestException:
            show_time('[Warn] {} is blocked for {}s'.format(
                url, sleeper.block_secs))
            save_json()
            sleeper.sleep(url)
            r = requests.get(url, headers=headers)

    if r.status_code == 403:
        # Blocked by the server: checkpoint progress, back off, then retry.
        show_time('[Warn] {} is blocked for {}s'.format(
            url, sleeper.block_secs))
        save_json()
        sleeper.sleep(url)
        return get_html(url, use_proxy=use_proxy)
    elif r.status_code == 200:
        return r.text

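# get_html relies on a handful of module-level objects. A minimal sketch
# of compatible stand-ins, inferred from the call sites above (the real
# definitions in the original repo may differ):
#   import itertools
#   interval = 1                                    # seconds between requests
#   user_agents = itertools.cycle(['Mozilla/5.0'])  # rotated per request
#   proxy_pool = ...                                # iterator with .add_bad_proxy(proxy)
#   sleeper = ...                                   # exposes .block_secs and .sleep(url)
#   save_json = ...                                 # checkpoints scraped data to disk
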
def __init__(self, files, use_all_secs=True, save_to=1):
    import pickle
    from efficiency.log import show_time

    show_time('[Info] files: {}, use_all_secs:{}, save_to:{}'.format(
        files, use_all_secs, save_to))

    # Load previously parsed articles from disk; the commented-out block
    # below re-parses them from the raw files instead.
    with open('articles{}.pickle'.format(save_to), 'rb') as f:
        self.articles = pickle.load(f)
    # self.articles = []
    # for file in files:
    #     doc = self.load_file(file)
    #     self.articles += self.parse(doc, file, use_all_secs=use_all_secs)
    # print('[Info] {} articles'.format(len(self.articles)))
    # self.save('articles{}.pickle'.format(save_to))
    # self.save_ids('pmids{}.txt'.format(save_to))

    self.filter_by_ids()
    self.articles_non_native, self.articles_native = \
        self.split_by_nation()
    txt_native = self.get_txt(self.articles_native,
                              'articles_native{}.txt'.format(save_to))
    txt_non_native = self.get_txt(
        self.articles_non_native,
        'articles_non_native{}.txt'.format(save_to))

    txt_native, txt_non_native, eval_native, eval_non_native = \
        self.postprocess_txt(txt_native, txt_non_native)
    self.save_csv(txt_native, txt_non_native,
                  file='articles_classified{}.csv'.format(save_to))
    self.save_csv(eval_native, eval_non_native,
                  file='articles_eval{}.csv'.format(save_to))
    self.run_classifier(file='articles_classified{}.csv'.format(save_to))

def set_seed(seed=0):
    import random

    if seed is None:
        from efficiency.log import show_time
        seed = int(show_time())

    print("[Info] seed set to: {}".format(seed))
    random.seed(seed)

    try:
        import numpy as np
        np.random.seed(seed)
    except ImportError:
        pass

    try:
        import torch
        torch.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
        torch.backends.cudnn.deterministic = True
    except ImportError:
        pass

parameters = [
    hidden_sizes, p_ems, p_ins, p_rnns, p_outs, dropouts, char_methods,
    batch_sizes, char_dims, char_hidden_sizes, tag_spaces
]
parameters = list(itertools.product(*parameters)) * num_repeat

dataset_name = 'chemdner'
for param in parameters:
    hidden_size, p_em, p_in, p_rnn, p_out, dropout, char_method, batch_size, \
        char_dim, char_hidden_size, tag_space = param
    st_time = show_time(cat_server=True)
    result_file_path = '/afs/csail.mit.edu/u/z/zhijing/proj/ie/data/run/az/hyperp_{}_{}'.format(
        dataset_name, st_time)
    p_rnn = '{} {}'.format(p_rnn[0], p_rnn[1])

    log_msg = '\n{}, char_dim: {}, char_hidden_size: {}'.format(
        st_time, char_dim, char_hidden_size)
    log_msg += '\nhidden_size: {}\tp_em: {}\tp_in: {}\tp_rnn: {}\tp_out: {}\t' \
               'dropout: {}\tchar_method: {}\tbatch_size: {}\n'.format(
        hidden_size, p_em, p_in, p_rnn, p_out, dropout, char_method, batch_size)
    print(log_msg)

    command = 'CUDA_VISIBLE_DEVICES={} python -m pdb -c continue examples/NERCRF_conll.py ' \
              '--cuda --mode LSTM --encoder_mode lstm --char_method {} ' \
              '--num_epochs 150 --batch_size {} --hidden_size {} --num_layers 1 ' \
              '--char_dim {} --char_hidden_size {} --tag_space 64 --max_norm 15. ' \
              '--gpu_id {} --results_folder results --tmp_folder tmp ' \
              '--alphabets_folder data/chem/alphabets ' \
              '--learning_rate 0.005 --decay_rate 0.01 --schedule 1 --gamma 0. ' \
              '--o_tag O --dataset_name {} '  # command truncated in the source

]
parameters = list(itertools.product(*parameters)) * num_repeat
# Override the grid with a single hand-picked configuration.
parameters = [(450, 0.001, 1000, 5, 0.2, 0.33, (0.33, 0.5, 0.5), 0.5, 'gcn')]

dataset_name = '03conll'
results_folder = './data/run/'
for param_i, param in enumerate(parameters):
    hidden_size, learning_rate_gcn, gcn_warmup, pretrain_lstm, \
        p_em, p_in, p_rnn, p_tag, dropout = param
    p_rnn = '{} {} {}'.format(p_rnn[0], p_rnn[1], p_rnn[2])
    misc = "{}".format(del_quote(str(param)))

    print("\n", misc, "\n")
    st_time = show_time()

    command = 'CUDA_VISIBLE_DEVICES={gpu_idx} python examples/NERCRF_conll.py ' \
              '--cuda --mode LSTM --encoder_mode lstm ' \
              '--char_method cnn --num_epochs {max_epochs} --batch_size 1 ' \
              '--hidden_size {hidden_size} --num_layers 1 ' \
              '--char_dim 30 --char_hidden_size 30 --tag_space 128 ' \
              '--max_norm 10. --gpu_id {gpu_idx} ' \
              '--alphabets_folder data/alphabets ' \
              '--learning_rate 0.01 --decay_rate 0.05 --schedule 1 ' \
              '--gamma 0. --o_tag O --dataset_name {dataset_name} ' \
              '--dropout {dropout} --p_em {p_em} --p_in {p_in} ' \
              '--p_rnn {p_rnn} --p_tag {p_tag} --unk_replace 0.0 ' \
              '--bigram ' \
              '--seed {seed} ' \
              '--learning_rate_gcn {learning_rate_gcn} --gcn_warmup {gcn_warmup} '  # command truncated in the source