Пример #1
0
    def load_file(file):
        """Parse ``file`` with lxml and return the resulting tree.

        A timestamped message is logged through ``show_time`` right before
        and right after parsing.

        Args:
            file: Source handed unchanged to ``lxml.etree.parse``.

        Returns:
            Whatever ``lxml.etree.parse`` returns for ``file``.
        """
        from lxml import etree as xml_tree
        from efficiency.log import show_time as log

        log('[Info] Loading file in')
        parsed = xml_tree.parse(file)
        log('[Info] Finished loading file in')
        return parsed
Пример #2
0
    def run_classifier(file):
        """Hand ``file`` to the classifier project and run its training.

        The file is copied to ``../1909_prac_cls/data/pubmed/data.csv`` and
        ``train.py`` is then started from ``../1909_prac_cls/``. All of this
        runs as one shell command line via ``os.system``; its exit status is
        not inspected.

        Args:
            file: Path of the CSV file to train on. It is inserted into the
                shell command verbatim, so it must not contain characters
                that are special to the shell.
        """
        import os
        from efficiency.log import show_time

        # Adjacent string literals are joined first, so ``format`` fills the
        # single placeholder in the ``cp`` part of the command.
        cmd = 'cp {} ../1909_prac_cls/data/pubmed/data.csv; ' \
              'cd ../1909_prac_cls/; ' \
              'python train.py -lower ' \
              '-data_dir data/pubmed -train_fname data.csv' \
              ''.format(file)
        os.system(cmd)
        show_time('[Info] Finished classification')
Пример #3
0
    def save_csv(txt_native, txt_non_native, file='articles_classified.csv'):
        """Write labelled texts to a two-column CSV file.

        Each item of ``txt_native`` becomes a ``('native', item)`` row and
        each item of ``txt_non_native`` a ``('non_native', item)`` row; all
        native rows come first. A summary is logged through ``show_time``.

        Args:
            txt_native: Sized iterable of texts to label ``native``.
            txt_non_native: Sized iterable of texts to label ``non_native``.
            file: Output path; an existing file is overwritten.
        """
        import csv
        from efficiency.log import show_time

        # Build the rows first so that a failure here does not leave a
        # truncated output file behind.
        writeout = [('native', line) for line in txt_native]
        writeout += [('non_native', line) for line in txt_non_native]

        # newline='' hands line-ending control to the csv module; without it,
        # platforms that translate newlines emit a blank line after each row.
        with open(file, 'w', newline='') as f:
            csv.writer(f).writerows(writeout)

        show_time('[Info] Saved {}+{} sentences to {}'.format(
            len(txt_native), len(txt_non_native), file))
Пример #4
0
def set_seed(seed=0, verbose=False):
    """Seed the available random number generators with one value.

    Always seeds ``random`` and sets the ``PYTHONHASHSEED`` environment
    variable. NumPy and PyTorch are seeded only if they can be imported; for
    PyTorch this covers the CPU generator and all CUDA devices, and cuDNN is
    switched to deterministic mode with benchmarking turned off.

    Args:
        seed: Seed value. When ``None``, the seed is taken from
            ``int(efficiency.log.show_time())``.
        verbose: If true, print the seed that is used.
    """
    import random
    import os

    if seed is None:
        from efficiency.log import show_time as current_time
        seed = int(current_time())

    if verbose:
        announcement = "[Info] seed set to: {}".format(seed)
        print(announcement)

    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)

    # NumPy is optional: skip it quietly when it is not installed.
    try:
        import numpy
        numpy.random.seed(seed)
    except ImportError:
        pass

    # PyTorch is optional as well.
    try:
        import torch
        torch.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
        cudnn = torch.backends.cudnn
        cudnn.deterministic = True
        cudnn.benchmark = False
    except ImportError:
        pass
 def set_seed(seed):
     """Seed ``random``, NumPy and PyTorch with the same value.

     PyTorch is seeded on the CPU and on all CUDA devices, and cuDNN is put
     into deterministic mode. The seed that is used is always printed.

     Args:
         seed: Seed value. ``None`` means "pick one from
             ``int(show_time())``".
     """
     # Only ``None`` stands for "no seed supplied". A plain truthiness test
     # would also discard the perfectly valid seed 0 and make such runs
     # irreproducible.
     if seed is None:
         seed = int(show_time())
     print("[Info] seed set to: {}".format(seed))
     random.seed(seed)
     np.random.seed(seed)
     torch.manual_seed(seed)
     torch.cuda.manual_seed_all(seed)
     torch.backends.cudnn.deterministic = True
Пример #6
0
def get_html(url, use_proxy=False):
    """Download ``url`` and return the response body.

    Before each request the function sleeps for the module-level
    ``interval`` and takes the next User-Agent from ``user_agents``.

    Failure handling:
      * proxied request raises: the proxy is reported to ``proxy_pool`` as
        bad and the download is retried;
      * direct request raises: a warning is logged, ``save_json()`` is
        called, ``sleeper.sleep(url)`` is awaited and the request is sent
        once more (an error in that second attempt propagates);
      * HTTP 403: same warning/save/sleep sequence, then the download is
        retried.

    Args:
        url: Address to fetch.
        use_proxy: If true, send the request through the next proxy taken
            from ``proxy_pool``.

    Returns:
        The response text when the final status code is 200, otherwise
        ``None``.
    """
    import time
    import requests
    from efficiency.log import show_time

    time.sleep(interval)
    headers = {'User-Agent': next(user_agents)}

    if use_proxy:
        proxy = next(proxy_pool)

        try:
            r = requests.get(url,
                             proxies={
                                 "http": proxy,
                                 "https": proxy
                             },
                             headers=headers)
        except Exception:
            proxy_pool.add_bad_proxy(proxy)
            # Return the retry's result; dropping it made this path always
            # yield None even when the retry succeeded.
            # NOTE(review): the retry does not pass use_proxy, so it goes
            # out without a proxy - confirm this fallback is intended.
            return get_html(url)
    else:
        try:
            r = requests.get(url, headers=headers)
        except Exception:
            # ``Exception`` rather than a bare ``except`` so Ctrl-C and
            # interpreter shutdown are not mistaken for a blocked request.
            show_time('[Warn] {} is blocked for {}s'.format(
                url, sleeper.block_secs))
            save_json()

            sleeper.sleep(url)
            r = requests.get(url, headers=headers)

    if r.status_code == 403:
        show_time('[Warn] {} is blocked for {}s'.format(
            url, sleeper.block_secs))
        save_json()

        sleeper.sleep(url)
        # Propagate the page fetched by the retry instead of discarding it.
        return get_html(url)

    if r.status_code == 200:
        return r.text
    # Any other status code: no body is returned.
    return None
Пример #7
0
    def __init__(self, files, use_all_secs=True, save_to=1):
        """Load cached articles and run the native/non-native pipeline.

        Steps, in order: load ``articles{save_to}.pickle``, filter by ids,
        split the articles by nation, extract the texts of both groups,
        post-process them, write the classification and evaluation CSV
        files, and finally start the classifier on the classification CSV.

        Args:
            files: Input files. Currently only logged; the code that parses
                them is disabled (see the commented-out section below).
            use_all_secs: Currently only logged, for the same reason.
            save_to: Suffix inserted into every file name this method reads
                or writes (``articles{}.pickle``, ``articles_native{}.txt``,
                ``articles_classified{}.csv``, ...).
        """
        import pickle
        from efficiency.log import show_time

        show_time('[Info] files: {}, use_all_secs:{}, save_to:{}'.format(
            files, use_all_secs, save_to))

        # NOTE: unpickling can execute arbitrary code; only load cache files
        # that this project produced itself.
        with open('articles{}.pickle'.format(save_to), 'rb') as f:
            self.articles = pickle.load(f)

        # Disabled: parse ``files`` from scratch and write the cache files.
        # self.articles = []
        # for file in files:
        #     doc = self.load_file(file)
        #     self.articles += self.parse(doc, file, use_all_secs=use_all_secs)
        # print('[Info] {} articles'.format(len(self.articles)))
        # self.save('articles{}.pickle'.format(save_to))
        # self.save_ids('pmids{}.txt'.format(save_to))

        self.filter_by_ids()
        # split_by_nation() yields the non-native group first.
        self.articles_non_native, self.articles_native = \
            self.split_by_nation()
        txt_native = self.get_txt(self.articles_native,
                                  'articles_native{}.txt'.format(save_to))
        txt_non_native = self.get_txt(
            self.articles_non_native,
            'articles_non_native{}.txt'.format(save_to))
        txt_native, txt_non_native, eval_native, eval_non_native = \
            self.postprocess_txt(txt_native, txt_non_native)

        self.save_csv(txt_native,
                      txt_non_native,
                      file='articles_classified{}.csv'.format(save_to))
        self.save_csv(eval_native,
                      eval_non_native,
                      file='articles_eval{}.csv'.format(save_to))
        self.run_classifier(file='articles_classified{}.csv'.format(save_to))
Пример #8
0
def set_seed(seed=0):
    """Seed ``random`` and, where importable, NumPy and PyTorch.

    For PyTorch the CPU generator and all CUDA devices are seeded and cuDNN
    is put into deterministic mode. The seed in use is always printed.

    Args:
        seed: Seed value. When ``None``, the seed is taken from
            ``int(efficiency.log.show_time())``.
    """
    import random

    if seed is None:
        from efficiency.log import show_time as current_time
        seed = int(current_time())
    banner = "[Info] seed set to: {}".format(seed)
    print(banner)

    random.seed(seed)

    # NumPy is optional: skip it quietly when it is not installed.
    try:
        import numpy
        numpy.random.seed(seed)
    except ImportError:
        pass

    # PyTorch is optional as well.
    try:
        import torch
        torch.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
        cudnn = torch.backends.cudnn
        cudnn.deterministic = True
    except ImportError:
        pass
Пример #9
0
# One list of candidate values per hyper-parameter, in the order in which the
# loop below unpacks them.
parameters = [
    hidden_sizes, p_ems, p_ins, p_rnns, p_outs, dropouts, char_methods,
    batch_sizes, char_dims, char_hidden_sizes, tag_spaces
]

# Every combination of the candidates; the whole list of combinations is
# repeated num_repeat times.
parameters = list(itertools.product(*parameters)) * num_repeat
dataset_name = 'chemdner'

for param in parameters:
    hidden_size, p_em, p_in, p_rnn, p_out, dropout, char_method, batch_size, \
    char_dim, char_hidden_size, tag_space = param

    st_time = show_time(cat_server=True)

    result_file_path = '/afs/csail.mit.edu/u/z/zhijing/proj/ie/data/run/az/hyperp_{}_{}'.format(
        dataset_name, st_time)
    # p_rnn holds (at least) two values; join the first two with a space.
    p_rnn = '{} {}'.format(p_rnn[0], p_rnn[1])

    # Log the real char_hidden_size next to its label (char_dim used to be
    # printed twice).
    log_msg = '\n{}, char_dim: {}, char_hidden_size: {}'.format(
        st_time, char_dim, char_hidden_size)
    log_msg += '\nhidden_size: {}\tp_em: {}\tp_in: {}\tp_rnn: {}\tp_out: {}\tdropout: {}\tchar_method: {}\tbatch_size: {}\n'.format(
        hidden_size, p_em, p_in, p_rnn, p_out, dropout, char_method,
        batch_size)

    print(log_msg)
    command = 'CUDA_VISIBLE_DEVICES={} python -m pdb -c continue examples/NERCRF_conll.py --cuda --mode LSTM --encoder_mode lstm --char_method {} --num_epochs 150 --batch_size {} --hidden_size {} --num_layers 1 \
				 --char_dim {} --char_hidden_size {} --tag_space 64 --max_norm 15. --gpu_id {} --results_folder results --tmp_folder tmp --alphabets_folder data/chem/alphabets \
				 --learning_rate 0.005 --decay_rate 0.01 --schedule 1 --gamma 0. --o_tag O --dataset_name {} \
Пример #10
0
]
# Every combination of the candidate values; the whole list of combinations
# is repeated num_repeat times.
parameters = list(itertools.product(*parameters)) * num_repeat

# The grid computed above is replaced right away by this single configuration.
parameters = [(450, 0.001, 1000, 5, 0.2, 0.33, (0.33, 0.5, 0.5), 0.5, 'gcn')]

dataset_name = '03conll'
results_folder = './data/run/'

for param_i, param in enumerate(parameters):
    # Unpack one configuration; the name order must match the tuple layout.
    hidden_size, learning_rate_gcn, gcn_warmup, pretrain_lstm, \
    p_em, p_in, p_rnn, p_tag, dropout = param
    # p_rnn holds three values; join them with spaces.
    p_rnn = '{} {} {}'.format(p_rnn[0], p_rnn[1], p_rnn[2])

    # Printable form of the configuration.
    # NOTE(review): del_quote presumably strips quote characters - confirm.
    misc = "{}".format(del_quote(str(param)))
    print("\n", misc, "\n")
    st_time = show_time()

    command = 'CUDA_VISIBLE_DEVICES={gpu_idx} python examples/NERCRF_conll.py ' \
              '--cuda --mode LSTM --encoder_mode lstm ' \
              '--char_method cnn --num_epochs {max_epochs} --batch_size 1 ' \
              '--hidden_size {hidden_size} --num_layers 1 ' \
              '--char_dim 30 --char_hidden_size 30 --tag_space 128 ' \
              '--max_norm 10. --gpu_id {gpu_idx} ' \
              '--alphabets_folder data/alphabets ' \
              '--learning_rate 0.01 --decay_rate 0.05 --schedule 1 ' \
              '--gamma 0. --o_tag O --dataset_name {dataset_name} ' \
              '--dropout {dropout} --p_em {p_em} --p_in {p_in} ' \
              '--p_rnn {p_rnn} --p_tag {p_tag} --unk_replace 0.0 ' \
              '--bigram ' \
              '--seed {seed} ' \
              '--learning_rate_gcn {learning_rate_gcn} --gcn_warmup {gcn_warmup} ' \