Example #1
def params_setup():
    parser = argparse.ArgumentParser()
    parser.add_argument('--attention_len', type=int, default=16)
    parser.add_argument('--batch_size', type=int, default=32)
    parser.add_argument('--data_set', type=str, default='muse')
    parser.add_argument('--decay', type=int, default=0)
    parser.add_argument('--dropout', type=float, default=0.2)
    parser.add_argument('--file_output', type=int, default=1)
    parser.add_argument('--highway', type=int, default=0)
    parser.add_argument('--horizon', type=int, default=3)
    parser.add_argument('--init_weight', type=float, default=0.1)
    parser.add_argument('--learning_rate', type=float, default=1e-5)
    parser.add_argument('--max_gradient_norm', type=float, default=5.0)
    parser.add_argument('--mode', type=str, default='train')
    parser.add_argument('--model_dir', type=str, default='./models/model')
    parser.add_argument('--mts', type=int, default=1)
    parser.add_argument('--num_epochs', type=int, default=40)
    parser.add_argument('--num_layers', type=int, default=3)
    parser.add_argument('--num_units', type=int, default=338)

    para = parser.parse_args()

    if para.data_set == "muse" or para.data_set == "lpd5":
        para.mts = 0

    para.logging_level = logging.INFO

    if para.attention_len == -1:
        para.attention_len = para.max_len

    create_dir(para.model_dir)

    json_path = para.model_dir + '/parameters.json'
    json.dump(vars(para), open(json_path, 'w'), indent=4)
    return para
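Every example on this page calls a create_dir helper that is never shown. Below is a minimal sketch of what it plausibly does, inferred from how it is used here (a single path in Example #1, several path components whose joined result is reused in Examples #2 and #5); the projects' real implementations may differ.

import os

def create_dir(*paths):
    # join the given components, create the directory if it is missing, return the path
    path = os.path.join(*paths)
    os.makedirs(path, exist_ok=True)
    return path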
Example #2
    def __init__(self,
                 start_ratio=0.0,
                 end_ratio=0.98,
                 _sample_rate=1.0,
                 data_params={},
                 tokenizer_pl=[],
                 encoder_pl=[],
                 _tokenizer_dir='cdlm',
                 _dataset='cdlm'):
        # initialize variables
        self.__data_params = data_params
        self.__tokenizer_pl = tokenizer_pl
        self.__encoder_pl = encoder_pl
        self.__sample_rate = _sample_rate

        self.__tokenizer_path = os.path.join(
            create_dir(data_dir, 'tokenizer', _tokenizer_dir), 'tokenizer.pkl')
        self.__processed_dir_path = create_dir(data_dir, 'preprocessed',
                                               _dataset)

        # load data from files
        # zh_en_dict = load_json(filtered_pos_union_en_zh_dict_path)
        zh_en_dict = load_json(filtered_pos_union_zh_en_dict_path)
        zh_en_list = list(
            filter(lambda x: 'translation' in x[1] and x[1]['translation'],
                   zh_en_dict.items()))
        zh_en_list = list(
            map(lambda x: [[x[0]] * len(x[1]['translation']),
                           x[1]['translation']],
                zh_en_list))
        # data = reduce(lambda x, y: [x[0] + y[0], x[1] + y[1]], zh_en_list)

        zh_data = []
        en_data = []
        length = len(zh_en_list)
        for i, val in enumerate(zh_en_list):
            if i % 50 == 0:
                progress = float(i + 1) / length * 100.
                print('\rprogress: %.2f%% ' % progress, end='')

            zh_data += val[0]
            en_data += val[1]

        data = list(zip(zh_data, en_data))

        # shuffle the data
        random.seed(self.RANDOM_STATE)
        random.shuffle(data)

        # get tokenizer
        if os.path.isfile(self.__tokenizer_path):
            self.__tokenizer = load_pkl(self.__tokenizer_path)
        else:
            self.__tokenizer_src, self.__tokenizer_tar = list(zip(*data))
            self.get_tokenizer()

        # get the data set (train or validation or test)
        data = self.__split_data(data, start_ratio, end_ratio)

        self.gen_preprocessed_data(data, self.BATCH_SIZE_PER_FILE)
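The constructor above (like several others on this page) declares mutable default arguments such as data_params={} and tokenizer_pl=[]. That is harmless here because the defaults are only read, but Python evaluates default values once, so a shared dict or list can leak state between calls. A small sketch of the usual None idiom; the function name is hypothetical and not from the source.

def make_loader_args(data_params=None, tokenizer_pl=None, encoder_pl=None):
    # build fresh objects per call instead of sharing one default instance
    data_params = {} if data_params is None else data_params
    tokenizer_pl = [] if tokenizer_pl is None else tokenizer_pl
    encoder_pl = [] if encoder_pl is None else encoder_pl
    return data_params, tokenizer_pl, encoder_pl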
Example #3
    def __init__(self, para):
        DataGenerator.__init__(self, para)
        self.split = list(para.split_date)
        self.split_names = ["train", "validation", "test"]
        self.h = para.horizon
        self.DATA_PATH = os.path.join(self.DIRECTORY,
                                      para.data_set + str(self.h))
        create_dir(self.DATA_PATH)
        self._load(para)
        self._preprocess(para)
Example #4
    def __init__(self, para):
        DataGenerator.__init__(self, para)
        self.h = para.horizon
        self.DATA_PATH = os.path.join(self.DIRECTORY,
                                      para.data_set + str(self.h))
        create_dir(self.DATA_PATH)
        self._download_file()
        self.split = [0, 0.6, 0.8, 1]
        self.split_names = ["train", "validation", "test"]
        self._preprocess(para)
        del self.raw_dat, self.dat
Example #5
    def __init__(self,
                 _is_train,
                 _sample_rate=1.0,
                 data_params={},
                 tokenizer_pl=[],
                 encoder_pl=[],
                 _tokenizer_dir='cdlm',
                 _dataset='cdlm'):
        # initialize variables
        self.__data_params = data_params
        self.__tokenizer_pl = tokenizer_pl
        self.__encoder_pl = encoder_pl
        self.__sample_rate = _sample_rate

        self.__tokenizer_path = os.path.join(
            create_dir(data_dir, 'tokenizer', _tokenizer_dir), 'tokenizer.pkl')
        self.__processed_dir_path = create_dir(data_dir, 'preprocessed',
                                               _dataset)

        # initialize wmt news loader
        start_ratio = 0.0 if _is_train else zh_en_wmt_news.Loader.PRETRAIN_TRAIN_RATIO
        end_ratio = zh_en_wmt_news.Loader.PRETRAIN_TRAIN_RATIO if _is_train else 1.0
        zh_en_wmt_loader = zh_en_wmt_news.Loader(start_ratio, end_ratio)

        # initialize news commentary loader
        start_ratio = 0.0 if _is_train else zh_en_news_commentary.Loader.PRETRAIN_TRAIN_RATIO
        end_ratio = zh_en_news_commentary.Loader.PRETRAIN_TRAIN_RATIO if _is_train else 1.0
        zh_en_news_commentary_loader = zh_en_news_commentary.Loader(
            start_ratio, end_ratio, 0.2)

        # load the data
        zh_data, en_data = zh_en_wmt_loader.data()
        zh_data_2, en_data_2 = zh_en_news_commentary_loader.data()

        # combine data
        zh_data += zh_data_2
        en_data += en_data_2
        data = list(zip(zh_data, en_data))

        # shuffle the data
        random.seed(self.RANDOM_STATE)
        random.shuffle(data)

        # get tokenizer
        if os.path.isfile(self.__tokenizer_path):
            self.__tokenizer = load_pkl(self.__tokenizer_path)
        else:
            self.__tokenizer_src, self.__tokenizer_tar = list(zip(*data))
            self.get_tokenizer()

        self.gen_preprocessed_data(data, self.BATCH_SIZE_PER_FILE)
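The start_ratio/end_ratio ternaries above carve one corpus into a training slice and a validation slice around each loader's PRETRAIN_TRAIN_RATIO constant. A small worked illustration of the pattern, assuming a ratio of 0.9 (the constant's real value is not shown on this page):

PRETRAIN_TRAIN_RATIO = 0.9  # assumed value, for illustration only

def pretrain_ratios(is_train):
    # training reads the first portion of the corpus, validation the remainder
    start_ratio = 0.0 if is_train else PRETRAIN_TRAIN_RATIO
    end_ratio = PRETRAIN_TRAIN_RATIO if is_train else 1.0
    return start_ratio, end_ratio

print(pretrain_ratios(True))   # (0.0, 0.9)
print(pretrain_ratios(False))  # (0.9, 1.0)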
Example #6
    def __init__(self, tokenizer_dir, un_preprocess_dirs,
                 data_params={}, pretrain_params={}, encoder_pl=[]):
        # initialize variables
        self.__data_params = data_params
        self.__pretrain_params = pretrain_params
        self.__encoder_pl = encoder_pl
        self.__dirs = un_preprocess_dirs

        self.__running = True
        self.__cur_index = 0
        self.__data = []
        self.__file_list = []

        self.__tokenizer = load_pkl(get_file_path(data_dir, 'tokenizer', tokenizer_dir, 'tokenizer.pkl'))

        # get the list of all files
        for dir_name in self.__dirs:
            _dir_path = create_dir(data_dir, 'un_preprocessed', dir_name)
            self.__file_list += list(map(lambda x: os.path.join(_dir_path, x), os.listdir(_dir_path)))
        self.__len_files = len(self.__file_list)

        random.seed(self.RANDOM_STATE)
        random.shuffle(self.__file_list)

        self.start()
Example #7
    def __init__(self,
                 data_params={},
                 tokenizer_pl=[],
                 _tokenizer_dir='only_news_commentary'):
        # initialize variables
        self.__data_params = data_params
        self.__tokenizer_pl = tokenizer_pl
        self.tokenizer_dir = f'{_tokenizer_dir}_{self.__data_params["vocab_size"]}'
        self.tokenizer_path = os.path.join(
            create_dir(data_dir, 'tokenizer', self.tokenizer_dir),
            'tokenizer.pkl')

        if os.path.isfile(self.tokenizer_path):
            return

        data = self.__load_from_news_commentary()
        data += self.__load_from_wmt_news()
        data += self.__load_from_um_corpus()
        data += self.__load_from_dict()

        # shuffle the data
        random.seed(self.RANDOM_STATE)
        random.shuffle(data)

        # get tokenizer
        self.__tokenizer_src, self.__tokenizer_tar = list(zip(*data))
        self.get_tokenizer()
Example #8
    def __init__(self,
                 data_params={},
                 tokenizer_pl=[],
                 _tokenizer_dir='only_news_commentary'):
        # initialize variables
        self.__data_params = data_params
        self.__tokenizer_pl = tokenizer_pl
        self.tokenizer_dir = f'{_tokenizer_dir}_{self.__data_params["vocab_size"]}'
        self.tokenizer_path = os.path.join(
            create_dir(data_dir, 'tokenizer', self.tokenizer_dir),
            'tokenizer.pkl')

        if os.path.isfile(self.tokenizer_path):
            return

        # load data from files
        data = news_commentary.zh_en()

        data = self.__split_data(data, 0., self.NMT_TRAIN_RATIO)
        data = reduce(lambda x, y: x + y, data)

        # shuffle the data
        random.seed(self.RANDOM_STATE)
        random.shuffle(data)

        # get tokenizer
        self.__tokenizer_src, self.__tokenizer_tar = list(zip(*data))
        self.get_tokenizer()
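The reduce(lambda x, y: x + y, data) call above flattens the list of splits returned by __split_data back into one list of sentence pairs; in Python 3, reduce must be imported from functools. A tiny self-contained illustration of the same flattening:

from functools import reduce
from itertools import chain

pairs = [['a1', 'a2'], ['b1'], ['c1', 'c2']]
flat = reduce(lambda x, y: x + y, pairs)       # ['a1', 'a2', 'b1', 'c1', 'c2']
flat_fast = list(chain.from_iterable(pairs))   # same result in linear time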
Example #9
    def __init__(self, data_params={}, preprocess_zh_pl=[], tokenizer_pl=[], _tokenizer_dir='only_news_commentary'):
        # initialize variables
        self.__data_params = data_params
        self.__preprocess_zh_pl = preprocess_zh_pl
        self.__tokenizer_pl = tokenizer_pl
        self.tokenizer_dir = f'{_tokenizer_dir}_{self.__data_params["vocab_size"]}'
        self.tokenizer_path = os.path.join(create_dir(data_dir, 'tokenizer', self.tokenizer_dir), 'tokenizer.pkl')

        if os.path.isfile(self.tokenizer_path):
            return

        # load zh en data
        data = self.__load_from_news_commentary()
        data += self.__load_from_wmt_news()

        # preprocess Chinese (word segmentation)
        zh_data, en_data = list(zip(*data))
        zh_data = self.__preprocess_zh(zh_data)
        data = list(zip(zh_data, en_data))

        # load ro en data
        data += self.__load_from_ro_en()

        # shuffle the data
        random.seed(self.RANDOM_STATE)
        random.shuffle(data)

        # get tokenizer
        self.__tokenizer_src, self.__tokenizer_tar = list(zip(*data))
        self.get_tokenizer()
Example #10
    def __init__(self, _is_train, _dataset='cdlm'):
        # initialize variables
        self.__processed_dir_path = create_dir(data_dir, 'un_preprocessed', _dataset)

        # initialize wmt news loader
        start_ratio = 0.0 if _is_train else zh_en_wmt_news.Loader.PRETRAIN_TRAIN_RATIO
        end_ratio = zh_en_wmt_news.Loader.PRETRAIN_TRAIN_RATIO if _is_train else 1.0
        zh_en_wmt_loader = zh_en_wmt_news.Loader(start_ratio, end_ratio)

        # initialize news commentary loader
        start_ratio = 0.0 if _is_train else zh_en_news_commentary.Loader.PRETRAIN_TRAIN_RATIO
        end_ratio = zh_en_news_commentary.Loader.PRETRAIN_TRAIN_RATIO if _is_train else 1.0
        zh_en_news_commentary_loader = zh_en_news_commentary.Loader(start_ratio, end_ratio)

        # load the data
        zh_data, en_data = zh_en_wmt_loader.data()
        zh_data_2, en_data_2 = zh_en_news_commentary_loader.data()

        # combine data
        zh_data += zh_data_2
        en_data += en_data_2
        data = list(zip(zh_data, en_data))

        # shuffle the data
        random.seed(self.RANDOM_STATE)
        random.shuffle(data)

        self.gen_data(data, self.BATCH_SIZE_PER_FILE)
Example #11
def sess_params_setup():
    sess_parser = argparse.ArgumentParser()
    sess_parser.add_argument('--attention_len', type=int, default=16)
    sess_parser.add_argument('--batch_size', type=int, default=32)
    sess_parser.add_argument('--data_set', type=str, default='muse')
    sess_parser.add_argument('--decay', type=int, default=0)
    sess_parser.add_argument('--dropout', type=float, default=0.2)
    sess_parser.add_argument('--file_output', type=int, default=1)
    sess_parser.add_argument('--highway', type=int, default=0)
    sess_parser.add_argument('--horizon', type=int, default=3)
    sess_parser.add_argument('--init_weight', type=float, default=0.1)
    sess_parser.add_argument('--learning_rate', type=float, default=1e-5)
    sess_parser.add_argument('--max_gradient_norm', type=float, default=5.0)
    sess_parser.add_argument('--mode', type=str, default='train')
    sess_parser.add_argument('--model_dir', type=str, default='./models/model')
    sess_parser.add_argument('--mts', type=int, default=1)
    sess_parser.add_argument('--num_epochs', type=int, default=40)
    sess_parser.add_argument('--num_layers', type=int, default=3)
    sess_parser.add_argument('--num_units', type=int, default=338)

    para, unknown = sess_parser.parse_known_args()
    # para = parser.parse_args()
    para.mode = "validation"
    para.mode2 = "explain"
    para.attention_len = para.highway = 16
    para.horizon = 3
    para.data_set = "traffic"
    para.batch_size = 32
    para.learning_rate = 1e-3
    para.model_dir = "./models/traffic"
    para.num_epochs = 40
    para.num_units = 25

    if para.data_set == "muse" or para.data_set == "lpd5":
        para.mts = 0

    para.logging_level = logging.INFO

    if para.attention_len == -1:
        para.attention_len = para.max_len

    create_dir(para.model_dir)

    json_path = para.model_dir + '/parameters.json'
    json.dump(vars(para), open(json_path, 'w'), indent=4)
    return para
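Unlike params_setup() in Example #1, this variant calls parse_known_args(), which tolerates options the parser does not define; that matters when the code runs inside a notebook or another host process whose command line carries extra flags. A short self-contained demonstration (the '--f=kernel.json' option is just a stand-in for such an unknown flag):

import argparse

demo_parser = argparse.ArgumentParser()
demo_parser.add_argument('--batch_size', type=int, default=32)

# parse_args() would exit with "unrecognized arguments" here;
# parse_known_args() returns the unknown options separately instead.
para, unknown = demo_parser.parse_known_args(['--batch_size', '64', '--f=kernel.json'])
print(para.batch_size)  # 64
print(unknown)          # ['--f=kernel.json']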
Example #12
def params_setup():
    parser = argparse.ArgumentParser()
    parser.add_argument('--attention_len', type=int, default=16)
    parser.add_argument('--batch_size', type=int, default=32)
    parser.add_argument('--data_set', type=str, default='muse')
    parser.add_argument('--decay', type=int, default=0)
    parser.add_argument('--dropout', type=float, default=0.2)
    parser.add_argument('--file_output', type=int, default=1)
    parser.add_argument('--highway', type=int, default=0)
    parser.add_argument('--horizon', type=int, default=5)
    parser.add_argument('--init_weight', type=float, default=0.1)
    parser.add_argument('--learning_rate', type=float, default=1e-5)
    parser.add_argument('--max_gradient_norm', type=float, default=5.0)
    parser.add_argument('--mode', type=str, default='train')
    parser.add_argument('--initial_weights', type=str, default='')
    parser.add_argument('--model_dir', type=str, default='./models/model')
    parser.add_argument('--mts', type=int, default=1)
    parser.add_argument('--split', type=float, default=0.2)
    parser.add_argument('--num_epochs', type=int, default=40)
    parser.add_argument('--num_layers', type=int, default=3)
    parser.add_argument('--num_units', type=int, default=338)
    parser.add_argument('--first_epoch', type=int, default=1)
    parser.add_argument('--save_final_model_path', type=str, default='')
    parser.add_argument('--samples', type=int, default=1)

    para = parser.parse_args()

    if para.data_set == "muse" or para.data_set == "lpd5":
        para.mts = 0

    para.logging_level = logging.DEBUG

    if para.attention_len == -1:
        para.attention_len = para.max_len

    if not 0.01 <= para.split <= 0.5:
        para.split = 0.1
        logging.error('Split param must be in [0.01, 0.5]. Reset to 0.1')

    create_dir(para.model_dir)
    para.first_epoch = 1

    json_path = para.model_dir + '/parameters.json'
    json.dump(vars(para), open(json_path, 'w'), indent=4)
    return para
Example #13
    def __init__(self,
                 start_ratio=0.0,
                 end_ratio=0.98,
                 _sample_rate=1.0,
                 data_params={},
                 tokenizer_pl=[],
                 encoder_pl=[],
                 _tokenizer_dir='cdlm',
                 _dataset='cdlm'):
        # initialize variables
        self.__data_params = data_params
        self.__tokenizer_pl = tokenizer_pl
        self.__encoder_pl = encoder_pl
        self.__sample_rate = _sample_rate

        self.__tokenizer_path = os.path.join(
            create_dir(data_dir, 'tokenizer', _tokenizer_dir), 'tokenizer.pkl')
        self.__processed_dir_path = create_dir(data_dir, 'preprocessed',
                                               _dataset)

        # load data from files
        data = news_commentary.zh_en()

        data = self.__split_data(data, 0., self.NMT_TRAIN_RATIO)

        # shuffle the data
        random.seed(self.RANDOM_STATE)
        random.shuffle(data)

        # get tokenizer
        if os.path.isfile(self.__tokenizer_path):
            self.__tokenizer = load_pkl(self.__tokenizer_path)
        else:
            tmp_data = reduce(lambda x, y: x + y, data)
            self.__tokenizer_src, self.__tokenizer_tar = list(zip(*tmp_data))
            self.get_tokenizer()

        # get the data set (train or validation or test)
        data = self.__split_data(data, start_ratio, end_ratio)

        data = reduce(lambda x, y: x + y, data)

        self.gen_preprocessed_data(data, self.BATCH_SIZE_PER_FILE)
Example #14
    def __init__(self, _dataset='cdlm'):
        # initialize variables
        self.__processed_dir_path = create_dir(data_dir, 'un_preprocessed',
                                               _dataset)

        zh_data, en_data = um_corpus.zh_en(get_test=False)
        data = list(zip(zh_data, en_data))

        # shuffle the data
        random.seed(self.RANDOM_STATE)
        random.shuffle(data)

        self.gen_data(data, self.BATCH_SIZE_PER_FILE)
Example #15
def extract_data(**context):
    logging.info('EXECUTION_DATE: %s' % context['task_instance'].execution_date)
    time_step = (context['task_instance'].execution_date - default_args['start_date']).days + 1
    logging.info('TIME_STEP: %d' % time_step)

    with open(input_file_path, mode='rb') as input_file:
        movielens = pickle.load(input_file)
        train = movielens['train'][:time_step]
        shape = movielens['shape']

        # TODO: optimize
        # stack the (row, col, value) triplets from all time steps into flat arrays
        rows = np.concatenate([i[0] for i in train])
        cols = np.concatenate([i[1] for i in train])
        dta = np.concatenate([i[2] for i in train])
        train_data = coo_matrix((dta, (rows, cols)), shape=shape)
        logging.info('NNZ: %d' % train_data.nnz)

        create_dir(output_path)
        with open(output_path+'output_train.pickle', mode='wb') as output_file:
            pickle.dump(train_data, output_file)
            path = output_file.name

    # send the path for the next task
    context['task_instance'].xcom_push(key='time_step', value=time_step)
    context['task_instance'].xcom_push(key='path', value=path)
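The two xcom_push calls publish time_step and the pickle path for later tasks in the same Airflow DAG run. A hedged sketch of how a downstream task might read them back; the function name and the task id 'extract_data' are assumptions, not taken from the source:

import logging
import pickle

def train_model(**context):
    ti = context['task_instance']
    # task id 'extract_data' is assumed; use whatever id the upstream task has in the DAG
    time_step = ti.xcom_pull(task_ids='extract_data', key='time_step')
    path = ti.xcom_pull(task_ids='extract_data', key='path')
    with open(path, mode='rb') as input_file:
        train_data = pickle.load(input_file)
    logging.info('TIME_STEP: %d, NNZ: %d' % (time_step, train_data.nnz))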
Example #16
    def __init__(self, *args):
        # initialize variables
        self.__running = True
        self.__cur_index = 0
        self.__data = []
        self.__file_list = []
        self.__dirs = args

        # get the list of all files
        for dir_name in args:
            processed_dir_path = create_dir(data_dir, 'preprocessed', dir_name)
            self.__file_list += list(
                map(lambda x: os.path.join(processed_dir_path, x),
                    os.listdir(processed_dir_path)))
        self.__len_files = len(self.__file_list)

        random.seed(self.RANDOM_STATE)
        random.shuffle(self.__file_list)

        self.start()
Example #17
    def __init__(self, _is_train, _dataset='cdlm'):
        # initialize variables
        self.__processed_dir_path = create_dir(data_dir, 'un_preprocessed', _dataset)

        # initialize wmt news loader
        start_ratio = 0.0 if _is_train else zh_en_wmt_news.Loader.PRETRAIN_TRAIN_RATIO
        end_ratio = zh_en_wmt_news.Loader.PRETRAIN_TRAIN_RATIO if _is_train else 1.0
        zh_en_wmt_loader = zh_en_wmt_news.Loader(start_ratio, end_ratio)

        # initialize news commentary loader
        start_ratio = 0.0 if _is_train else zh_en_news_commentary.Loader.PRETRAIN_TRAIN_RATIO
        end_ratio = zh_en_news_commentary.Loader.PRETRAIN_TRAIN_RATIO if _is_train else 1.0
        zh_en_news_commentary_loader = zh_en_news_commentary.Loader(start_ratio, end_ratio)

        # load the data
        zh_data, en_data = zh_en_wmt_loader.data()
        zh_data_2, en_data_2 = zh_en_news_commentary_loader.data()

        # um corpus data is only for training
        if _is_train:
            zh_data_3, en_data_3 = um_corpus.zh_en(get_test=False)

            # combine data
            zh_data += tuple(zh_data_3)
            en_data += tuple(en_data_3)

        # combine data
        zh_data += zh_data_2
        en_data += en_data_2

        # word segmentation for zh_data
        zh_data = utils.pipeline(seg_zh_by_jieba_pipeline, zh_data)

        data = list(zip(zh_data, en_data))

        # shuffle the data
        random.seed(self.RANDOM_STATE)
        random.shuffle(data)

        self.gen_data(data, self.BATCH_SIZE_PER_FILE)
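The seg_zh_by_jieba_pipeline step above word-segments the Chinese side before it is zipped with the English side. The pipeline wrapper itself is project-specific and not shown; a minimal sketch of the underlying jieba segmentation:

import jieba

def seg_zh(sentences):
    # jieba.lcut returns a token list; join with spaces to mirror the English side
    return [' '.join(jieba.lcut(s)) for s in sentences]

print(seg_zh(['我爱自然语言处理']))  # e.g. ['我 爱 自然语言 处理'], depending on jieba's dictionary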
Example #18
    def __init__(self, start_ratio=0.0, end_ratio=0.98, _dataset='cdlm'):
        # initialize variables
        self.__processed_dir_path = create_dir(data_dir, 'un_preprocessed', _dataset)

        # load data from files
        data = news_commentary.zh_en()

        data = self.__split_data(data, 0., self.NMT_TRAIN_RATIO)

        # shuffle the data
        random.seed(self.RANDOM_STATE)
        random.shuffle(data)

        # get the data set (train or validation or test)
        data = self.__split_data(data, start_ratio, end_ratio)

        data = reduce(lambda x, y: x + y, data)

        # shuffle the data
        random.seed(self.RANDOM_STATE)
        random.shuffle(data)

        self.gen_data(data, self.BATCH_SIZE_PER_FILE)
Example #19
parser.add_argument('--custom', type=bool, default=True)
parser.add_argument('--split_date',
                    type=list,
                    default=['20181201', '20190320'])
parser.add_argument('--dataset_address',
                    type=str,
                    default='./data/raw_time_series.parquet')
parser.add_argument('--output_dir', type=str, default='./output')

#%%
para = parser.parse_args(args=[])
para.logging_level = logging.INFO
logging_config_setup(para)

#%%
create_dir(para.model_dir)
create_dir(para.output_dir)
json_path = para.model_dir + '/parameters.json'
json.dump(vars(para), open(json_path, 'w'), indent=4)

# %%
graph = tf.Graph()
# %%
graph, model, data_generator = create_graph(para)

# %%
with tf.Session(config=config_setup(), graph=graph) as sess:
    sess.run(tf.global_variables_initializer())
    load_weights(para, sess, model)
    print_num_of_trainable_parameters()
    train(para, sess, model, data_generator)
Example #20
    def _download_file(self):
        logging.info("Downloading %s dataset from Google drive..." %
                     self.para.data_set)
        create_dir(self.DATA_PATH)
        download_file_from_google_drive(self.DATASET_ID,
                                        self.DATA_FULL_PATH + ".tar")
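download_file_from_google_drive is another helper this page never shows. Below is a commonly used requests-based sketch of such a helper; this is an assumption about its behavior, and Google Drive's download flow changes over time, so the project's real version may differ.

import requests

def download_file_from_google_drive(file_id, destination):
    url = 'https://docs.google.com/uc?export=download'
    session = requests.Session()
    response = session.get(url, params={'id': file_id, 'confirm': 't'}, stream=True)
    with open(destination, 'wb') as output_file:
        # stream in chunks so a large archive is never held in memory at once
        for chunk in response.iter_content(chunk_size=32768):
            if chunk:
                output_file.write(chunk)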