hash_length = conf.chunk_hash_length
output_length = conf.chunk_NP_length
split_rate = conf.chunk_split_rate
batch_size = conf.batch_size
nb_epoch = conf.nb_epoch

model_name = os.path.basename(__file__)[:-3]
folder_path = 'model/%s' % model_name
if not os.path.isdir(folder_path):
    os.makedirs(folder_path)

# the data, shuffled and split between train and test sets
train_data, dev_data = load_data.load_chunk(dataset='train.txt', split_rate=split_rate)
train_samples = len(train_data)
dev_samples = len(dev_data)
print('train shape:', train_samples)
print('dev shape:', dev_samples)
print()

word_embedding = pd.read_csv('../preprocessing/senna/embeddings.txt', delimiter=' ', header=None)
word_embedding = word_embedding.values
word_embedding = np.concatenate([np.zeros((1, emb_length)),
                                 word_embedding,
                                 np.random.uniform(-1, 1, (1, emb_length))])
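# Added sanity check (a minimal sketch, not in the original script): the
# concatenation above prepends one all-zero row, presumably the padding
# index 0, and appends one uniform-random row, presumably the vector used
# for out-of-vocabulary words.
assert word_embedding.shape[1] == emb_length
assert not word_embedding[0].any()  # index 0 is the all-zero padding row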
hash_length = conf.chunk_hash_length
output_length = conf.chunk_ALL_length
split_rate = conf.chunk_split_rate
batch_size = conf.batch_size
nb_epoch = 70  # conf.nb_epoch
model_name = os.path.basename(__file__)[:-3]
folder_path = 'model/%s' % model_name
if not os.path.isdir(folder_path):
    os.makedirs(folder_path)

# the data, shuffled and split between train and test sets
train_data, dev_data = load_data.load_chunk(dataset='train.txt', split_rate=split_rate, chunk_type="ALL")
train_samples = len(train_data)
dev_samples = len(dev_data)
print('train shape:', train_samples)
print('dev shape:', dev_samples)
print()

word_embedding = pd.read_csv('../preprocessing/senna/embeddings.txt', delimiter=' ', header=None)
word_embedding = word_embedding.values
word_embedding = np.concatenate([np.zeros((1, emb_length)),
                                 word_embedding,
                                 np.random.uniform(-1, 1, (1, emb_length))])

random_embedding = pd.read_csv('../preprocessing/random/chunk_embeddings.txt', delimiter=' ', header=None)
random_embedding = random_embedding.values
random_embedding = np.concatenate([np.zeros((1, hash_length)),
                                   random_embedding,
                                   np.random.rand(1, hash_length)])
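# Illustration (added sketch, hypothetical ids): rows of the padded matrix are
# looked up by integer id, so a sequence of word ids maps directly to a
# (sequence_length, hash_length) matrix.
demo_ids = np.array([0, 1, len(random_embedding) - 1])  # padding, first real row, OOV row
demo_vectors = random_embedding[demo_ids]               # shape (3, hash_length)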
np.random.seed(0)

# input sentence dimensions
step_length = conf.chunk_step_length
pos_length = conf.chunk_pos_length
IOB = conf.chunk_NP_IOB_decode
split_rate = conf.chunk_split_rate

data = sys.argv[1]
best_epoch = sys.argv[2]

if data == "dev":
    train_data, test_data = load_data.load_chunk(dataset='train.txt', split_rate=split_rate)
elif data == "test":
    test_data = load_data.load_chunk(dataset='test.txt')

tokens = [len(x[0]) for x in test_data]
print(sum(tokens))
print('%s shape:' % data, len(test_data))

model_name = os.path.basename(__file__)[9:-3]
folder_path = './model/%s' % model_name
model_path = '%s/model_epoch_%s.h5' % (folder_path, best_epoch)
result = open('%s/predict.txt' % folder_path, 'w')

print('loading model...')
model = load_model(model_path)
print('loading model finished.')
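# Usage note (inferred from the sys.argv handling above; the script name is a
# placeholder): the first argument selects the split, the second the epoch of
# the saved model_epoch_<n>.h5 checkpoint to load:
#   python <predict-script>.py dev <best_epoch>
#   python <predict-script>.py test <best_epoch>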
# add path
sys.path.append('../')
sys.path.append('../tools')
from tools import conf
from tools import load_data
from tools import prepare

# input sentence dimensions
step_length = conf.chunk_step_length
pos_length = conf.chunk_pos_length
IOB = conf.chunk_NP_IOB_decode

test_data = load_data.load_chunk(dataset='test.txt')

best_epoch = sys.argv[1]

model_name = os.path.basename(__file__)[9:-3]
folder_path = './model/%s' % model_name
model_path = '%s/model_epoch_%s.h5' % (folder_path, best_epoch)
result = open('%s/predict.txt' % folder_path, 'w')

print('loading model...')
model = load_model(model_path)
print('loading model finished.')

for each in test_data:
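    # Sketch of a typical per-sentence step (kept as comments; the helper name
    # and output format below are assumptions, not the original code):
    # features = prepare.prepare_chunk(batch=[each])       # hypothetical helper
    # tags = [IOB[i] for i in model.predict(features).argmax(axis=-1)[0]]
    # result.write(' '.join(tags) + '\n')                  # hypothetical format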
sys.path.append('../')
sys.path.append('../tools')
from tools import load_data
from tools import prepare

model_path = './model/word-hash-2-auto-encoder-128/hidden_model_epoch_26.h5'
w = open('../preprocessing/chunk-auto-encoder-2/conll2000-word.lst', 'w')
embeddings = pd.DataFrame(columns=range(128))

print('loading model...')
encoder = load_model(model_path)
print('loading model finished.')

train_data, dev_data = load_data.load_chunk(dataset='train.txt', split_rate=0.9)
test_data = load_data.load_chunk(dataset='test.txt')

# collect the vocabulary: every word from the train, dev and test splits
all_word = []
for each in train_data:
    all_word.extend(list(each[0]))
for each in dev_data:
    all_word.extend(list(each[0]))
for each in test_data:
    all_word.extend(list(each[0]))
all_word = [each.strip().lower() for each in all_word]
all_word = list(set(all_word))

for i, word in enumerate(all_word):
    w.write(word + '\n')
    word_hashing = prepare.prepare_auto_encoder(batch=[word],
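# Sketch of the presumable final step (assumption; the path is hypothetical):
# once the loop has filled `embeddings` with one encoded row per word, the
# matrix would be written out alongside the word list, e.g.:
# embeddings.to_csv('../preprocessing/chunk-auto-encoder-2/embeddings.txt',
#                   sep=' ', header=False, index=False)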