def evaluate_dataset_point(model, data, addbase):
    pred_trees = []
    gold_trees = []
    for i, inst in enumerate(data):
        pred_scores = []
        for tree in inst.kbest:
            if tree.size == inst.gold.size:
                pred_scores.append(model.predict(tree))
            else:
                print 'error'
                pred_scores.append(-1000)
        #data_util.normalize(pred_scores)
        if addbase:
            data_util.normalize(inst.scores)
            scores = [p_s + b_s for p_s, b_s in zip(pred_scores, inst.scores)]
        else:
            scores = pred_scores
        max_id = scores.index(max(scores))
        #print max_id, scores[max_id]
        for line in inst.lines[max_id]:
            pred_trees.append(line)
        pred_trees.append('\n')
        for line in inst.gold_lines:
            gold_trees.append(line)
        gold_trees.append('\n')
    res = eval_tool.evaluate(pred_trees, gold_trees)
    print 'f1score: %.4f' % (res[0])
    return res
def evaluate_dataset_point(model, data, addbase, ratio=1):
    pred_trees = []
    gold_trees = []
    for i, inst in enumerate(data):
        pred_scores = [model.predict(tree) for tree in inst.kbest
                       if tree.size == inst.gold.size]
        data_util.normalize(pred_scores)
        if addbase:
            data_util.normalize(inst.scores)
            scores = [ratio * p_s + (1 - ratio) * b_s
                      for p_s, b_s in zip(pred_scores, inst.scores)]
        else:
            scores = pred_scores
        max_id = scores.index(max(scores))
        #print "pred: %.4f base: %.4f" % (pred_scores[max_id], inst.scores[max_id])
        for line in inst.lines[max_id]:
            pred_trees.append(line)
        pred_trees.append('\n')
        for line in inst.gold_lines:
            gold_trees.append(line)
        gold_trees.append('\n')
    res = eval_tool.evaluate(pred_trees, gold_trees)
    print 'ratio: %f f1score: %.4f' % (ratio, res[0])
    return res
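# A minimal usage sketch for the ratio-interpolated variant above: sweep the
# interpolation ratio between model and baseline scores on a held-out set.
# The `model` and `dev_data` names are placeholders, not part of the original code.
def sweep_ratio(model, dev_data):
    for r in [0.0, 0.25, 0.5, 0.75, 1.0]:
        # evaluate_dataset_point already prints 'ratio: ... f1score: ...' per call
        evaluate_dataset_point(model, dev_data, addbase=True, ratio=r)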
def add_embedding(self, helper, input_placeholder):
    """Add an embedding layer of shape (None, max_length, n_features * embed_size).

    :return: the reshaped embeddings tensor
    """
    vocab = open(self.vocab_file)
    vectors = open(self.word2vec_file)
    pre_embeddings = np.array(np.random.randn(len(helper.tok2id) + 1, self.embed_size),
                              dtype=np.float32)
    pre_embeddings[0] = 0.
    for word, vec in load_word_vector_mapping(vocab, vectors).items():
        word = normalize(word)
        if word in helper.tok2id:
            pre_embeddings[helper.tok2id[word]] = vec
    logger.info("Initialized embeddings.")
    vocab.close()
    vectors.close()
    embed = tf.Variable(pre_embeddings, name="embed")
    # shape (None, max_length, n_features)
    features = tf.nn.embedding_lookup(embed, input_placeholder)
    self.embeddings = tf.reshape(features,
                                 shape=(-1, self.max_length, self.n_features * self.embed_size))
    return self.embeddings
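# The embedding loader above relies on a `load_word_vector_mapping` helper that is not
# shown. A minimal sketch of such a helper, assuming the vocab file holds one token per
# line and the vectors file holds the matching whitespace-separated floats on the same
# line; the real helper may differ.
from collections import OrderedDict

def load_word_vector_mapping(vocab_fstream, vector_fstream):
    """Pair each vocab token with the vector on the same line of the vectors file."""
    mapping = OrderedDict()
    for word, vector in zip(vocab_fstream, vector_fstream):
        mapping[word.strip()] = np.array([float(x) for x in vector.split()], dtype=np.float32)
    return mapping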
def run_policy(args):
    import gym
    env = gym.make(args.envname)
    max_steps = args.max_timesteps or env.spec.timestep_limit

    returns = []
    observations = []
    actions = []

    from policy.model import Net
    model = Net(env.observation_space.shape[0], env.action_space.shape[0])
    # Is GPU available? (explicitly disabled on the next line)
    use_gpu = torch.cuda.is_available()
    use_gpu = False
    if use_gpu:
        model = model.cuda()
    model.load_state_dict(torch.load(args.model))
    print("Using model: ", args.model)
    model.eval()

    latest_stat = pickle.load(open(data_util.get_latest('stats/*'), 'rb'))

    for i in range(args.num_rollouts):
        print('iteration:', i)
        obs = env.reset()
        done = False
        totalr = 0.
        steps = 0
        while not done:
            obs = np.array(obs, dtype='float32')
            obs = data_util.normalize(obs, *latest_stat)
            if use_gpu:
                obs = torch.from_numpy(obs)
                action = model(Variable(obs.cuda(), volatile=True)).data.cpu().numpy()
            else:
                action = model(Variable(torch.from_numpy(obs), volatile=True)).data.numpy()
            observations.append(obs)
            actions.append(action)
            obs, r, done, _ = env.step(action)
            totalr += r
            steps += 1
            if args.render:
                env.render()
            if steps % 100 == 0:
                print("%i/%i" % (steps, max_steps))
            if steps >= max_steps:
                break
        returns.append(totalr)

    print('returns', returns)
    print('mean return', np.mean(returns))
    print('std of return', np.std(returns))
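# run_policy collects `observations` and `actions` but does not persist them.
# A minimal follow-up sketch, assuming the caller wants to save the rollouts for later
# imitation-learning use; the output path and dict keys are illustrative only.
def save_rollouts(observations, actions, returns, out_path='rollouts/latest.pkl'):
    rollout_data = {
        'observations': np.array(observations),
        'actions': np.array(actions),
        'returns': returns,
    }
    with open(out_path, 'wb') as f:
        pickle.dump(rollout_data, f)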
def extract_data(self, filepath, ind_features=_PARAIND_FEAT,
                 dep_features=_PARADEP_FEAT, labels_per_sent=None,
                 labels_per_window=None):
    """Extract features, reduce dimensions with a PCA and return data.

    Exports raw- and PCA-reduced data both in arff- and numpy-format.
    """
    start = time.clock()
    self.dictVectorizer = DictVectorizer(sparse=False)
    filename = os.path.split(filepath)[1]
    directory = os.path.split(filepath)[0]
    plain_reader = PlaintextCorpusReader(
        directory, [filename],
        word_tokenizer=RegexpTokenizer("(-?\d+\.\d+)|[\w']+|[" + string.punctuation + "]"),
        sent_tokenizer=LineTokenizer(blanklines="discard"),
        encoding='utf8')

    # create new subdir for extracted data
    if _NEW_SUBDIR is not None:
        path = os.path.join(directory, _NEW_SUBDIR)
        if not os.path.exists(path):
            os.makedirs(path)
        path = os.path.join(path, os.path.splitext(filename)[0])
    else:
        path = os.path.splitext(filepath)[0]

    # filepaths for weka- and numpy-files
    arff_filepath = path + ".arff"
    arff_filepath_pca = path + "_pca95.arff"
    numpy_filepath = path + ".npy"
    numpy_filepath_pca = path + "_pca95.npy"

    paras = plain_reader.paras()
    sents = plain_reader.sents()

    # get paragraph boundaries for sliding-window
    self.boundaries = util.get_boundaries(paras)
    boundaries_backup = self.boundaries

    # check if all necessary files exist; if yes, load them and return data
    if util.files_already_exist([numpy_filepath_pca]):
        print "Features already extracted. Calculating clusters...\n"
        matrix_sklearn_pca = numpy.load(numpy_filepath_pca)
        return filepath, self.boundaries, matrix_sklearn_pca, len(sents)

    # save correct target-labels and additional info of current data
    targets_path = open(path + ".tbs", "wb")
    pickle.dump((labels_per_sent, labels_per_window, boundaries_backup,
                 len(sents), _WINDOW_SIZE, _STEP_SIZE), targets_path)

    self.data = self.extract_features(sents, _WINDOW_SIZE, _STEP_SIZE,
                                      ind_features, dep_features)
    self.all_features = self.unified_features(self.data)
    matrix_sklearn = self.feature_matrix_sklearn(self.generator_data(self.data))
    matrix_sklearn = util.normalize(matrix_sklearn)

    print "Exporting raw-data..."
    util.export_arff(matrix_sklearn, self.dictVectorizer.get_feature_names(),
                     arff_filepath, filename + "_RAW", labels_per_window,
                     file_info=None)
    numpy.save(numpy_filepath, matrix_sklearn)

    feature_names, feature_names_part = None, None
    if _DO_PCA:
        print "PCA calculation..."
        matrix_sklearn_pca, feature_names = util.pca(
            matrix_sklearn, self.dictVectorizer.get_feature_names())
        util.export_arff(matrix_sklearn_pca, feature_names, arff_filepath_pca,
                         filename + "_PCA95", labels_per_window, file_info=None)
        numpy.save(numpy_filepath_pca, matrix_sklearn_pca)
    del matrix_sklearn
    gc.collect()
    return filepath, boundaries_backup, matrix_sklearn_pca, len(sents)
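# The "_pca95" naming above suggests the PCA step keeps enough components to explain
# 95% of the variance. A minimal sketch of what `util.pca` might look like with
# scikit-learn; this is an assumption, and in particular the way feature names are
# mapped onto components may differ from the project's actual helper.
from sklearn.decomposition import PCA

def pca(matrix, feature_names, variance=0.95):
    """Reduce `matrix` to the components explaining `variance` of the data."""
    model = PCA(n_components=variance)
    reduced = model.fit_transform(matrix)
    # name each retained component after its most heavily weighted original feature
    component_names = [feature_names[abs(comp).argmax()] for comp in model.components_]
    return reduced, component_names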