Example #1
    def start(self):
        """Main training loop."""
        for i in range(CFG.num_iterations):
            print("Iteration", i + 1)

            training_data = []  # list to store self play states, pis and vs

            for j in range(CFG.num_games):
                print("Start Training Self-Play Game", j + 1)
                game = self.game.clone()  # Create a fresh clone for each game.
                self.play_game(game, training_data)

            # Save the current neural network model.
            self.net.save_model()

            # Load the recently saved model into the evaluator network.
            self.eval_net.load_model()

            # Train the network using self play values.
            self.net.train(training_data)

            # Initialize MonteCarloTreeSearch objects for both networks.
            current_mcts = MonteCarloTreeSearch(self.net)
            eval_mcts = MonteCarloTreeSearch(self.eval_net)

            evaluator = Evaluate(current_mcts=current_mcts, eval_mcts=eval_mcts,
                                 game=self.game)
            wins, losses = evaluator.evaluate()

            print("wins:", wins)
            print("losses:", losses)

            num_games = wins + losses

            if num_games == 0:
                win_rate = 0
            else:
                win_rate = wins / num_games

            print("win rate:", win_rate)

            if win_rate > CFG.eval_win_rate:
                # Save current model as the best model.
                print("New model saved as best model.")
                self.net.save_model("best_model")
            else:
                print("New model discarded and previous model loaded.")
                # Discard current model and use previous best model.
                self.net.load_model()
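
The accept/reject gate at the end of the loop is easy to factor out and unit-test on its own; a minimal sketch of the same logic (the helper name is hypothetical):

def accept_new_model(wins, losses, threshold):
    # Accept the candidate network only if it won a sufficient
    # fraction of the decisive evaluation games.
    games = wins + losses
    return games > 0 and wins / games > threshold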
Example #2
def evaluateVSM(targeEventFile, collFolder, k, relevTh, vsmClassifierFileName, topK):
    '''
    docs = []
    try:
        classifierFile = open(vsmClassifierFileName,"rb")
        classifier = pickle.load(classifierFile)
        classifierFile.close()
    except:    
        f = open(targeEventFile,'r')
        for url in f:
            url = url.strip()
            d = Document(url)
            if d:
                docs.append(d)
        f.close()
        docsTF = []
        for d in docs:
            wordsFreq = getFreq(d.getWords())
            docsTF.append(wordsFreq)
        
        classifier = VSMClassifier(docsTF,relevTh)
    
    evalres = []
    for j in range(k):
        
        fn = collFolder+str(j)+'.txt'
        f = codecs.open(fn, encoding='utf-8')
        ftext = f.read()
        r = classifier.calculate_score(ftext)[0]
        evalres.append(r)
        f.close()
    '''
    evaluator = Evaluate()
    evaluator.buildVSMClassifier(targeEventFile, vsmClassifierFileName, relevTh, topK)
    collFiles = []
    for j in range(k):
        fn = collFolder + str(j) + '.txt'
        f = codecs.open(fn, encoding='utf-8')
        ftext = f.read()
        o = myObj()
        o.text = ftext
        collFiles.append(o)
    res = evaluator.evaluateFC(collFiles)
    #f = open(collFolder+'evaluationRes_VSM.txt','w')
    #f.write('\n'.join([str(r) for r in res]))
    #f.close()
    #print sum(res)
    return res
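
Both this example and Example #5 below hand `evaluateFC` a list of `myObj` instances, but the class itself is never shown; judging from the usage, a bare container with a `text` attribute suffices. A minimal stand-in, under that assumption:

class myObj(object):
    # Minimal container assumed by evaluateFC: items only need a .text field.
    def __init__(self, text=''):
        self.text = text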
Example #3
    def _mock_evaluate(self):
        """
        Create a mock `Evaluate` class, with all methods that access the
        influxdb database stubbed out.

        Returns:
            Evaluate: An instance of the Evaluate class.
        """
        evaluate = Evaluate(self.PIT, self.TP, self.VERSION,
                            self._get_tmp_dir(), False, False)

        # Stub out the influxdb query with the mock implementation.
        evaluate.query_influx = InfluxDBMock.query_influx

        return evaluate
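
If the tests also need to assert on calls, the stub can keep `MagicMock` in the loop via `side_effect`, which delegates to the fake implementation while recording invocations. A sketch using the standard `unittest.mock` API:

from unittest.mock import MagicMock

# Delegates to the fake query while still tracking calls, so tests can
# check evaluate.query_influx.called, call_args, call_count, etc.
evaluate.query_influx = MagicMock(side_effect=InfluxDBMock.query_influx)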
Example #4
def main(args):
    if args.save_path is not None and not os.path.exists(args.save_path):
        os.makedirs(args.save_path)

    summary_writer = tf.summary.FileWriter(os.path.join(args.save_path, 'log'))
    global_steps_counter = itertools.count()  # thread-safe

    global_net = Net(S_DIM, A_DIM, 'global', args)
    num_workers = args.threads
    workers = []

    # create workers (ids start at 1, so the first worker gets the writer)
    for i in range(1, num_workers + 1):
        worker_summary_writer = summary_writer if i == 1 else None
        worker = Worker(i, make_env(args), global_steps_counter,
                        worker_summary_writer, args)
        workers.append(worker)

    saver = tf.train.Saver(max_to_keep=5)

    with tf.Session() as sess:
        coord = tf.train.Coordinator()
        if args.model_path is not None:
            print('Loading model...\n')
            ckpt = tf.train.get_checkpoint_state(args.model_path)
            saver.restore(sess, ckpt.model_checkpoint_path)
        else:
            print('Initializing a new model...\n')
            sess.run(tf.global_variables_initializer())
        print_params_nums()
        # Start each worker's work loop in a separate thread. Binding the
        # method and args directly avoids the late-binding closure bug that
        # `lambda: worker.run(...)` would have inside a loop.
        worker_threads = []
        for worker in workers:
            t = threading.Thread(target=worker.run, args=(sess, coord, saver))
            t.start()
            time.sleep(0.5)
            worker_threads.append(t)

        if args.eval_every > 0:
            evaluator = Evaluate(
                global_net, summary_writer, global_steps_counter, args)
            evaluate_thread = threading.Thread(
                target=evaluator.run, args=(sess, coord))
            evaluate_thread.start()

        coord.join(worker_threads)
Example #5
def evaluateClassifier(classifierFile, cf, k):
    
    evaluator = Evaluate()
    evaluator.buildClassifier("posFile", "negFolder", classifierFile)
    collFiles = []
    for j in range(k):
        fn = cf + str(j) + '.txt'
        f = codecs.open(fn, encoding='utf-8')
        ftext = f.read()
        o = myObj()
        o.text = ftext
        collFiles.append(o)
    res = evaluator.evaluateFC(collFiles)
    f = open(cf + 'evaluationRes_Classf.txt', 'w')
    f.write('\n'.join([str(r) for r in res]))
    f.close()
    print(sum(res))
Example #6
    def __call__(self):
        all_counts = defaultdict(dict)
        gold = sorted(Reader(open(self.gold)))
        for path in self.systems:
            system = sorted(Reader(open(path)))
            for match, per_doc, overall in Evaluate.count_all(system, gold):
                all_counts[match][path] = (per_doc, overall)

        results = [{'sys1': sys1, 'sys2': sys2,
                    'match': match,
                    'stats': self.significance(match_counts[sys1], match_counts[sys2])}
                   for sys1, sys2 in itertools.combinations(self.systems, 2)
                   for match, match_counts in sorted(all_counts.items(),
                                                     key=lambda kv: MATCHES.index(kv[0]))]

        return self.fmt(results, self.metrics)
Example #7
def compute_scores(raw_data_dir=FLAGS.raw_data, data_dir=FLAGS.data_dir,
  dataset=FLAGS.dataset, save_recommendation=FLAGS.saverec,
  train_dir=FLAGS.train_dir, test=FLAGS.test):
  
  from evaluate import Evaluation as Evaluate
  evaluation = Evaluate(raw_data_dir, test=test)
 
  R = recommend(evaluation.get_uids(), data_dir=data_dir)
  
  evaluation.eval_on(R)
  scores_self, scores_ex = evaluation.get_scores()
  mylog("====evaluation scores (NDCG, RECALL, PRECISION, MAP) @ 2,5,10,20,30====")
  mylog("METRIC_FORMAT (self): {}".format(scores_self))
  mylog("METRIC_FORMAT (ex  ): {}".format(scores_ex))
  if save_recommendation:
    name_inds = os.path.join(train_dir, "indices.npy")
    np.save(name_inds, R)  # save the recommendation matrix computed above
Example #8
    def walk_proximity(self,
                       trained=True,
                       num_walks=100,
                       walk_length=40,
                       workers=5):
        if trained:
            return np.loadtxt(self.walk_structure_embedding)
        walk_structure = utils.walk_proximity(self.graph.adj,
                                              num_walks,
                                              walk_length,
                                              workers=workers)
        print('Random walks finished...')
        loss = Evaluate(10).loss()
        auto_encoder = SparseAE(self.args, walk_structure, loss,
                                self.walk_structure_embedding)
        embedding = auto_encoder.train(parallel=False)
        return embedding
Example #9
def main():
    url = "https://race.netkeiba.com/?pid=race_old&id=n201908050411"

    html = requests.get(url)
    soup = BeautifulSoup(html.content, 'lxml')

    race_name, distance = Get_Race_Info(soup)
    print(race_name)
    link_list, horse_list = Get_Link_List(soup)

    #print(link_list)

    for link_url, horse_name in zip(link_list, horse_list):
        df = Scraping(link_url)
        print(horse_name)
        #print(df)

        ave_list = Evaluate(df, distance)
        print(ave_list)
Example #10
def main():
    prog = "python -m allennlp.run"
    subcommand_overrides = {}
    # pylint: disable=dangerous-default-value
    parser = argparse.ArgumentParser(description="Run AllenNLP",
                                     usage='%(prog)s',
                                     prog=prog)
    print(parser)

    subparsers = parser.add_subparsers(title='Commands', metavar='')

    subcommands = {
        # Default commands
        "train": Train(),
        "evaluate": Evaluate(),
        "evaluate_mlqa": Evaluate_MLQA(),
        "make-vocab": MakeVocab(),
        "fine-tune": FineTune(),
        # Superseded by overrides
        **subcommand_overrides
    }

    for name, subcommand in subcommands.items():
        subparser = subcommand.add_subparser(name, subparsers)
        subparser.add_argument('--include-package',
                               type=str,
                               action='append',
                               default=[],
                               help='additional packages to include')

    args = parser.parse_args()

    # If a subparser is triggered, it adds its work as `args.func`.
    # So if no such attribute has been added, no subparser was triggered,
    # so give the user some help.
    if 'func' in dir(args):
        # Import any additional modules needed (to register custom classes).
        for package_name in args.include_package:
            import_submodules(package_name)
        args.func(args)
    else:
        parser.print_help()
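
Each entry in `subcommands` follows a small protocol: `add_subparser` registers a parser on `subparsers` and wires up `args.func`, which is what the dispatch at the bottom checks for. A minimal sketch of that protocol with a hypothetical `hello` command:

import argparse

class Hello:
    # Hypothetical subcommand showing the protocol assumed by main().
    def add_subparser(self, name, subparsers):
        subparser = subparsers.add_parser(name, description='Print a greeting')
        subparser.set_defaults(func=lambda args: print('hello'))
        return subparser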
Example #11
def main(out_dir, input_file, input_plus, input_minus, fa_file, keep_temp,
         window, name, model, rst, threshold, penality, DB_file):

    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

    out_dir = out_dir + '/' + name
    #### Generate sliding windows
    Generate_windows(out_dir, input_file, input_plus, input_minus, fa_file,
                     keep_temp, window, name)

    data_dir = out_dir + '/data'
    data_files = glob.glob(data_dir + "/*")
    for data in data_files:
        if 'wig' in data:
            continue
        baseName = data.split('/')[-1]
        Evaluate(model, out_dir, rst, window, baseName, keep_temp)
        Scan_Forward(baseName, threshold, penality, out_dir)
        Scan_Backward(baseName, threshold, penality, out_dir)
        if (keep_temp != 'yes'):
            predict_file = out_dir + '/predict/' + baseName + '.txt'
            os.system('rm %s' % predict_file)
        Postprocess(DB_file, baseName, threshold, penality, out_dir)
        if (keep_temp != 'yes'):
            forward_file = out_dir + "/maxSum/%s.forward.%d.%d.txt" % (
                baseName, threshold, penality)
            backward_file = out_dir + "/maxSum/%s.backward.%d.%d.txt" % (
                baseName, threshold, penality)
            os.system('rm %s %s' % (forward_file, backward_file))

    out_file = '%s/%s.predicted.txt' % (out_dir, name)
    ww = open(out_file, 'w')
    ww.write('predicted_pasid\tdb_diff\tdb_pasid\tscore\n')
    ww.close()
    os.system('cat %s/maxSum/*bidirection* >>%s' % (out_dir, out_file))
    if (keep_temp != 'yes'):
        os.system('rm -rf %s/data %s/predict %s/maxSum' %
                  (out_dir, out_dir, out_dir))

    print("Job Done!")
Example #13
    def _train(self, fold_n, x_trn, y_trn, x_val, y_val, num_class=2):
        # initialize the model
        model, emb = self.create_model(num_class)
        opt = Adam(0.01)
        model.compile(optimizer=opt, loss=categorical_crossentropy)

        patient, best_score = 0, 0
        best_embedding = None
        for epoch in range(2000):
            generator = utils.batch_iter(x_trn, self.batch_size)
            for index in generator:
                if self.types == 'classes':
                    model.train_on_batch([x_trn[index]],
                                         np.eye(num_class)[y_trn[index]])
                if self.types == 'link':
                    vi, vj = x_trn[index][:, 0], x_trn[index][:, 1]
                    model.train_on_batch(
                        [vi, vj],
                        np.eye(num_class)[y_trn[index].reshape(-1).astype(
                            int)])

            if self.types == 'classes':
                y_val_pred = np.argmax(model.predict([x_val]), -1)
                micro, macro = Evaluate.f1(y_val, y_val_pred)
                print('fold_{}:,{},{}'.format(fold_n, micro, macro))
                score = micro + macro
            if self.types == 'link':
                y_val_pred = np.argmax(
                    model.predict([x_val[:, 0], x_val[:, 1]]), -1)
                score = roc_auc_score(y_val, y_val_pred)
                print('fold_{}:,{},{}'.format(fold_n, score, best_score))

            if score > best_score:
                patient = 0
                best_score = score
                best_embedding = emb.get_weights()[0]
            patient += 1
            if patient >= 50:
                break
        return best_embedding
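
The patience bookkeeping in `_train` is a standard early-stopping pattern; a sketch of the same logic lifted into a reusable helper (the class name is hypothetical):

class EarlyStopping:
    # The counter resets whenever the score improves; training stops after
    # `patience` consecutive epochs without a new best.
    def __init__(self, patience=50):
        self.patience = patience
        self.counter = 0
        self.best = float('-inf')

    def step(self, score):
        """Return True when training should stop."""
        if score > self.best:
            self.best = score
            self.counter = 0
        self.counter += 1
        return self.counter >= self.patience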
Example #14
    def __call__(self,
                 number_of_iterations=2,
                 learning_rate=0.005,
                 embedding_size=300,
                 hidden_size=100,
                 batch_size=100):
        print("Starting 'Image Retrieval' in 'GRU' mode with '" +
              self.difficulty + "' data")

        self.model_full_path = self.model_path + "/" + self.model_name + "_" + self.timestamp + "_" + str(
            learning_rate) + "_" + str(embedding_size) + ".pty"
        self.output_file_name = self.output_path + "/" + self.model_name + "_" + self.timestamp + "_" + str(
            learning_rate) + "_" + str(embedding_size) + ".csv"

        self.number_of_iterations = number_of_iterations
        self.learning_rate = learning_rate
        self.embedding_size = embedding_size
        self.hidden_size = hidden_size
        self.batch_size = batch_size
        self.model = GRU(self.nwords, self.embedding_size,
                         self.image_feature_size, self.output_vector_size,
                         self.hidden_size, self.batch_size)
        self.criterion = nn.CrossEntropyLoss()

        self.evaluate = Evaluate(self.model, self.img_features, self.minibatch,
                                 self.preprocess, self.image_feature_size,
                                 self.output_vector_size)
        print(self.model)

        self.optimizer = optim.Adam(self.model.parameters(),
                                    lr=self.learning_rate)

        self.train_loss_values = []

        self.magic()

        self.save_model()

        self.save_data()
Example #15
def test(RL):
    env = envR(show=False)
    path, cost, density, num_find_target, opt_cost = [], [], [], 0, []
    evaluate = Evaluate(rows=10, cols=10)
    train = False
    succ = 0
    print("****************************************************")
    for episode in range(100):
        pre_maps = env.reset()
        step = 0
        evaluate.set_start(start_pos=env.agent)
        evaluate.set_goals(real_pos=env.maze.food_pos[0],
                           fake_pos=env.maze.food_pos[1])
        # print("****************************************************")
        # print("EPISODE ", episode)
        # start_test = time.time()
        for step in range(100):

            action = RL.choose_action(str(pre_maps), train)

            reward, done, action_ = env.step(action)

            path.append(action_)

            step += 1
            if done:
                succ += 1
                cost, density, num_find_target, opt_cost = evaluation(
                    evaluate, cost, density, num_find_target, opt_cost, path)
                path = []
                break
            pre_maps = env.get_maps()
    print('This is ', episode, 'cost:', step, 'succ', succ)
    print('average cost:', np.mean(cost), ' average density:',
          np.mean(density), ' deceptive extent:', num_find_target / succ)
    print('optimal cost:', np.mean(opt_cost))
    print()
Example #16
class Train():
    def __init__(self, difficulty):
        self.data_path = "../data"
        self.model_path = "../models"
        self.output_path = "../outputs"
        self.difficulty = difficulty
        self.timestamp = str(int(time.time()))
        self.model_name = "regression_" + self.difficulty
        self.data = Data(difficulty=self.difficulty, data_path=self.data_path)
        (self.img_features, self.w2i, self.i2w, self.nwords, self.UNK,
         self.PAD) = self.data()
        self.train = list(self.data.get_train_data())
        self.dev = list(self.data.get_validation_data())
        self.test = list(self.data.get_test_data())
        self.image_feature_size = 2048
        self.output_vector_size = 10

    def __call__(self,
                 number_of_iterations=2,
                 learning_rate=0.005,
                 embedding_size=300):
        print("Starting 'Image Retrieval' in 'Regression' mode with '" +
              self.difficulty + "' data")

        self.model_full_path = self.model_path + "/" + self.model_name + "_" + self.timestamp + "_" + str(
            learning_rate) + "_" + str(embedding_size) + ".pty"
        self.output_file_name = self.output_path + "/" + self.model_name + "_" + self.timestamp + "_" + str(
            learning_rate) + "_" + str(embedding_size) + ".csv"

        self.number_of_iterations = number_of_iterations
        self.learning_rate = learning_rate
        self.embedding_size = embedding_size

        self.model = Regression(self.nwords, self.embedding_size,
                                self.image_feature_size,
                                self.output_vector_size)
        self.criterion = nn.MSELoss()

        self.evaluate = Evaluate(self.model, self.img_features, self.minibatch,
                                 self.preprocess, self.image_feature_size)
        print(self.model)

        self.optimizer = optim.Adam(self.model.parameters(),
                                    lr=self.learning_rate)

        self.train_loss_values = []
        self.dev_loss_values = []
        self.test_loss_values = []

        self.magic()

        self.save_model()

        self.save_data()

    def minibatch(self, data, batch_size=50):
        for i in range(0, len(data), batch_size):
            yield data[i:i + batch_size]

    def preprocess(self, batch):
        """Helper function for functional batches"""
        correct_indexes = [observation[2] for observation in batch]
        img_ids = [observation[1] for observation in batch]
        text_features = [observation[0] for observation in batch]

        #Add Padding to max len of sentence in batch
        max_length = max(map(len, text_features))
        text_features = [
            txt + [self.PAD] * (max_length - len(txt)) for txt in text_features
        ]

        #return in "stacked" format
        return text_features, img_ids, correct_indexes

    def magic(self):
        for ITER in range(self.number_of_iterations):

            random.shuffle(self.train)
            train_loss = 0.0
            start = time.time()

            for iteration, batch in enumerate(self.minibatch(self.train)):
                #Outputs matrices of batch size
                text_features, h5_ids, correct_index = self.preprocess(batch)
                lookup_text_tensor = Variable(
                    torch.LongTensor([text_features])).squeeze()

                target = np.empty([len(batch), self.image_feature_size])
                for obs, img_ids in enumerate(h5_ids):
                    target[obs] = self.img_features[img_ids[
                        correct_index[obs]]]

                target = Variable(
                    torch.from_numpy(target).type(torch.FloatTensor))

                #Run model and calculate loss
                prediction = self.model(lookup_text_tensor)
                loss = self.criterion(prediction, target)
                train_loss += loss.data[0]

                self.optimizer.zero_grad()
                self.model.zero_grad()
                loss.backward()
                self.optimizer.step()

                #if iteration % verbosity_interval == 0:
                #    print("ITERATION %r: %r: train loss/sent=%.4f, time=%.2fs" % (ITER+1, iteration, train_loss/(iteration + 1), time.time() - start))

            print(
                "ITERATION %r: train loss/sent=%.4f, time=%.2fs" %
                (ITER + 1, train_loss / len(self.train), time.time() - start))
            #print("Score on training", evaluate(train))
            #print("Score on development", evaluate(dev))
            self.train_loss_values.append(train_loss / len(self.train))
            self.dev_loss_values.append(self.evaluate.calculate_loss(self.dev))
            self.test_loss_values.append(
                self.evaluate.calculate_loss(self.test))

    def save_model(self):
        #Save model
        torch.save(self.model, self.model_full_path)
        print("Saved model has test score", self.evaluate(self.test))

    def plot(self):
        plt.plot(self.train_loss_values, label="Train loss")
        plt.plot(self.dev_loss_values, label="Validation loss")
        plt.plot(self.test_loss_values, label="Test loss")
        plt.legend(loc='best')
        plt.xlabel("Epochs")
        plt.ylabel("Loss")
        plt.title(self.model_name +
                  " - has loss with lr = %.4f, embedding size = %r" %
                  (self.learning_rate, self.embedding_size))
        plt.show()

    def save_data(self):
        file = open(self.output_file_name, "w")
        file.write(", ".join(map(str, self.train_loss_values)))
        file.write("\n")
        file.write(", ".join(map(str, self.dev_loss_values)))
        file.write("\n")
        file.write(", ".join(map(str, self.test_loss_values)))
        file.write("\n")
        file.write(str(self.evaluate(self.dev)))
        file.write("\n")
        file.close()
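
Driving the class only takes a difficulty label, since everything else is wired up in `__init__`; a minimal usage sketch (the 'easy' label is an assumption about what `Data` accepts):

trainer = Train(difficulty='easy')
trainer(number_of_iterations=2, learning_rate=0.005, embedding_size=300)
trainer.plot()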
Example #17
class DataHandler:
    evaluator = Evaluate()  # class variable: shared by every instance of the class (the calculator)
    # object composition (the instructor's encapsulation boundary)

    # class method: usable like a global function
    @classmethod  # defining this as an instance method would also be fine
    def GetRawdataInDic(cls, filename):
        rawdata = {}
        with open(filename, 'rb') as f:
            while 1:
                try:
                    data = pickle.load(f)
                except EOFError:
                    break
                rawdata.update(data)

        return rawdata

    def __init__(self, filename, clsname):
        self.rawdata = DataHandler.GetRawdataInDic(filename)
        self.clsname = clsname
        self.cache = {}  # store for values already computed
        # (compute on demand, but if a value has been computed before,
        # return the cached result without recomputing)

    def get_scores(self):  # uses caching
        if 'scores' not in self.cache:
            self.cache['scores'] = list(self.rawdata.values())

        return self.cache.get('scores')

    def get_average(self):  # uses the cache
        if 'average' not in self.cache:
            self.cache['average'] = self.evaluator.average(self.get_scores())

        return self.cache.get('average')

    def get_variance(self):  # uses the cache
        if 'variance' not in self.cache:
            vari = round(
                self.evaluator.variance(self.get_scores(), self.get_average()),
                1)
            self.cache['variance'] = vari

        return self.cache.get('variance')

    def get_standard_deviation(self):
        if 'standard_deviation' not in self.cache:
            std_dev = round(math.sqrt(self.get_variance()), 1)
            self.cache['standard_deviation'] = std_dev

        return self.cache.get('standard_deviation')

    def GetEvaluation(self):
        print('*' * 50)
        print("Score analysis for class %s" % self.clsname)
        print("Class {0} has an average of {1}, a variance of {2}, and "
              "therefore a standard deviation of {3}."
              .format(self.clsname, self.get_average(), self.get_variance(),
                      self.get_standard_deviation()))
        print('*' * 50)
        print("Overall assessment for class %s" % self.clsname)
        print('*' * 50)
        self.evaluateClass()

    def evaluateClass(self):
        avrg = self.get_average()
        std_dev = self.get_standard_deviation()

        if avrg < 50 and std_dev > 20:
            print("Scores are very low and the gap between students is very wide.")
        elif avrg > 50 and std_dev > 20:
            print("Scores are above average, but the gap between students is wide. Attention needed!")
        elif avrg < 50 and std_dev < 20:
            print("The gap between students is small, but scores are very low. Attention needed!")
        elif avrg > 50 and std_dev < 20:
            print("Scores are above average and the gap between students is small.")

#     def who_is_highest(self):
#         h_score= max(list(self.rawdata.values()))
#         for k, v in self.rawdata.items():
#             if v == h_score:
#                 return k
#     def get_highest_score(self):
#         return max(list(self.rawdata.values()))

# instructor's code

    def who_ist_highest(self):
        if 'highest' not in self.cache:
            self.cache['highest'] = reduce(
                lambda a, b: a
                if self.rawdata.get(a) > self.rawdata.get(b) else b,
                self.rawdata.keys())
        return self.cache.get('highest')

    def get_highest_score(self):
        return self.rawdata[self.who_ist_highest()]

    def who_is_lowest(self):
        if 'lowest' not in self.cache:
            self.cache['lowest'] = reduce(
                lambda a, b: a
                if self.rawdata.get(a) < self.rawdata.get(b) else b,
                self.rawdata.keys())
        return self.cache.get('lowest')

    def get_lowest_score(self):
        return self.rawdata[self.who_is_lowest()]
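
A usage sketch for the class; the pickle filename and class name here are hypothetical:

dh = DataHandler('class_2_3.p', '2-3')  # hypothetical score pickle
dh.GetEvaluation()
print(dh.who_ist_highest(), dh.get_highest_score())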
Example #18
    if fit:
        print("Fit tokenizer...")
        tokenizer = text.Tokenizer(nb_words=vocab_size)
        tokenizer.fit_on_texts(text_generator())
        print("Save tokenizer...")
        f = open(tokenizer_fname, "wb")
        cPickle.dump(tokenizer, f, protocol=cPickle.HIGHEST_PROTOCOL)
        f.close()

    else:
        print('Load tokenizer...')
        f = open(tokenizer_fname, "rb")
        tokenizer = cPickle.load(f)
        f.close()

    evaluator = Evaluate(tokenizer, words, context, average_scores) 
    sampling_table = sequence.make_sampling_table(vocab_size)

    for e in range(nb_epoch):
        print('-'*40)
        print('Epoch', e)
        print('-'*40)

        progbar = Progbar(tokenizer.document_count)
        samples_seen = 0
        losses = []
        batch_loss = []
        for i, seq in enumerate(tokenizer.texts_to_sequences_generator(text_generator())):
            # get skipgram couples for one text in the dataset
            couples, labels = skipgrams_l2c_fast(seq, vocab_size, num_senses =num_senses, window_size=4, negative_samples=1., sampling_table=sampling_table)
            if couples:
Example #19
    #posFiles = ['pos-FSU.txt','pos-Hagupit.txt','pos-LAFire.txt','pos-AirAsia.txt']
    posFiles = ['pos-FSU.txt','pos-Hagupit.txt','pos-AirAsia.txt','pos-sydneyseige.txt','pos-Charlie.txt']
    #negFolder = 'neg'
    negFiles = ['neg-FSU.txt','neg-Hagupit.txt','neg-AirAsia.txt','neg-sydneyseige.txt','neg-Charlie.txt']
    
    '''
    seedsFiles=['seedsURLs_z_501.txt','seedsURLs_z_540.txt']
    
    #posFiles = ['pos-FSU.txt','pos-Hagupit.txt','pos-AirAsia.txt']
    #negFiles = ['neg-FSU.txt','neg-Hagupit.txt','neg-AirAsia.txt']
    
    posFiles = ['pos-Charlie.txt','pos-sydneyseige.txt']
    negFiles = ['neg-Charlie.txt','neg-sydneyseige.txt']
    '''
    
    evaluator = Evaluate()
    #for i in range(3):
    noK = 10
    th = 0.75
    i = 3
    posFile = posFiles[i]
    negFile = negFiles[i]
    #modelFile = modelFile +"-"+str(i)+".txt"
    #classifierFileName = 'classifier'+posFile.split(".")[0].split('-')[1]+".p"
    vsmClassifierFileName = 'classifierVSM-'+posFile.split(".")[0].split('-')[1]+".p"
    #evaluator.buildClassifier(posFile,negFolder,classifierFileName)
    #evaluator.buildClassifier(posFile,negFile,classifierFileName)
    evaluator.buildVSMClassifier(posFile, vsmClassifierFileName,th,noK)

    v = 0
Example #20
def doEvaluate():
    eva = Evaluate()
    eva.eval()
Example #21
def main():
    args = parse_args()
    model_dir = args.model_dir
    """LOAD CONFIG FILE"""
    config_files = glob.glob(os.path.join(model_dir, '*.ini'))
    assert len(config_files) == 1, 'Put only one config file in the directory'
    config_file = config_files[0]
    config = configparser.ConfigParser()
    config.read(config_file)
    """LOGGER"""
    logger = getLogger(__name__)
    logger.setLevel(logging.INFO)
    formatter = logging.Formatter('[%(asctime)s] %(message)s')

    sh = logging.StreamHandler()
    sh.setLevel(logging.INFO)
    sh.setFormatter(formatter)
    logger.addHandler(sh)

    log_file = model_dir + 'log.txt'
    fh = logging.FileHandler(log_file)
    fh.setLevel(logging.INFO)
    fh.setFormatter(formatter)
    logger.addHandler(fh)

    logger.info('[Training start] logging to {}'.format(log_file))
    """PARAMATER"""
    embed_size = int(config['Parameter']['embed_size'])
    hidden_size = int(config['Parameter']['hidden_size'])
    dropout_ratio = float(config['Parameter']['dropout'])
    weight_decay = float(config['Parameter']['weight_decay'])
    gradclip = float(config['Parameter']['gradclip'])
    vocab_type = config['Parameter']['vocab_type']
    vocab_size = int(config['Parameter']['vocab_size'])
    """TRINING DETAIL"""
    gpu_id = args.gpu
    n_epoch = args.epoch
    batch_size = args.batch
    interval = args.interval
    """DATASET"""
    train_src_file = config['Dataset']['train_src_file']
    train_trg_file = config['Dataset']['train_trg_file']
    valid_src_file = config['Dataset']['valid_src_file']
    valid_trg_file = config['Dataset']['valid_trg_file']
    test_src_file  = config['Dataset']['test_src_file']
    correct_txt_file = config['Dataset']['correct_txt_file']

    train_data_size = dataset.data_size(train_trg_file)
    valid_data_size = dataset.data_size(valid_trg_file)
    logger.info('train size: {0}, valid size: {1}'.format(train_data_size, valid_data_size))

    if vocab_type == 'normal':
        init_vocab = {'<unk>': 0, '<s>': 1, '</s>': 2, '<eod>': 3}
        vocab = dataset.VocabNormal()
        vocab.make_vocab(train_src_file, train_trg_file, init_vocab, vocab_size, freq=0)
        dataset.save_pickle(model_dir + 'src_vocab.pkl', vocab.src_vocab)
        dataset.save_pickle(model_dir + 'trg_vocab.pkl', vocab.trg_vocab)
        sos = vocab.src_vocab['<s>']
        eos = vocab.src_vocab['</s>']
        eod = vocab.src_vocab['<eod>']

    elif vocab_type == 'subword':
        vocab = dataset.VocabSubword()
        if os.path.isfile(model_dir + 'src_vocab.sub.model') and os.path.isfile(model_dir + 'trg_vocab.sub.model'):
            vocab.load_vocab(model_dir + 'src_vocab.sub.model', model_dir + 'trg_vocab.sub.model')
        else:
            vocab.make_vocab(train_trg_file + '.sub', train_trg_file + '.sub', model_dir, vocab_size)
        sos = vocab.src_vocab.PieceToId('<s>')
        eos = vocab.src_vocab.PieceToId('</s>')
        eod = vocab.src_vocab.PieceToId('<eod>')

    src_vocab_size = len(vocab.src_vocab)
    trg_vocab_size = len(vocab.trg_vocab)
    logger.info('src_vocab size: {}, trg_vocab size: {}'.format(src_vocab_size, trg_vocab_size))

    train_iter = iterator.Iterator(train_src_file, train_trg_file, batch_size, sort=True, shuffle=True)
    # train_iter = iterator.Iterator(train_src_file, train_trg_file, batch_size, sort=False, shuffle=False)
    valid_iter = iterator.Iterator(valid_src_file, valid_trg_file, batch_size, sort=False, shuffle=False)
    evaluater = Evaluate(correct_txt_file)
    test_iter = iterator.Iterator(test_src_file, test_src_file, batch_size, sort=False, shuffle=False)
    """MODEL"""
    model = HiSeq2SeqModel(
        WordEnc(src_vocab_size, embed_size, hidden_size, dropout_ratio),
        WordDec(trg_vocab_size, embed_size, hidden_size, dropout_ratio),
        SentEnc(hidden_size, dropout_ratio),
        SentDec(hidden_size, dropout_ratio),
        sos, eos, eod)
    """OPTIMIZER"""
    optimizer = chainer.optimizers.Adam()
    optimizer.setup(model)
    optimizer.add_hook(chainer.optimizer.GradientClipping(gradclip))
    optimizer.add_hook(chainer.optimizer.WeightDecay(weight_decay))
    """GPU"""
    if gpu_id >= 0:
        logger.info('Use GPU')
        chainer.cuda.get_device_from_id(gpu_id).use()
        model.to_gpu()
    """TRAIN"""
    sum_loss = 0
    loss_dic = {}
    for epoch in range(1, n_epoch + 1):
        for i, batch in enumerate(train_iter.generate(), start=1):
            batch = vocab.convert2label(batch)
            data = converter.convert(batch, gpu_id)
            loss = optimizer.target(*data)
            sum_loss += loss.data
            optimizer.target.cleargrads()
            loss.backward()
            optimizer.update()

            if i % interval == 0:
                logger.info('E{} ## iteration:{}, loss:{}'.format(epoch, i, sum_loss))
                sum_loss = 0
        chainer.serializers.save_npz(model_dir + 'model_epoch_{}.npz'.format(epoch), model)
        # chainer.serializers.save_npz(model_dir + 'optimizer_epoch{0}.npz'.format(epoch), optimizer)

        """EVALUATE"""
        valid_loss = 0
        for batch in valid_iter.generate():
            batch = vocab.convert2label(batch)
            data = converter.convert(batch, gpu_id)
            with chainer.no_backprop_mode(), chainer.using_config('train', False):
                valid_loss += optimizer.target(*data).data
        logger.info('E{} ## val loss:{}'.format(epoch, valid_loss))
        loss_dic[epoch] = valid_loss

        """TEST"""
        output = []
        for batch in test_iter.generate():
            # batch: tuple of (list of articles, list of abstracts_sos, list of abstracts_eos)
            batch = vocab.convert2label(batch)
            data = converter.convert(batch, gpu_id)
            """
            out: [(sent, attn), (sent, attn), ...] <-バッチサイズ
            sent: decodeされた文のリスト
            attn: 各文のdecode時のattentionのリスト
            """
            with chainer.no_backprop_mode(), chainer.using_config('train', False):
                out = model.generate(data[0], data[3])
            output.extend(out)

        res_decode = []
        res_attn = []
        for o in output:
            sent, attn = o
            sentence = dataset.to_list(sent)
            sentence = dataset.eod_truncate(sentence, eod)
            sent_num = len(sentence)
            sentence = [dataset.eos_truncate(s, eos) for s in sentence]
            sentence = [vocab.label2word(s) for s in sentence]
            sentence = dataset.join_sentences(sentence)
            res_decode.append(sentence)
            attn = np.sum(np.array(attn[:sent_num]), axis=0) / sent_num
            res_attn.append(attn)

        rank_list = evaluater.rank(res_attn)
        single = evaluater.single(rank_list)
        multiple = evaluater.multiple(rank_list)
        logger.info('E{} ## precision'.format(epoch))
        logger.info('single: {} | {}'.format(single[0], single[1]))
        logger.info('multi : {} | {}'.format(multiple[0], multiple[1]))

        with open(model_dir + 'model_epoch_{}.hypo'.format(epoch), 'w') as f:
            for r in res_decode:
                f.write(r + '\n')
        with open(model_dir + 'model_epoch_{}.attn'.format(epoch), 'w') as f:
            for r in res_attn:
                f.write('{}\n'.format(r))
        with open(model_dir + 'model_epoch_{}.prec'.format(epoch), 'w') as f:
            f.write('single\n')
            f.write(single[0] + '\n')
            f.write(single[1] + '\n')
            f.write('multiple\n')
            f.write(multiple[0] + '\n')
            f.write(multiple[1] + '\n')

    """MODEL SAVE"""
    best_epoch = min(loss_dic, key=(lambda x: loss_dic[x]))
    logger.info('best_epoch:{0}'.format(best_epoch))
    chainer.serializers.save_npz(model_dir + 'best_model.npz', model)
Example #22
def test(test_filename, time_now, title, **kwargs):
    # hyperparameter settings
    network_name = get_value_or_default(kwargs, 'network', default='LSTM')
    affect = get_value_or_default(kwargs, 'affect', default=30)
    filename = test_filename
    column = get_value_or_default(kwargs, 'column', default='ClPr')
    index_col = 'TrdDt'
    batch_size = 1
    plot_name = get_value_or_default(kwargs, 'plot_name', default=['fig1', ])

    # load the data
    data = Action.generate_df(
        filename,
        column,
        index_col,
        affect
    )
    data_loader = DataLoader(data['dataset'], batch_size=batch_size, shuffle=False)

    net = torch.load('save/{}.pt'.format(network_name))

    predict = list()
    for tx, ty in data_loader:
        output = net(tx.reshape(1, batch_size, affect))
        output = output.reshape(1).detach()
        predict.append(float(output) * data['std'] + data['mean'])

    plt1 = Plot(1, time_now, network_name)
    plt1.plot(data['index'], data['real_data'][affect:], 'real data')
    plt1.plot(data['index'], predict, 'predict data')
    plt1.title(title, zh=True)
    plt1.xylabel('Datetime', 'price')
    plt1.save(plot_name[0])
    # Plot.show()
    Plot.cla()

    evaluator = Evaluate(title, data['real_data'][affect:], predict)

    logger = Logger('test.log')
    basic_info = 'tested {}.'.format(network_name)
    logger.set_log(basic_info,
                   t=time_now,
                   filename=filename,
                   column=column,
                   affect_days=affect,
                   network=net,
                   plot_name=plot_name,
                   MSELoss=evaluator.MSELoss(),
                   DA=evaluator.DA(),
                   Theil=evaluator.Theil_U(),
                   L1Loss=evaluator.L1Loss(),
                   Customize=evaluator.customize(),
                   title=title,
                   MAPE=evaluator.MAPE(),
                   R=evaluator.R()
                   )
    f_out = open('log/{}.txt'.format(title), 'w')
    print('{} = {}'.format('time', time_now),
          '{} = {}'.format('MSELoss', evaluator.MSELoss()),
          '{} = {}'.format('DA', evaluator.DA()),
          '{} = {}'.format('Theil_U', evaluator.Theil_U()),
          '{} = {}'.format('L1Loss', evaluator.L1Loss()),
          '{} = {}'.format('MAPE', evaluator.MAPE()),
          '{} = {}'.format('R', evaluator.R()),
          file=f_out,
          sep='\n')
    f_out.close()
    return evaluator
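
For reference, the `MAPE` value logged above is conventionally the mean absolute percentage error; a minimal sketch under the assumption that `Evaluate.MAPE` follows the standard definition:

import numpy as np

def mape(real, pred):
    # Mean absolute percentage error, in percent; assumes no zero targets.
    real = np.asarray(real, dtype=float)
    pred = np.asarray(pred, dtype=float)
    return float(np.mean(np.abs((real - pred) / real)) * 100.0)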
Example #23
    def evaluate(self):
        evaluator = Evaluate(self.arch, self.params, self.train_dir)
        return evaluator
Example #24
#!/usr/bin/env python
from evaluate import Evaluate

if __name__ == '__main__':
    ev = Evaluate(timesteps=2000)

    exp = './'
    filename = exp + '/output_truth.txt'
    truth = ev.get_data(filename)

    filename = exp + '/output_bckgd.txt'
    analy = ev.get_data(filename)

    #  print('file  :',filename)
    #  print('truth : ',truth.shape,' analysis ',analy.shape)
    ev.plot_state(truth, analy)
Example #25
def main(train_file_to_use, test_file_to_use, test_type, features_combination_list, lamda, comp):
    # for perm in itertools.combinations(features_combination_list_sub, 4):
    #    features_combination_list.append(list(perm))

    # start all combination of features
    for features_combination in features_combination_list:

        print('{}: Start creating MEMM for features : {}'.format(time.asctime(time.localtime(time.time())),
                                                                 features_combination))
        logging.info('{}: Start creating MEMM for features : {}'.format(time.asctime(time.localtime(time.time())),
                                                                        features_combination))
        train_start_time = time.time()
        memm_class = MEMM(directory, train_file_to_use, features_combination)

        logging.info('{}: Finish MEMM for features : {}'.format(time.asctime(time.localtime(time.time())),
                                                                features_combination))
        print('{}: Finish MEMM for features : {}'.format(time.asctime(time.localtime(time.time())),
                                                         features_combination))

        print('{}: Start gradient for features : {} and lambda: {}'.
              format(time.asctime(time.localtime(time.time())), features_combination, lamda))
        logging.info('{}: Start gradient for features : {} and lambda: {}'.
                     format(time.asctime(time.localtime(time.time())), features_combination, lamda))
        gradient_class = Gradient(model=memm_class, lambda_value=lamda)
        gradient_result = gradient_class.gradient_descent()

        train_run_time = (time.time() - train_start_time) / 60.0
        print('{}: Finish gradient for features : {} and lambda: {}. run time: {}'.
              format(time.asctime(time.localtime(time.time())), features_combination, lamda, train_run_time))
        logging.info('{}: Finish gradient for features : {} and lambda: {}. run time: {}'.
                     format(time.asctime(time.localtime(time.time())), features_combination, lamda, train_run_time))

        weights = gradient_result.x
        #   np.savetxt(gradient_file, weights, delimiter=",")

        viterbi_start_time = time.time()
        print('{}: Start viterbi'.format((time.asctime(time.localtime(time.time())))))
        viterbi_class = viterbi(memm_class, data_file=test_file_to_use, w=weights)
        viterbi_result = viterbi_class.viterbi_all_data
        viterbi_run_time = (time.time() - viterbi_start_time) / 60.0
        print('{}: Finish viterbi. run time: {}'.format((time.asctime(time.localtime(time.time()))), viterbi_run_time))
        logging.info('{}: Finish viterbi. run time: {}'.format((time.asctime(time.localtime(time.time()))),
                                                               viterbi_run_time))

        write_file_name = datetime.now().strftime(directory + 'file_results/result_MEMM_most_common_tags_' + test_type +
                                                  '%d_%m_%Y_%H_%M.wtag')
        confusion_file_name = datetime.now().strftime(directory + 'confusion_files/CM_MEMM_most_common_tags_' + test_type +
                                                      '%d_%m_%Y_%H_%M.xls')
        evaluate_class = Evaluate(memm_class, test_file_to_use, viterbi_result, write_file_name,
                                  confusion_file_name, comp=comp)
        if not comp:
            word_results_dictionary = evaluate_class.run()
        else:
            evaluate_class.write_result_doc()
        logging.info('{}: The model hyper parameters: \n lambda:{} \n test file: {} \n train file: {}'
                     .format(time.asctime(time.localtime(time.time())), lamda, test_file_to_use, train_file_to_use))
        logging.info('{}: Related results files are: \n {} \n {}'.
                     format(time.asctime(time.localtime(time.time())), write_file_name, confusion_file_name))

        # print(word_results_dictionary)
        summary_file_name = '{0}analysis/summary_{1}_{2.day}_{2.month}_{2.year}_{2.hour}_{2.minute}.csv' \
            .format(directory, test_type, datetime.now())
        evaluate_class.create_summary_file(lamda, features_combination, test_file_to_use, train_file_to_use,
                                           summary_file_name, gradient_class.file_name, comp)

        logging.info('{}: Following Evaluation results for features {}'.
                     format(time.asctime(time.localtime(time.time())), features_combination))
        if not comp:
            logging.info('{}: Evaluation results are: \n {} \n'.format(time.asctime(time.localtime(time.time())),
                                                                       word_results_dictionary))
        logging.info('-----------------------------------------------------------------------------------')
Example #26
def load_model():
    s = Summarizer()
    e = Evaluate()
    return s, e
Example #27
"""
Created on Tue Jun 16 17:57:09 2015

@author: Paco
"""

from utils import Utils
from evaluate import Evaluate
from metrics import Metrics

# Load data
u = Utils()
train_hard = u.load_matrix('data/data_train_difficile.mat')

#generate pairs
pairs_idx, pairs_label = u.generate_pairs(train_hard['label'], 1000, 0.1)

# Calculate distance
m = Metrics()
dist = m.braycurtis_dist(train_hard['X'], pairs_idx)

# Evaluate model
e = Evaluate()
e.evaluation(pairs_label, dist)
# display results
e.display_roc()
e.hard_score()

# Evaluate test dataset and save it
test_hard = u.load_matrix('data/data_test_difficile.mat')
dist_test = m.braycurtis_dist(test_hard['X'], test_hard['pairs'])
u.save_test(dist_test, filetxt='soumission_dur.txt')
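
`Metrics.braycurtis_dist` presumably applies the standard Bray-Curtis dissimilarity to each index pair; the single-pair form is available directly in SciPy for comparison:

import numpy as np
from scipy.spatial.distance import braycurtis

u = np.array([1.0, 2.0, 3.0])
v = np.array([2.0, 2.0, 1.0])
# braycurtis(u, v) = sum(|u - v|) / sum(|u + v|)
print(braycurtis(u, v))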
Example #28
def trainer(epochs, model, optimizer, scheduler, train_dataloader,
            test_dataloader, batch_train, batch_test, device):

    max_grad_norm = 1.0
    train_loss_set = []

    for e in trange(epochs, desc="Epoch"):

        while gc.collect() > 0:
            pass

        # Training
        # Set our model to training mode (as opposed to evaluation mode)
        model.train()

        # if e > 8:
        #     model.freeze_bert()

        # Tracking variables
        tr_loss = 0
        nb_tr_examples, nb_tr_steps = 0, 0

        # Train the data for one epoch
        for step, batch in enumerate(train_dataloader):
            # Add batch to GPU
            batch = tuple(t.to(device) for t in batch)
            # Unpack the inputs from our dataloader
            b_input_ids, b_input_mask, b_adj, b_adj_mwe, b_labels, b_target_idx, _ = batch

            # Clear out the gradients (by default they accumulate)
            optimizer.zero_grad()
            # Forward pass
            ### For BERT + GCN and MWE
            loss = model(b_input_ids.to(device), adj=b_adj, adj_mwe=b_adj_mwe,
                         attention_mask=b_input_mask.to(device),
                         labels=b_labels, batch=batch_train,
                         target_token_idx=b_target_idx.to(device))

            train_loss_set.append(loss.item())
            # Backward pass
            loss.backward(retain_graph=True)
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
            # Update parameters and take a step using the computed gradient
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()

            # Update tracking variables
            tr_loss += loss.item()
            nb_tr_examples += b_input_ids.size(0)
            nb_tr_steps += 1

        print("Train loss: {}".format(tr_loss / nb_tr_steps))

        # Validation

        # Put model in evaluation mode to evaluate loss on the validation set
        model.eval()

        all_preds = torch.FloatTensor()
        all_labels = torch.LongTensor()
        test_indices = torch.LongTensor()

        # Evaluate data for one epoch
        for batch in test_dataloader:
            # Add batch to GPU
            batch = tuple(t.to(device) for t in batch)
            # Unpack the inputs from our dataloader
            b_input_ids, b_input_mask, b_adj, b_adj_mwe, b_labels, b_target_idx, test_idx = batch
            # Telling the model not to compute or store gradients, saving memory and speeding up validation
            with torch.no_grad():
                # Forward pass, calculate logit predictions
                ### For BERT + GCN and MWE
                logits = model(b_input_ids.to(device), adj=b_adj,
                               adj_mwe=b_adj_mwe,
                               attention_mask=b_input_mask.to(device),
                               batch=batch_test,
                               target_token_idx=b_target_idx.to(device))

                # Move logits and labels to CPU
                logits = logits.detach().cpu()
                label_ids = b_labels.cpu()
                test_idx = test_idx.cpu()

                all_preds = torch.cat([all_preds, logits])
                all_labels = torch.cat([all_labels, label_ids])
                test_indices = torch.cat([test_indices, test_idx])

    scores = Evaluate(all_preds, all_labels)
    print('scores.accuracy()={}\nscores.precision_recall_fscore()={}'.format(
        scores.accuracy(), scores.precision_recall_fscore()))

    return scores, all_preds, all_labels, test_indices
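
`scores.accuracy()` presumably reduces the collected logits against the gold labels; a minimal sketch of that reduction, assuming argmax decoding (the helper name is hypothetical):

import torch

def accuracy_from_logits(logits, labels):
    # Fraction of examples whose highest-scoring class matches the label.
    preds = logits.argmax(dim=-1)
    return (preds == labels).float().mean().item()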
Example #29
class Segmenter(object):
    def __init__(self, hdfs_client, flags):
        self.train_is_alive = False
        self.hdfs_client = hdfs_client
        self.flags = flags
        self.data_utils = DataUtils()

    def update_config(self):
        config_path = os.path.join(self.flags.raw_data_path, 'config.json')
        try:
            with open(config_path, encoding='utf-8', mode='r') as data_file:
                config_json = json.load(data_file)
                # these options are independent, so apply every key present
                # in the config
                if 'use_lstm' in config_json:
                    self.flags.use_lstm = config_json['use_lstm']
                if 'use_dynamic_rnn' in config_json:
                    self.flags.use_dynamic_rnn = config_json['use_dynamic_rnn']
                if 'use_bidirectional_rnn' in config_json:
                    self.flags.use_bidirectional_rnn = config_json[
                        'use_bidirectional_rnn']
                if 'vocab_drop_limit' in config_json:
                    self.flags.vocab_drop_limit = config_json[
                        'vocab_drop_limit']
                if 'batch_size' in config_json:
                    self.flags.batch_size = config_json['batch_size']
                if 'num_steps' in config_json:
                    self.flags.num_steps = config_json['num_steps']
                if 'num_layer' in config_json:
                    self.flags.num_layer = config_json['num_layer']
                if 'embedding_size' in config_json:
                    self.flags.embedding_size = config_json['embedding_size']
                if 'learning_rate' in config_json:
                    self.flags.learning_rate = config_json['learning_rate']
                if 'learning_rate_decay_factor' in config_json:
                    self.flags.learning_rate_decay_factor = config_json[
                        'learning_rate_decay_factor']
                if 'keep_prob' in config_json:
                    self.flags.keep_prob = config_json['keep_prob']
                if 'clip_norm' in config_json:
                    self.flags.clip_norm = config_json['clip_norm']
        except:
            raise Exception('ERROR: config.json content invalid')

    def train(self):
        self.hdfs_client.hdfs_download(
            os.path.join(self.flags.input_path, 'train.txt'),
            os.path.join(self.flags.datasets_path, 'train.txt'))
        self.hdfs_client.hdfs_download(
            os.path.join(self.flags.input_path, 'test.txt'),
            os.path.join(self.flags.datasets_path, 'test.txt'))

        self.data_utils.label_segment_file(
            os.path.join(self.flags.datasets_path, 'train.txt'),
            os.path.join(self.flags.datasets_path, 'label_train.txt'))
        self.data_utils.label_segment_file(
            os.path.join(self.flags.datasets_path, 'test.txt'),
            os.path.join(self.flags.datasets_path, 'label_test.txt'))

        self.data_utils.split_label_file(
            os.path.join(self.flags.datasets_path, 'label_train.txt'),
            os.path.join(self.flags.datasets_path, 'split_train.txt'))
        self.data_utils.split_label_file(
            os.path.join(self.flags.datasets_path, 'label_test.txt'),
            os.path.join(self.flags.datasets_path, 'split_test.txt'))

        words_vocab, labels_vocab = self.data_utils.create_vocabulary(
            os.path.join(self.flags.datasets_path, 'split_train.txt'),
            self.flags.vocab_path, self.flags.vocab_drop_limit)

        train_word_ids_list, train_label_ids_list = self.data_utils.file_to_word_ids(
            os.path.join(self.flags.datasets_path, 'split_train.txt'),
            words_vocab, labels_vocab)
        test_word_ids_list, test_label_ids_list = self.data_utils.file_to_word_ids(
            os.path.join(self.flags.datasets_path, 'split_test.txt'),
            words_vocab, labels_vocab)

        tensorflow_utils = TensorflowUtils()
        tensorflow_utils.create_record(
            train_word_ids_list, train_label_ids_list,
            os.path.join(self.flags.tfrecords_path, 'train.tfrecords'))
        tensorflow_utils.create_record(
            test_word_ids_list, test_label_ids_list,
            os.path.join(self.flags.tfrecords_path, 'test.tfrecords'))

        self.hdfs_client.hdfs_upload(
            self.flags.vocab_path,
            os.path.join(self.flags.output_path,
                         os.path.basename(self.flags.vocab_path)))

        train = Train()
        train.train()

    def upload_tensorboard(self):
        hdfs_tensorboard_path = os.path.join(
            self.flags.output_path,
            os.path.basename(os.path.normpath(self.flags.tensorboard_path)))
        temp_hdfs_tensorboard_path = hdfs_tensorboard_path + '-temp'
        self.hdfs_client.hdfs_upload(self.flags.tensorboard_path,
                                     temp_hdfs_tensorboard_path)
        self.hdfs_client.hdfs_delete(hdfs_tensorboard_path)
        self.hdfs_client.hdfs_mv(temp_hdfs_tensorboard_path,
                                 hdfs_tensorboard_path)

    def log_monitor(self):
        while self.train_is_alive:
            time.sleep(120)
            self.upload_tensorboard()

    def upload_model(self):
        predict = Predict()
        predict.saved_model_pb()

        hdfs_checkpoint_path = os.path.join(
            self.flags.output_path,
            os.path.basename(os.path.normpath(self.flags.checkpoint_path)))
        hdfs_saved_model_path = os.path.join(
            self.flags.output_path,
            os.path.basename(os.path.normpath(self.flags.saved_model_path)))

        temp_hdfs_checkpoint_path = hdfs_checkpoint_path + '-temp'
        temp_hdfs_saved_model_path = hdfs_saved_model_path + '-temp'

        self.hdfs_client.hdfs_upload(self.flags.checkpoint_path,
                                     temp_hdfs_checkpoint_path)
        self.hdfs_client.hdfs_upload(self.flags.saved_model_path,
                                     temp_hdfs_saved_model_path)

        self.hdfs_client.hdfs_delete(hdfs_checkpoint_path)
        self.hdfs_client.hdfs_delete(hdfs_saved_model_path)

        self.hdfs_client.hdfs_mv(temp_hdfs_checkpoint_path,
                                 hdfs_checkpoint_path)
        self.hdfs_client.hdfs_mv(temp_hdfs_saved_model_path,
                                 hdfs_saved_model_path)

    def evaluate(self):
        shutil.rmtree(self.flags.vocab_path)
        shutil.rmtree(self.flags.checkpoint_path)

        self.hdfs_client.hdfs_download(
            os.path.join(self.flags.input_path,
                         os.path.basename(self.flags.vocab_path)),
            self.flags.vocab_path)
        self.hdfs_client.hdfs_download(
            os.path.join(self.flags.input_path, 'test.txt'),
            os.path.join(self.flags.datasets_path, 'test.txt'))
        hdfs_checkpoint_path = os.path.join(
            self.flags.input_path,
            os.path.basename(self.flags.checkpoint_path))
        self.hdfs_client.hdfs_download(hdfs_checkpoint_path,
                                       self.flags.checkpoint_path)

        self.data_utils.label_segment_file(
            os.path.join(self.flags.datasets_path, 'test.txt'),
            os.path.join(self.flags.datasets_path, 'label_test.txt'))
        self.data_utils.split_label_file(
            os.path.join(self.flags.datasets_path, 'label_test.txt'),
            os.path.join(self.flags.datasets_path, 'split_test.txt'))

        predict = Predict()
        predict.file_predict(
            os.path.join(self.flags.datasets_path, 'split_test.txt'),
            os.path.join(self.flags.datasets_path, 'test_predict.txt'))

        self.model_evaluate = Evaluate()
        self.model_evaluate.evaluate(
            os.path.join(self.flags.datasets_path, 'test_predict.txt'),
            os.path.join(self.flags.datasets_path, 'test_evaluate.txt'))

        self.hdfs_client.hdfs_delete(
            os.path.join(self.flags.output_path, 'test_evaluate.txt'))
        self.hdfs_client.hdfs_upload(
            os.path.join(self.flags.datasets_path, 'test_evaluate.txt'),
            os.path.join(self.flags.input_path, 'test_evaluate.txt'))
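
    # Flow of evaluate() above: pull the vocab, test set and checkpoint down
    # from HDFS, rebuild the labelled and split test files, run Predict over
    # them, score the output with Evaluate, then push the report back to HDFS.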
Пример #30
0
            p2 = os.path.join(path, "a-" + file)
            al = align.face_features(p, p2)
            ev = utils.parse_evaluate(al, args.parsing_checkpoint, cuda=cuda)
            p = os.path.join(path, "b-" + file)
            cv2.imwrite(p, ev)
            ev = 255 - utils.img_edge(ev)
            p = os.path.join(path, "c-" + file)
            cv2.imwrite(p, ev)
    elif args.phase == "dataset":
        dataset = FaceDataset(args, "test")
        dataset.pre_process(cuda)
    elif args.phase == "preview":
        log.info("preview picture")
        path = "../export/regular/model.jpg"
        img = cv2.imread(path)
        img2 = utils.parse_evaluate(img, args.parsing_checkpoint, cuda)
        img3 = utils.img_edge(img2)
        img3_ = ops.fill_grey(img3)
        img4 = align.face_features(path)
        log.info("{0} {1} {2} {3}".format(img.shape, img2.shape, img3_.shape,
                                          img4.shape))
        ops.merge_4image(img, img2, img3_, img4, show=True)
    elif args.phase == "evaluate":
        log.info("evaluation mode start")
        evl = Evaluate(args, cuda=cuda)
        img = cv2.imread(args.eval_image).astype(np.float32)
        x_ = evl.itr_train(img)
        evl.output(x_, img)
    else:
        log.error("not known phase %s", args.phase)
Пример #31
0
                inp[el] = self.means[0, el]

        # convert the input list to a numpy matrix and normalise it
        inp = np.matrix([inp])
        inp = (inp - self.means) / self.std

        # get a result and prediction using the logistic function
        result = (self.coef * inp.T)[0, 0] + self.bias

        prediction = 1.0 / (1 + np.exp(-result))

        assert prediction <= 1 and prediction >= 0
        return prediction
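
# Hedged sketch of the scoring step above as a standalone helper (means, std,
# coef and bias mirror the numpy-matrix attributes used in the class):
import numpy as np

def logistic_score(inp, means, std, coef, bias):
    inp = (np.matrix([inp]) - means) / std   # normalise the feature row
    result = (coef * inp.T)[0, 0] + bias     # linear score
    return 1.0 / (1 + np.exp(-result))       # squash through the logistic function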


# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #

if __name__ == "__main__":
    m = LogReg()
    predictions = m.make_prediction_set(
        2000,
        2016,
        "2000-2016 (it9)",
        testStart=2015,
        slamsOnly=False,
        trainName='LogReg_final',
        inputProcessedFile="2000-2016 (it9)-processed")
    e = Evaluate(predictions)
    print('Final log reg model')
    e.display_summary()
Пример #32
0
        ftext.close()
    f.close()
    furl.close()
    
    res = evaluator.evaluateFC(rp)
    writeEvaluation(res,evalFilename)    
    print sum(res)
    print len(res)

if __name__ == "__main__":
    
    seedsFiles = ['Output-CharlestonShooting.txt', 'seeds-Sandra.txt',
                  'Output-tunisiaHotelAttack.txt', 'Output-samesexmarriage.txt',
                  'Output-fifaArrests.txt', 'Output-boatCapsized.txt',
                  'Output-nepalEarthquake.txt', 'seeds_459.txt', 'seeds_474.txt',
                  'seedsURLs_z_534.txt', 'seedsURLs_z_501.txt',
                  'seedsURLs_z_540.txt']
    posFiles = ['charlestonShootingPos.txt', 'evaluate-SandraBland.txt',
                'pos-tunisiaHotelAttack.txt', 'pos-samesexmarriage.txt',
                'Output-fifaArrests.txt', 'Output-boatCapsized.txt',
                'Output-nepalEarthquake.txt', 'pos-FSU.txt', 'pos-Hagupit.txt',
                'pos-AirAsia.txt', 'pos-sydneyseige.txt', 'pos-Charlie.txt']
    negFiles = ['charlestonShootingNeg.txt', 'neg-FSU.txt', 'neg-Hagupit.txt',
                'neg-AirAsia.txt', 'neg-sydneyseige.txt', 'neg-Charlie.txt']
    modelFiles = ['Output-CharlestonShooting.txt', 'model-SandraBland.txt',
                  'model-tunisiaHotelAttack.txt', 'model-samesexmarriage.txt',
                  'model-CharlestonShooting.txt']
    evaluator = Evaluate()
    #for i in range(3):
    pagesLimit = 300

    # Note: the assignments below overwrite the seed/label lists defined above.
    seedsFiles = ['seeds-Sandra.txt', 'Output-tunisiaHotelAttack.txt',
                  'Output-samesexmarriage.txt', 'Output-CharlestonShooting.txt',
                  'Output-fifaArrests.txt', 'Output-boatCapsized.txt',
                  'Output-nepalEarthquake.txt', 'seeds_459.txt', 'seeds_474.txt',
                  'seedsURLs_z_534.txt', 'seedsURLs_z_501.txt',
                  'seedsURLs_z_540.txt']
    posFiles = ['evaluate-SandraBland.txt', 'pos-tunisiaHotelAttack.txt',
                'pos-samesexmarriage.txt', 'pos-CharlestonShooting.txt',
                'Output-fifaArrests.txt', 'Output-boatCapsized.txt',
                'Output-nepalEarthquake.txt', 'pos-FSU.txt', 'pos-Hagupit.txt',
                'pos-AirAsia.txt', 'pos-sydneyseige.txt', 'pos-Charlie.txt']
    negFiles = ['neg-FSU.txt', 'neg-Hagupit.txt', 'neg-AirAsia.txt',
                'neg-sydneyseige.txt', 'neg-Charlie.txt']
    modelFiles = ['model-SandraBland.txt', 'model-tunisiaHotelAttack.txt',
                  'model-samesexmarriage.txt', 'model-CharlestonShooting.txt']
    evaluator = Evaluate()
    #for i in range(3):
    pagesLimit = 100

    noK = 5
    pageTh = 0.2
    urlsTh = 0
    i=0
Пример #33
0
from board import Board, PieceStack, Turn, get_piece_text, EMPTY
from evaluate import Evaluate, WIN
from random import randint
from search import RootOfAlphaBetaSearch

piecestack = PieceStack()

turn = Turn()

board = Board()

evaluate = Evaluate()


def UserTurn(piecestack, board, piece):

    board.show()
    piecestack.show()

    piecestack.TakePiece(piece)

    print('Piece: {0}'.format(get_piece_text(piece)))
    while True:
        x, y = [
            int(i) - 1 for i in raw_input(
                "Enter x y coordinates to place piece: ").split()
        ]
        if board.pieces[x][y] is EMPTY:
            break
        else:
            print('Square is not empty. Try another one.')
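
# Hedged refactor sketch: the coordinate-reading loop above, factored into a
# helper (same Board API and Python 2 raw_input as the snippet uses):
def read_empty_square(board):
    while True:
        x, y = [
            int(i) - 1
            for i in raw_input("Enter x y coordinates to place piece: ").split()
        ]
        if board.pieces[x][y] is EMPTY:
            return x, y
        print('Square is not empty. Try another one.')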
Пример #34
0
def classifier_test():

    # setting
    dataset_para = '[email protected]@partition@selection'
    # select which features to use
    feature_para = (1, 2, 3, 4)

    # file directory

    # feature_dir = dataset_dir + r'\feature1'

    # Preprocessing: extract from the raw yoochoose-data the portion needed by
    # the experiment (selected by the sessions in the experiment data)
    # Input 1 (experiment data): dataset_dir\train\session_item.txt  .\test\session_item.txt
    # Input 2 (yoochoose-data): yoochoose_data_dir\yoochoose-clicks.dat  .\yoochoose-buys.dat  .\yoochoose-test.dat
    # Output: dataset_dir\yoochoose-selected\yoochoose-clicks-selected.dat  .\yoochoose-buys-selected.dat  .\yoochoose-test-selected.dat
    dataset_dir = r'E:\ranking aggregation\dataset\yoochoose\Full' + '\\' + dataset_para
    yoochoose_data_dir = r'E:\recsyschallenge2015\mycode\yoochoose-data'
    # output directory
    yoochoose_selected_dir = dataset_dir + r'\yoochoose-selected'
    # create the output folder if it does not exist
    # if not os.path.exists(yoochoose_selected_dir):
    #     os.makedirs(yoochoose_selected_dir)
    # Preprocess2.extract_data(dataset_dir, yoochoose_data_dir, yoochoose_selected_dir)

    # Feature extraction
    # Input: yoochoose selected data (and ground truth)
    # Output: features
    feature_dir = r'E:\recsyschallenge2015\mycode\result-data'
    # feature_dir = dataset_dir + r'\feature1'
    # create the output folder if it does not exist
    if not os.path.exists(feature_dir):
        os.makedirs(feature_dir)
    print('feature_para:', feature_para)
    Feature4.go(dataset_dir, feature_dir, feature_para)

    # load features
    X_train, y_train = Input2.read_train(feature_dir)
    X_test, y_test, test_dic_data, session_item_data, session_idx_dic = Input2.read_test(
        dataset_dir, feature_dir)

    groundtruth_path = dataset_dir + r'\test\session_item.txt'
    # model section
    print('model: LogisticRegression')
    model = LogisticRegression()
    model.fit(X_train, y_train)
    # print(model)
    # make predictions
    y_predict = model.predict(X_test)
    # evaluate the results
    solution = Solution.generate(test_dic_data, y_predict)
    Evaluate.go(solution, groundtruth_path)

    # model section
    print('model: GaussianNB')
    model = GaussianNB()
    model.fit(X_train, y_train)
    # print(model)
    # make predictions
    y_predict = model.predict(X_test)
    # evaluate the results
    solution = Solution.generate(test_dic_data, y_predict)
    Evaluate.go(solution, groundtruth_path)

    # model section
    print('model: SVM')
    model = SVC()
    model.fit(X_train, y_train)
    # print(model)
    # make predictions
    y_predict = model.predict(X_test)
    # evaluate the results
    solution = Solution.generate(test_dic_data, y_predict)
    Evaluate.go(solution, groundtruth_path)
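
# Hedged refactor sketch: the three fit/predict/evaluate blocks above differ
# only in the estimator, so they could be driven by one loop (all names as
# used above):
for name, model in [('LogisticRegression', LogisticRegression()),
                    ('GaussianNB', GaussianNB()),
                    ('SVM', SVC())]:
    print('model:', name)
    model.fit(X_train, y_train)
    y_predict = model.predict(X_test)
    solution = Solution.generate(test_dic_data, y_predict)
    Evaluate.go(solution, groundtruth_path)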
Пример #35
0
import math
import pickle
from functools import reduce


class DataHandler:
    # class member: a single shared evaluator (calculator)
    evaluator = Evaluate()

    # class method: can be used like a global function
    @classmethod
    def GetRawdataInDic(cls, filename):
        rawdata = {}
        with open(filename, 'rb') as f:
            while 1:
                try:
                    data = pickle.load(f)
                except EOFError:
                    break

                rawdata.update(data)

        return rawdata

    def __init__(self, filename, clsname):
        self.rawdata = DataHandler.GetRawdataInDic(filename)
        self.clsname = clsname

        # cache for computed values: compute on demand, but if a value was
        # already computed, return the stored result without recomputing
        self.cache = {}

    def get_scores(self):
        if 'scores' not in self.cache:
            self.cache['scores'] = list(self.rawdata.values())
        return self.cache.get('scores')

    #cache
    def get_average(self):
        if 'average' not in self.cache:
            self.cache['average'] = self.evaluator.average(self.get_scores())
        return self.cache.get('average')

    def get_variance(self):
        if 'variance' not in self.cache:
            vari = round(
                self.evaluator.variance(self.get_scores(), self.get_average()))
            self.cache['variance'] = vari
        return self.cache.get('variance')

    def get_standard_deviation(self):
        if "standard_deviation" not in self.cache:
            std_dev = round(math.sqrt(self.get_variance()), 1)
            self.cache["standard_deviation"] = std_dev
        return self.cache.get("standard_deviation")

    def WhoIsHighest(self):
        if 'highest' not in self.cache:
            self.cache['highest'] = reduce(
                lambda a, b: a
                if self.rawdata.get(a) > self.rawdata.get(b) else b,
                self.rawdata.keys())
        return self.cache.get('highest')

    def GetHighestScore(self):
        return self.rawdata[self.WhoIsHighest()]

    def WhoIsLowest(self):
        if "lowest" not in self.cache:
            self.cache['lowest'] = reduce(
                lambda a, b: a
                if self.rawdata.get(a) < self.rawdata.get(b) else b,
                self.rawdata.keys())
        return self.cache.get('lowest')

    def GetLowestScore(self):
        return self.rawdata[self.WhoIsLowest()]

    def get_evaluation(self):
        print('*' * 50)
        print("Grade analysis for class %s" % self.clsname)
        print("Class {0}: average {1}, variance {2}, standard deviation {3}".
              format(self.clsname, self.get_average(), self.get_variance(),
                     self.get_standard_deviation()))
        print('*' * 50)
        print("Overall summary for class %s" % self.clsname)
        print('*' * 50)
        self.evaluateclass()

    def evaluateclass(self):
        avrg = self.get_average()
        std_dev = self.get_standard_deviation()

        if avrg < 50 and std_dev > 20:
            print("Scores are very low and the spread between students is very large.")
        elif avrg > 50 and std_dev > 20:
            print("Scores are above average, but the spread between students is large. Attention needed!")
        elif avrg < 50 and std_dev < 20:
            print("The spread between students is small, but scores are very low. Attention needed!")
        elif avrg > 50 and std_dev < 20:
            print("Scores are above average and the spread between students is small.")
Пример #36
0
def main():
    # action space
    actionSpace = [[10, 0], [7, 7], [0, 10], [-7, 7], [-10, 0], [-7, -7], [0, -10], [7, -7]]
    numActionSpace = len(actionSpace)

    # state space
    numStateSpace = 4
    xBoundary = [0, 360]
    yBoundary = [0, 360]
    checkBoundaryAndAdjust = ag.CheckBoundaryAndAdjust(xBoundary, yBoundary)

    initSheepPositionMean = np.array([180, 180])
    initWolfPositionMean = np.array([180, 180])
    initSheepPositionNoise = np.array([120, 120])
    initWolfPositionNoise = np.array([60, 60])
    sheepPositionReset = ag.SheepPositionReset(initSheepPositionMean, initSheepPositionNoise, checkBoundaryAndAdjust)
    wolfPositionReset = ag.WolfPositionReset(initWolfPositionMean, initWolfPositionNoise, checkBoundaryAndAdjust)

    numOneAgentState = 2
    positionIndex = [0, 1]

    sheepPositionTransition = ag.SheepPositionTransition(numOneAgentState, positionIndex, checkBoundaryAndAdjust)
    wolfPositionTransition = ag.WolfPositionTransition(numOneAgentState, positionIndex, checkBoundaryAndAdjust)

    numAgent = 2
    sheepId = 0
    wolfId = 1
    transitionFunction = env.TransitionFunction(sheepId, wolfId, sheepPositionReset, wolfPositionReset,
                                                sheepPositionTransition, wolfPositionTransition)
    minDistance = 15
    isTerminal = env.IsTerminal(sheepId, wolfId, numOneAgentState, positionIndex, minDistance)

    screen = pg.display.set_mode([xBoundary[1], yBoundary[1]])
    screenColor = [255, 255, 255]
    circleColorList = [[50, 255, 50], [50, 50, 50], [50, 50, 50], [50, 50, 50], [50, 50, 50], [50, 50, 50],
                       [50, 50, 50], [50, 50, 50], [50, 50, 50]]
    circleSize = 8
    saveImage = False
    saveImageFile = 'image'
    render = env.Render(numAgent, numOneAgentState, positionIndex, screen, screenColor, circleColorList, circleSize,
                        saveImage, saveImageFile)

    aliveBouns = -1
    deathPenalty = 20
    rewardDecay = 0.99
    rewardFunction = reward.TerminalPenalty(sheepId, wolfId, numOneAgentState, positionIndex, aliveBouns, deathPenalty, isTerminal)
    accumulateRewards = PG.AccumulateRewards(rewardDecay, rewardFunction)

    maxTimeStep = 150
    sampleTrajectory = PG.SampleTrajectory(maxTimeStep, transitionFunction, isTerminal)

    approximatePolicy = PG.ApproximatePolicy(actionSpace)
    trainPG = PG.TrainTensorflow(actionSpace)

    numTrajectory = 20
    maxEpisode = 1000

    # Generate models.
    learningRate = 1e-4
    hiddenNeuronNumbers = [128, 256, 512, 1024]
    hiddenDepths = [2, 4, 8]
    # hiddenNeuronNumbers = [128]
    # hiddenDepths = [2]
    generateModel = GeneratePolicyNet(numStateSpace, numActionSpace, learningRate)
    models = {(n, d): generateModel(d, round(n / d)) for n, d in it.product(hiddenNeuronNumbers, hiddenDepths)}
    print("Models generated")

    # Train.
    policyGradient = PG.PolicyGradient(numTrajectory, maxEpisode, render)
    trainModel = lambda model: policyGradient(model, approximatePolicy,
                                              sampleTrajectory,
                                              accumulateRewards,
                                              trainPG)
    trainedModels = {key: trainModel(model) for key, model in models.items()}
    print("Finished training")

    # Evaluate
    modelEvaluate = Evaluate(numTrajectory, approximatePolicy, sampleTrajectory, rewardFunction)
    meanEpisodeRewards = {key: modelEvaluate(model) for key, model in trainedModels.items()}
    print("Finished evaluating")
    # print(meanEpisodeRewards)

    # Visualize
    independentVariableNames = ['NeuroTotalNumber', 'layerNumber']
    draw(meanEpisodeRewards, independentVariableNames)
    print("Finished visualizing", meanEpisodeRewards)
Пример #37
0
        axs[0].grid()
        axs[0].set_title('Loss')
        axs[1].plot(history['Train_dice'], label='Train Dice')
        axs[1].plot(history['Valid_dice'], label='Valid Dice')
        axs[1].legend()
        axs[1].grid()
        axs[1].set_title('Dice')
        plt.savefig('../output/loss_dice.png')

    ########################################################################
    # Evaluate the network
    # get all predictions of the validation set (this step may run out of memory)
    if args.load_mod:
        # load the best model
        net.load_state_dict(torch.load(MODEL_FILE))
        eva = Evaluate(net, device, validloader, args, isTest=False)
        eva.search_parameter()
        dice, dicPred, dicSubmit = eva.predict_dataloader()
        # eva.plot_sampled_predict()

        # evaluate the prediction
        sout = ('\nFinal Dice {:.3f}\n'.format(dice) +
                '==============Predict===============\n' +
                analyze_labels(pd.DataFrame(dicPred)))
        # + '==============True===============\n' + analyze_labels(stat_df_valid)
        # print(sout)
        # print2file(sout, LOG_FILE)
        # print2file(' '.join(str(key)+':'+str(val) for key,val in eva.dicPara.items()), LOG_FILE)

        # load swa model
Пример #38
0
	def __init__(self, n):
		self.data = DataManager('../data/train.csv','../data/test.csv', n)
		self.fe = FeatureExtractor(self.data)
		self.eval = Evaluate()
Пример #39
0
    def playit(self):
        point = 0
        evaluation_ai = Evaluate(self.board, True)
        aiMoves = evaluation_ai.checkPossibleMoves()
        depth = 0
        print(aiMoves[0][0][0])
Пример #40
0
import numpy as np
import json

gen_data = False
plot_fig = True

basename = 'rmse'
jsonfile = basename + '.json'
print('gen_data/plot_fig/filename = ', gen_data, plot_fig, basename)

mem_list = [6, 10]
loc_list = np.arange(5, 60, 5).tolist()
wgt_list = np.arange(0, 10, 1).tolist()

if gen_data:
    ev = Evaluate()
    truth = ev.get_data('output_truth.txt')

    error = {}
    rmsedata = {}
    amedata = {}
    for mem in mem_list:
        rmseloc = {}
        ameloc = {}
        for loc in loc_list:
            rmse = []
            ame = []
            for wgt in wgt_list:
                filename = 'h3dw%2.2dl%2.2dm%2.2d/output_analy.txt' % (
                    wgt, loc, mem)
                # analysis
Пример #41
0
            optimizer = torch.optim.AdamW(model.parameters(), lr=args.lr)

        print(model)
        print(model_fname)

        # # train the model
        # for param in model.parameters():
        #     param.requires_grad = True

        model.train()
        train_loop()

    # evaluate the model
    if args.evaluate:
        if args.evaluate_on_cpu:
            device = "cpu"

        model = model.to(device)
        model.eval()

        if args.train:
            Evaluate(model, test_loader, outpath, args.target, device, args.n_epochs)
        elif args.load:
            Evaluate(model, test_loader, outpath, args.target, device, args.load_epoch)

## -----------------------------------------------------------
# # to retrieve a stored variable in pkl file
# import pickle
# with open('../../test_tmp_delphes/experiments/PFNet7_gen_ntrain_2_nepochs_3_batch_size_3_lr_0.0001/confusion_matrix_plots/cmT_normed_epoch_0.pkl', 'rb') as f:  # Python 3: open(..., 'rb')
#     a = pickle.load(f)
Пример #42
0
class StanceDetector:
	def __init__(self, n):
		self.data = DataManager('../data/train.csv','../data/test.csv', n)
		self.fe = FeatureExtractor(self.data)
		self.eval = Evaluate()

	def buildBaseline(self, model):
		print 'Training baseline',model
		feats = ['words']
		y_attribute = 'stance'
		X,y = self.fe.getFeaturesMatrix('train',feats,y_attribute)
		X_test,y_true = self.fe.getFeaturesMatrix('test',feats,y_attribute)
		for mode in ['simple','tfidf']:
			if model=='bayes':
				cl = MultinomialNB()
			elif model=='svm':
				cl = LinearSVC()

			if mode=='tfidf':
				cl = Pipeline([('tfidf', TfidfTransformer()),
					  ('clf', cl), ])

			clf = cl.fit(X, y)
			y_pred = clf.predict(X_test)
			print mode, accuracy_score(y_true, y_pred)
			pprint(self.eval.computeFscores(self.data.testTweets, self.fe.labelenc.inverse_transform(y_pred)))


	def buildSimple(self, model):
		feats = ['topicVecs','words2vec']
		print feats
		y_attribute = 'stance'
		X,y = self.fe.getFeaturesMatrix('train',feats,y_attribute)
		X_test,y_true = self.fe.getFeaturesMatrix('test',feats,y_attribute)
		for mode in ['simple']:#,'tfidf']:
			if model=='bayes':
				cl = MultinomialNB()
			elif model=='svm':
				cl = LinearSVC()
				cl = GridSearchCV(cl, self.getGridSearchParams())

			if mode=='tfidf':
				cl = Pipeline([('tfidf', TfidfTransformer()),
					  ('clf', cl), ])
			
			clf = cl.fit(X, y)
			# print cl.best_params_
			y_pred = clf.predict(X_test)
			print mode, accuracy_score(y_true, y_pred)
			pprint(self.eval.computeFscores(self.data.testTweets, self.fe.labelenc.inverse_transform(y_pred)))

	#methods with 'train' in the name are helper functions
	def trainSVC(self, feats, y_attribute, proba=False):
		X,y = self.fe.getFeaturesMatrix('train',feats,y_attribute)
		X_test,y_true = self.fe.getFeaturesMatrix('test',feats,y_attribute)
		
		clf = SVC(probability=proba)
		clf = clf.fit(X,y)
		if proba:
			y_proba = clf.predict_proba(X_test)
			return clf, y_proba
		else:
			y_pr = clf.predict(X_test)
			return clf, y_pr
	
	def trainLinearSVC(self, feats, y_attribute, dec=False):
		X,y = self.fe.getFeaturesMatrix('train',feats,y_attribute)
		X_test,y_true = self.fe.getFeaturesMatrix('test',feats,y_attribute)
		
		clf = LinearSVC()
		clf = clf.fit(X,y)
		if dec:
			y_pr = clf.decision_function(X_test)
			return clf, y_pr
		else:
			y_pr = clf.predict(X_test)
			return clf, y_pr

	#TODO: revisit
	#check label transform encodings of NONE, FAVOR, AGAINST
	# def buildTopicStanceSeparate(self):
	# 	feats = ['words']
	# 	y_attribute = 'stance'
	# 	X_test,y_true = self.fe.getFeaturesMatrix('test',feats,y_attribute)

	# 	#builds two separate for topic and stance
	# 	topic_clf, y_topic_proba = self.trainLinearSVC(feats = ['words','lexiconsbyword'],y_attribute = 'topic',dec=True)
		
	# 	#WRONG
	# 	#WRONG
	# 	#WRONG
	# 	boost_factors = np.ones_like(y_true)
	# 	#multiply by NONE (0) = 0
	# 	#multiply by FAVOR (1) = 1
	# 	#multiply by AGAINST (2) = 2

	# 	#has index of class with max prob for each sample
	# 	topic_preds = np.argmax(y_topic_proba,axis=1)
	# 	for ind,s in enumerate(y_topic_proba):
	# 		prob = y_topic_proba[ind][topic_preds[ind]]
	# 		if prob < 0.4:
	# 			boost_factors[ind] = 0 #corresponds to NONE
		
	# 	stance_clf,stance_pred = self.trainLinearSVC(feats = ['words','lexiconsbyword','topic'],y_attribute = 'stance')		
		
	# 	# for i in range(0, len(stance_pred)):
	# 	# 	if boost_factors[i] == 2:
	# 	# 		stance_pred[i] = self.fe.labelenc.transform("NONE")
		
	# 	#with numpy arrays now, above is equivalent to below , right?
	# 	stance_pred = np.multiply(stance_pred, boost_factors)
	# 	stance_pred_labels = self.fe.labelenc.inverse_transform(stance_pred)

	# 	# print [(self.data.testLabels[i], stance_pred_labels[i]) for i in range(len(stance_pred))]
	# 	score = accuracy_score(y_true, stance_pred)
	# 	print score
	# 	pprint(self.eval.computeFscores(self.data.testTweets, stance_pred_labels))

	def buildTopicOnlyMultiple(self):
		#one svm for each topic
		feats = ['words2vec']
		y_attribute = 'stance'
		clf_topic = {}
		for topic in list(self.fe.topicenc.classes_):
			X,y = self.fe.getFeaturesMatrix('train',feats,y_attribute, topic)
			Xt,yt = self.fe.getFeaturesMatrix('test',feats,y_attribute, topic)
			clf = LinearSVC()
			clf = clf.fit(X,y)
			clf_topic[topic] = clf
			print topic, clf.score(Xt,yt)

		# not useful: still worse than the single SVM, though not as weak as the average of the per-topic scores above

		# X_whole,y_whole = self.fe.getFeaturesMatrix('train',feats,y_attribute)
		# Xt,yt = self.fe.getFeaturesMatrix('test',feats,y_attribute)
		# newX = []
		# newXt = []
		# for topic in clf_topic:
		# 	newX.append(clf_topic[topic].transform(X_whole))
		# 	newXt.append(clf_topic[topic].transform(Xt))
		# newX = np.concatenate(tuple(newX),axis=1)
		# newXt = np.concatenate(tuple(newXt),axis=1)
		# newclf = LinearSVC()
		# newclf = newclf.fit(newX, y_whole)
		# print newclf.score(newXt, yt)

	def trainTopicSVM(self, topic):
		feats = ['words2vec','clusteredLexicons','topic1hot']
		y_attribute = 'stance'
		
		X,y = self.fe.getFeaturesTopicNontopic('train',feats,y_attribute, topic=topic)
		X_test,y_true = self.fe.getFeaturesTopicNontopic('test',feats,y_attribute, topic=topic)
		clf = LinearSVC()
		clf = GridSearchCV(clf,self.getGridSearchParams())
		clf = clf.fit(X,y)
		print clf.best_params_
		print topic #,clf.score(X_test, y_true)
		return clf
	
	#WRITE
	#WRITE
	#WRITE
	def buildTopicWise(self):
		#separate SVC for each topic, tests on that class only first, then on all
		topic_clf = {}
		feats = ['words2vec','clusteredLexicons','topic1hot']
		
		y_attribute = 'stance'
		X,y = self.fe.getFeaturesMatrix('train',feats,y_attribute)
		X_test,y_true = self.fe.getFeaturesMatrix('test',feats,y_attribute)

		#X matrix for new classifier which uses this as train matrix
		#has columns of each topic classifier's confidence function
		# X_fx = []
		# X_ftestx = []
		preds = []
		for topic in list(self.fe.topicenc.classes_):
			topic_clf[topic] = self.trainTopicSVM(topic)
			preds.append(topic_clf[topic].predict(X_test))
			# X_fx.append(topic_clf[topic].decision_function(X))
			# X_ftestx.append(topic_clf[topic].decision_function(X_test))

		allpreds = np.vstack(tuple(preds))
		topic1hot, temp = self.fe.getFeaturesMatrix('test',['topic1hot'],'stance')
		# print allpreds.shape, topic1hot.T.shape
		allpreds[allpreds==5] = 1
		final_pred = np.multiply(topic1hot.T,allpreds)

		prediction = np.sum(final_pred, axis=0).astype(int)
		# X_fx = np.concatenate(tuple(X_fx), axis=1)
		# X_ftestx = np.concatenate(tuple(X_ftestx), axis=1)
		# clf = LinearSVC().fit(X_fx, y)
		# y_pred = clf.predict(X_ftestx)
		print accuracy_score(y_true, prediction)
		pprint(self.eval.computeFscores(self.data.testTweets, self.fe.labelenc.inverse_transform(prediction)))


	#GOOD 66%acc
	#1.2 % increase with change topic to 1hot
	def buildSVMWord2Vec(self):
		feats = ['words2vec','topic1hot']
		y_attribute = 'stance'
		X,y = self.fe.getFeaturesMatrix('train',feats,y_attribute)
		Xt,yt = self.fe.getFeaturesMatrix('test',feats,y_attribute)
		clf = LinearSVC(C=0.01,penalty='l1',dual=False)
		clf = clf.fit(X,y)
		y_pred = clf.predict(Xt)
		print clf.score(Xt, yt)
		pprint(self.eval.computeFscores(self.data.testTweets, self.fe.labelenc.inverse_transform(y_pred)))

	def buildSVMTrial(self):
		feats = ['topic1hot','words2vec']
		y_attribute = 'stance'
		X,y = self.fe.getFeaturesMatrix('train',feats,y_attribute)
		Xt,yt = self.fe.getFeaturesMatrix('test',feats,y_attribute)		
		clf = LinearSVC(C=0.001)
		clf = clf.fit(X,y)
		y_pred = clf.predict(Xt)
		print clf.score(Xt, yt)
		pprint(self.eval.computeFscores(self.data.testTweets, self.fe.labelenc.inverse_transform(y_pred)))
	
	def buildTrial(self):
		# feats = ['pos','words2vec','clusteredLexicons','topic1hot']
		# 'givenSentiment','givenOpinion'
		feats = ['words2vec','pos','clusteredLexicons','top1grams','top2grams']
		y_attribute = 'stance'
		X,y = self.fe.getFeaturesMatrix('train',feats,y_attribute)
		Xt,yt = self.fe.getFeaturesMatrix('test',feats,y_attribute)		
		# clf = DecisionTreeClassifier()
		# clf = LogisticRegression()
		clf = LinearSVC(C=1, class_weight='balanced', penalty='l1',dual=False)
		clf = clf.fit(X,y)
		y_pred = clf.predict(Xt)
		# print y_pred
		print len(np.where(y_pred==0)[0]),len(np.where(y_pred==1)[0]),len(np.where(y_pred>1)[0])
		print len(y_pred)
		print 'training accuracy',clf.score(X, y)
		print clf.score(Xt, yt)
		pprint(self.eval.computeFscores(self.data.testTweets, self.fe.labelenc.inverse_transform(y_pred)))

	def buildGithubSGDModel(self):
		# feats = ['words2vec','topic1hot','pos']
		y_attribute = 'stance'
		dataset = self.fe.getDataset('train')
		dataset2 = self.fe.getDataset('test')
		y_train = self.fe.getY('train',dataset, y_attribute)
		y_test = self.fe.getY('test', dataset2, y_attribute)

		tfidf = TfidfVectorizer(ngram_range=(1, 2), max_df=1.0, min_df=1, binary=True, norm='l2', use_idf=True, smooth_idf=False, sublinear_tf=True, encoding='latin1')
		
		X_train = tfidf.fit_transform(self.data.trainTweetsText)
		X_test = tfidf.transform(self.data.testTweetsText)
		tuned_parameters = {'alpha': [10 ** a for a in range(-12, 0)]}
		clf = GridSearchCV(SGDClassifier(loss='hinge', penalty='elasticnet',
										 l1_ratio=0.75, n_iter=10, shuffle=True,
										 verbose=False, n_jobs=4, average=False),
						   tuned_parameters, cv=10, scoring='f1_weighted')

		clf.fit(X_train, y_train)
		print clf.best_params_
		print("Grid scores on development set:")
		for params, mean_score, scores in clf.grid_scores_:
			print "%0.3f (+/-%0.03f) for %r" % (mean_score, scores.std() * 2, params)
		print classification_report(y_test, clf.predict(X_test))
		print clf.score(X_test, y_test)


	def getGridSearchParams(self):
		param_grid = [
				{'C': [0.001, 0.01, 0.1, 1], 'dual':[False, True],'class_weight':['balanced',None]}
		 ]
		return param_grid

	def getGridSearchParamsForXGBoost(self):
		param_grid = [
			{'n_estimators':[10,20,30,40,50], 'max_depth': [1,2,3,4,5]}
		]
		return param_grid

	def buildSVMWord2VecWithClusters(self):
		#feats = ['topic1hot']
		#feats = ['words2vec', 'top1grams', 'top2grams']
		#feats = ['words2vec', 'top1grams']
		#feats = ['words2vec', 'top2grams']
		feats = ['words2vec', 'clusteredLexicons', 'topic1hot', 'pos']
		#feats = ['words2vec','topic1hot', 'pos','clusteredLexicons', 'top2grams']
		#feats = ['clusteredLexicons']
		#feats = ['pos']
		y_attribute = 'stance'
		X,y = self.fe.getFeaturesMatrix('train',feats,y_attribute)
		print (X.shape)
		Xt,yt = self.fe.getFeaturesMatrix('test',feats,y_attribute)
		clf = LinearSVC(C=1,penalty='l1',dual=False)
		clf = clf.fit(X,y)
		y_pred = clf.predict(Xt)
		# f = open('pred','w')
		# for i in y_pred:
		# 	#print type(i)
		# 	f.write('{0}'.format(i))
		# f.close()
		accuracy = clf.score(Xt, yt)
		# print clf.score(Xt, yt)
		fscores = self.eval.computeFscores(self.data.testTweets, self.fe.labelenc.inverse_transform(y_pred))
		# print type(fscores)
		# print fscores
		# pprint(self.eval.computeFscores(self.data.testTweets, self.fe.labelenc.inverse_transform(y_pred)))
		# print (accuracy, fscores['Macro'])
		return (accuracy, fscores['Macro'])

	def buildSVMWord2VecWithClustersGridSearch(self):
		feats = ['words2vec','topic1hot','pos', 'clusteredLexicons']
		y_attribute = 'stance'
		X,y = self.fe.getFeaturesMatrix('train',feats,y_attribute)
		Xt,yt = self.fe.getFeaturesMatrix('test',feats,y_attribute)
		
		svmclf = LinearSVC(C=0.01,penalty='l1',dual=False)
		clf = GridSearchCV(svmclf, self.getGridSearchParams())
		clf = clf.fit(X,y)
		print clf.best_params_

		y_pred = clf.predict(Xt)
		
		print clf.score(Xt, yt)
		pprint(self.eval.computeFscores(self.data.testTweets, self.fe.labelenc.inverse_transform(y_pred)))

	def trainStanceNone(self, feats):
		# feats = ['words2vec','topic1hot','pos']
		X,y = self.fe.getFeaturesStanceNone('train',feats)
		Xt,yt = self.fe.getFeaturesStanceNone('test',feats)
		svmclf = LinearSVC()
		stance_none_clf = GridSearchCV(svmclf, self.getGridSearchParams()).fit(X, y)
		# print stance_none_clf.score(Xt, yt)
		pred = stance_none_clf.predict(Xt)
		print classification_report(yt, pred)
		return stance_none_clf

	def trainFavorAgainst(self,feats):
		# feats = ['words2vec','topic1hot','pos']
		X,y = self.fe.getFeaturesFavorAgainst('train',feats)
		Xt,yt = self.fe.getFeaturesFavorAgainst('test',feats)
		svmclf = LinearSVC()
		fav_agnst_clf = GridSearchCV(svmclf, self.getGridSearchParams()).fit(X, y)
		pred = fav_agnst_clf.predict(Xt)
		print classification_report(yt, pred)

		# print fav_agnst_clf.score(Xt, yt)
		return fav_agnst_clf

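	# buildModel2 composes the two helpers above: one classifier separates
	# Stance from None, a second separates Favor from Against, and every test
	# row the first classifier labels as "stance" (encoded 3) is replaced by
	# the favor/against prediction.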
	def buildModel2(self):
		#one SVM for Stance/None and other for Favor/Against
		feats = ['words2vec','topic1hot','pos']
		print feats
		stance_none_clf = self.trainStanceNone(feats)
		fav_agnst_clf = self.trainFavorAgainst(feats)
		X_test,y_true = self.fe.getFeaturesMatrix('test',feats,'stance')
		st_pred = stance_none_clf.predict(X_test)
		favaga_pred = fav_agnst_clf.predict(X_test)
		for index,row in enumerate(st_pred):
			if row==3:
				st_pred[index] = favaga_pred[index]
		print classification_report(y_true, st_pred)
		print accuracy_score(y_true, st_pred)
		# assert(stance_none_clf.classes_[1]==3) #stance(3)
		# # >0 means this class - stance will be predicted
		# # <0 means none is predicted
		# confi = stance_none_clf.decision_function(X_test)
		# # treat as confident about none if confi<-0.25:
		# y_pred = fav_agnst_clf.predict(X_test)
		# print accuracy_score(y_true, y_pred)
		# pprint(self.eval.computeFscores(self.data.testTweets, self.fe.labelenc.inverse_transform(y_pred)))
		# threshold = -0.25
		# confi_high = np.where(confi<threshold)[0]
		# for loc in confi_high:
		# 	y_pred[loc] = self.fe.labelenc.transform('NONE')
		# print 'Boosted', accuracy_score(y_true, y_pred)
		# print len(np.where(y_pred==0)[0]),len(np.where(y_pred==1)[0]), len(np.where(y_pred==2)[0]),
		# pprint(self.eval.computeFscores(self.data.testTweets, self.fe.labelenc.inverse_transform(y_pred)))
		
	def get_proba_one(self, model, X):
	    predicted = model.predict_proba(X)
	    return predicted[:, 1]

	def runXGBoostModel(self,model, model_name, X, target, X_test, crossOn):
		print "Trying to fit model"
		print X.shape, target.shape
		model.fit(X, target)
		print "Successfully fit model"
		predicted = self.get_proba_one(model, X)
		predicted_test = self.get_proba_one(model, X_test)
		predicted_test = model.predict(X_test)
		print predicted_test
		return predicted_test


	def word2VecXGBoost(self):
		feats = ['words2vec','pos','clusteredLexicons', 'top1grams','top2grams', 'topic1hot' ]
		#feats = ['words2vec']
		#feats = ['clusteredLexicons']
		#feats = ['pos']
		y_attribute = 'stance'
		X,y = self.fe.getFeaturesMatrix('train',feats,y_attribute)
		print (X.shape)
		Xt,yt = self.fe.getFeaturesMatrix('test',feats,y_attribute)
		#clf = LinearSVC(C=0.01,penalty='l1',dual=False)
		#clf = clf.fit(X,y)
		#y_pred = clf.predict(Xt)
		# f = open('pred','w')
		# for i in y_pred:
		# 	#print type(i)
		# 	f.write('{0}'.format(i))
		# f.close()
		#print clf.score(Xt, yt)
		#pprint(self.eval.computeFscores(self.data.testTweets, self.fe.labelenc.inverse_transform(y_pred)))
		m2_xgb = xgb.XGBClassifier(n_estimators=10, nthread=-1, max_depth=2, seed=500)
		#m2_xgb = GridSearchCV(m2_xgb, self.getGridSearchParamsForXGBoost())
		print "Run Model"
		y_pred = self.runXGBoostModel(m2_xgb, "m2_xgb_OS_ENN", X, y, Xt, True)
		# print type(yt)
		# print type(y_pred)
		# print len(yt)
		# print len(y_pred)
		# print yt.shape
		# print y_pred.shape
		# print yt
		# print y_pred
		# print(m2_xgb)
		print accuracy_score(yt, y_pred)

	def buildModel3(self):
		#feats = [['words2vec'],['pos'],['clusteredLexicons']]
		feats = [['words2vec'],['pos'],['clusteredLexicons']]
		y_attribute = 'stance'
		y_pred = []
		y_t = []
		for f in feats:
			X,y = self.fe.getFeaturesMatrix('train',f,y_attribute)
			Xt,yt = self.fe.getFeaturesMatrix('test',f,y_attribute)
			clf = SVC(C=1, probability=True)
			clf = clf.fit(X,y)
			train_transform = clf.predict_log_proba(X)
			test_transform = clf.predict_log_proba(Xt)
			# print 'Train transform ',train_transform.shape
			# print 'Test transform ',test_transform.shape
			y_pred.append(train_transform)
			y_t.append(test_transform)
		#y_pred_h = np.hstack(tuple(y_pred))
		#y_t_h = np.hstack(tuple(y_t))
		# element-wise sum of the per-feature-set log-probabilities
		y_pred_h = sum(y_pred)
		y_t_h = sum(y_t)
		# print type(y_pred_h)
		# print y_pred_h[0]
		# print y_pred_h.shape
		regr = linear_model.LogisticRegression()
		regr.fit(y_pred_h, y)
		final_pred = regr.predict(y_t_h)
		print accuracy_score(final_pred, yt)
		pprint(self.eval.computeFscores(self.data.testTweets, self.fe.labelenc.inverse_transform(final_pred)))
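
	# Hedged usage sketch (the value of n is illustrative; it is forwarded to
	# DataManager in __init__ above):
	# sd = StanceDetector(100)
	# sd.buildSVMWord2Vec()
	# sd.buildModel2()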