Example #1
def main(num_epochs=100, n_splits=5):
    data_util = DataUtil('data', 'spectrogram_data')
    X, y = data_util.get_data()

    # k-fold cross-validation: train on k-1 folds, test on the held-out fold
    kf = KFold(n_splits=n_splits, shuffle=True)
    test_accuracy_sum = 0
    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # Fresh model per fold so no weights leak between folds
        model = Model(data_util.height, data_util.width)

        param_values, threshold = train_and_validate(model, X_train, y_train, num_epochs)
        model.set_param_values(param_values)

        test_accuracy_sum += perform_validation(model, X_test, y_test, threshold)

    print("Cross-validation results:")
    print("  accuracy:\t\t{:.2f} %".format(test_accuracy_sum / n_splits * 100))
Example #2
        sys.exit(1)

    if len(sys.argv) > 2:
        layer_arg = int(sys.argv[2])
    else:
        layer_arg = 2

    if len(sys.argv) > 3:
        ep_arg = int(sys.argv[3])
    else:
        ep_arg = 20

    # Read the data
    print(">> Initializing data...")
    reader = DataUtil(WORDVEC_FILEPATH, TAGGED_NEWS_FILEPATH)
    X, Y = reader.get_data()
    print(X.shape)
    print(Y.shape)

    # Train the model
    print(">> Training model... epochs = {0}, layers = {1}".format(
        ep_arg, layer_arg))
    nermodel = NERModel(reader)
    nermodel.train(epochs=ep_arg, layers=layer_arg)

    # Evaluate the model
    print(">> Evaluating model...")
    nermodel.evaluate()

    # Save the model
    print(">> Saving model...")
Example #3

    def train(self):
        # Check the lock before acquiring it; the original unlocked first,
        # which defeated the guard below.
        if self.system_setting.islock_model_training():
            logging.warning("model training is locked!!!")
            return

        try:
            self.system_setting.lock_model_training()
            logging.info("model training locked")

            # Train the model
            logging.info("********************************")
            logging.info("training classifier...")
            dataUtil = DataUtil("articles_testN")
            origin_data = dataUtil.get_data()
            if origin_data is None:
                logging.warning("training data is NULL!!!")
                return

            wordVec = Word2Vector()
            embeddings = np.array(wordVec.embeddings)
            # Note: the optimizer is Adagrad (the original named the variable adam)
            adagrad = Adagrad(lr=0.01, epsilon=1e-06)
            bestF = 0
            bestAcc = 0
            bestPre = 0
            bestRecall = 0

            for fold in range(self.flods):
                # Pick the best model across the 10 folds
                model = self.model(embeddings)
                model.compile(loss='binary_crossentropy',
                              metrics=[ut.f_score],
                              optimizer=adagrad)

                data = dataUtil.flod_cross_data(origin_data)
                test_data = data["test_data"]
                dev_data = data["dev_data"]
                train_data = data["train_data"]

                # Pad/truncate content and title to fixed lengths for each split
                for split in (test_data, dev_data, train_data):
                    split["processed_content"] = sequence.pad_sequences(
                        split["processed_content"], maxlen=self.content_max_len,
                        padding='post', truncating='post')
                    split["processed_title"] = sequence.pad_sequences(
                        split["processed_title"], maxlen=self.title_max_len,
                        padding='post', truncating='post')
                model = self.do_train(model, train_data, dev_data, test_data)
                result = model.predict_classes(
                    [test_data["processed_content"], test_data["processed_title"]],
                    batch_size=self.batch_size,
                    verbose=1)
                f_measure, pre, recall, acc = dr_evaluate(
                    test_data["label"], result)
                logging.info("***********")
                logging.info("[fold " + str(fold) + "] test F-measure:" +
                             str(f_measure) + " test acc:" + str(acc))
                logging.info("***********")
                # Keep only the weights of the best-scoring fold
                if bestF < f_measure:
                    bestF = f_measure
                    bestAcc = acc
                    bestPre = pre
                    bestRecall = recall
                    model.save_weights('cnn_model.h5')

            # Reload the best fold's weights and persist them under the final name
            model = self.model(embeddings)
            model.compile(loss='binary_crossentropy',
                          metrics=[ut.f_score],
                          optimizer=adagrad)
            model.load_weights('cnn_model.h5')
            model.save_weights('news_classifier_model.h5')

            logging.info("###")
            # Report the tracked best values (the original logged the last fold's)
            logging.info('[** best result **] best F-measure:' +
                         str(bestF) + " best acc:" + str(bestAcc))
            logging.info("###")
            logging.info("********************************")
        except BaseException as e:
            logging.error(e)
        finally:
            # Release the training lock whether training succeeded or not
            self.system_setting.unlock_model_training()
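
dr_evaluate is not shown in these examples. A minimal numpy sketch consistent with the call site's return order (f_measure, precision, recall, accuracy) for 0/1 labels; the original implementation may differ:

import numpy as np

def dr_evaluate(labels, predictions):
    # Hypothetical stand-in: binary precision/recall/F1/accuracy.
    y_true = np.asarray(labels).ravel()
    y_pred = np.asarray(predictions).ravel()
    tp = np.sum((y_true == 1) & (y_pred == 1))
    fp = np.sum((y_true == 0) & (y_pred == 1))
    fn = np.sum((y_true == 1) & (y_pred == 0))
    pre = tp / (tp + fp) if tp + fp else 0.0
    recall = tp / (tp + fn) if tp + fn else 0.0
    f_measure = 2 * pre * recall / (pre + recall) if pre + recall else 0.0
    acc = float(np.mean(y_true == y_pred))
    return f_measure, pre, recall, acc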
Example #4

if __name__ == "__main__":

    parser = argparse.ArgumentParser(description='cmod')
    parser.add_argument('-c', '--config', help='Config file path', required=True)
    cfg_parser = configparser.ConfigParser()

    args = parser.parse_args()
    cfg_parser.read(args.config)

    cfg = config.Config(cfg_parser)

    D = DataUtil(cfg)

    train_dataset = D.get_data('train')
    test_dataset = D.get_data('test')
    dev_dataset = D.get_data('dev')

    device = torch.device("cuda:0" if cfg.use_cuda() else "cpu")

    if cfg.sparse() and cfg.weight_decay() != 0:
        cfg.logger.error('Sparsity and weight decay are incompatible, pick one!')
        sys.exit(1)

    torch.manual_seed(cfg.random_seed())
    random.seed(cfg.random_seed())

    if cfg.use_cuda():
        torch.cuda.manual_seed(cfg.random_seed())
        torch.backends.cudnn.benchmark = True
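
config.Config is only exercised through its accessors here. A hypothetical minimal wrapper consistent with those call sites (use_cuda, sparse, weight_decay, random_seed, logger); the 'train' section and option names are invented for illustration:

import logging

class Config(object):
    # Hypothetical thin wrapper over a parsed ConfigParser.
    def __init__(self, cfg_parser):
        self._cfg = cfg_parser
        self.logger = logging.getLogger('cmod')

    def use_cuda(self):
        return self._cfg.getboolean('train', 'use_cuda', fallback=False)

    def sparse(self):
        return self._cfg.getboolean('train', 'sparse', fallback=False)

    def weight_decay(self):
        return self._cfg.getfloat('train', 'weight_decay', fallback=0.0)

    def random_seed(self):
        return self._cfg.getint('train', 'random_seed', fallback=42)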
Example #5

            self.train()

        # Rebuild the architecture and load the trained classifier weights
        wordVec = Word2Vector()
        embeddings = np.array(wordVec.embeddings)
        model = self.model(embeddings)
        adagrad = Adagrad(lr=0.01, epsilon=1e-06)
        model.compile(loss='binary_crossentropy',
                      metrics=[ut.f_score],
                      optimizer=adagrad)
        model.load_weights('news_classifier_model.h5')

        # Preprocess the incoming articles the same way as at training time
        dataUtil = DataUtil("articles_testN")
        pre_data = dataUtil.filter_data(data, 1)
        pre_data = dataUtil.transfer_form(pre_data)
        pre_data["processed_content"] = sequence.pad_sequences(
            pre_data["processed_content"], maxlen=self.content_max_len,
            padding='post', truncating='post')
        pre_data["processed_title"] = sequence.pad_sequences(
            pre_data["processed_title"], maxlen=self.title_max_len,
            padding='post', truncating='post')
        result = model.predict_classes(
            [pre_data["processed_content"], pre_data["processed_title"]],
            batch_size=self.batch_size, verbose=1)

        # Attach the predicted label to each article
        for i in range(len(data)):
            data[i]["artitle_label"] = result[i][0]
        return data


classifier = Classifier()
dataUtil = DataUtil("articles_testN")
origin_data = dataUtil.get_data()
classifier.predict(origin_data['neg'])
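
Examples #3 and #5 compile their models with metrics=[ut.f_score], which is not shown. A plausible stand-in written against the Keras backend API, computing a batch-wise binary F1; the original ut.f_score may differ:

from keras import backend as K

def f_score(y_true, y_pred):
    # Hypothetical stand-in for ut.f_score: batch-wise binary F1.
    y_pred = K.round(K.clip(y_pred, 0, 1))  # hard 0/1 predictions
    tp = K.sum(y_true * y_pred)             # true positives
    precision = tp / (K.sum(y_pred) + K.epsilon())
    recall = tp / (K.sum(y_true) + K.epsilon())
    return 2 * precision * recall / (precision + recall + K.epsilon())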