Example #1
parser = argparse.ArgumentParser(description="PU NER")
parser.add_argument('--output',
                    type=int,
                    default=0,
                    help='write the test result, set 1 for writing result to file')
parser.add_argument('--set', type=int, default=0, help='test set or valid set')
parser.add_argument('--flag',
                    default="PER",
                    help='entity type (PER/LOC/ORG/MISC)')
parser.add_argument('--lr_rate', type=float, default=1e-4, help='learning rate')
parser.add_argument('--dataset',
                    default="conll2003",
                    help='name of the dataset')
parser.add_argument('--type',
                    default="bnpu",
                    help='pu learning type (bnpu/bpu/upu)')
args = parser.parse_args()

dp = DataPrepare(args.dataset)
mutils = FeaturedDetectionModelUtils(dp)

trainSet, validSet, testSet, prior = mutils.load_dataset(
    args.flag, args.dataset, 1.0)
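# args.set selects the split to evaluate: 0 -> the test set, 1 -> the validation
# set; the same index picks the matching gold file below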
setIter = [testSet, validSet]
detectionSet = setIter[args.set]
fileNameIter = [
    "data/" + args.dataset + "/test.txt", "data/" + args.dataset + "/valid.txt"
]
fileName = fileNameIter[args.set]

charcnn = CharCNN(dp.char2Idx)
wordnet = WordNet(dp.wordEmbeddings, dp.word2Idx)
casenet = CaseNet(dp.caseEmbeddings, dp.case2Idx)
featurenet = FeatureNet()
Example #2
    parser = argparse.ArgumentParser(description="PU NER")
    parser.add_argument('--lr', type=float, default=1e-4)
    parser.add_argument('--beta', type=float, default=0.0)
    parser.add_argument('--gamma', type=float, default=1.0)
    parser.add_argument('--drop_out', type=float, default=0.5)
    parser.add_argument('--m', type=float, default=0.3)
    parser.add_argument('--p', type=float, default=0.046)
    parser.add_argument('--flag', default="PER")
    parser.add_argument('--dataset', default="conll2003")
    parser.add_argument('--unlabel_size', type=int, default=10000)
    parser.add_argument('--iter', type=int, default=1)

    parser.add_argument('--model', default="")

    args = parser.parse_args()

    dp = DataPrepare(args.dataset)
    mutils = AddUnlabeledModelUtils(dp)
    # use for training
    dataset = mutils.load_dataset_(args.flag, args.dataset, args.iter)
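    # args.p is the estimated positive-class prior; the sibling scripts in this
    # repo describe the --p argument as 'estimate value of prior'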
    trainSet, validSet, testSet, prior = mutils.load_origin_dataset(
        dataset, args.p)

    train_sentences_X, train_sentences_Y, train_sentences_LF = zip(*trainSet)

    trainSize = len(trainSet)
    validSize = len(validSet)
    testSize = len(testSet)
    print(("train set size: {}, valid set size: {}, test set size: {}").format(
        trainSize, validSize, testSize))

    charcnn = CharCNN(dp.char2Idx)
Example #3
    if not os.path.exists(fdir):
        os.makedirs(fdir)
    fname = os.path.join(fdir, "train.txt")
    allSentences = []
    for sent in trainSentences:
        allSentences.append(sent)
    for sent in unlabeledSentences:
        allSentences.append(sent)
    shuffle_indices = np.random.permutation(np.arange(len(allSentences)))
    allSentences = [allSentences[i] for i in shuffle_indices]  # keep as a list; the sentences are ragged
    with open(fname, "w") as fw:
        for sentence in allSentences:
            for word, label, tagIdxList in sentence:
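                # note: the labeled flag is written as 0 for every token, so both
                # the original training sentences and the raw sentences end up
                # marked as unlabeled in train.txt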
                labeled = 0
                fw.write(word + " " + str(label) + " " + str(labeled) + "\n")
            fw.write("\n")


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="PU NER")
    # data

    parser.add_argument('--dataset', default="conll2003")
    args = parser.parse_args()

    dp = DataPrepare(args.dataset)
    make_unlabeled_data(dp, "data/" + args.dataset + "/train.txt",
                        "data/eng.raw", args.dataset)
    print("Data Set Writing Successfully")
def main_feature():
    parser = argparse.ArgumentParser(description="PU NER")
    # data
    parser.add_argument('--lr', type=float, default=1e-4, help='learning rate')
    parser.add_argument('--beta',
                        type=float,
                        default=0.0,
                        help='beta of pu learning (default 0.0)')
    parser.add_argument('--gamma',
                        type=float,
                        default=1.0,
                        help='gamma of pu learning (default 1.0)')
    parser.add_argument('--drop_out',
                        type=float,
                        default=0.5,
                        help='dropout rate')
    parser.add_argument('--m',
                        type=float,
                        default=0.3,
                        help='class balance rate')
    parser.add_argument('--flag',
                        default="Disease",
                        help='entity type (e.g. PER/LOC/ORG/MISC/Disease)')
    parser.add_argument('--dataset',
                        default="disease",
                        help='name of the dataset')
    parser.add_argument('--batch_size',
                        type=int,
                        default=100,
                        help='batch size for training and testing')
    parser.add_argument('--print_time',
                        type=int,
                        default=1,
                        help='epochs for printing result')
    parser.add_argument('--pert',
                        type=float,
                        default=1.0,
                        help='percentage of data use for training')
    parser.add_argument('--type',
                        type=str,
                        default='bnpu',
                        help='pu learning type (bnpu/bpu/upu)')  # bpu upu
    parser.add_argument('--num', type=str, default='0', help='file number')
    args = parser.parse_args()

    dp = DataPrepare(args.dataset)
    mutils = FeaturedDetectionModelUtils(dp)

    trainSet, validSet, testSet, prior = mutils.load_dataset(
        args.flag, args.dataset, args.pert, args.num)
    f1_best_train = 0
    trainSize = len(trainSet)
    validSize = len(validSet)
    testSize = len(testSet)
    print(("train set size: {}, valid set size: {}, test set size: {}").format(
        trainSize, validSize, testSize))

    charcnn = CharCNN(dp.char2Idx)
    wordnet = WordNet(dp.wordEmbeddings, dp.word2Idx)
    casenet = CaseNet(dp.caseEmbeddings, dp.case2Idx)
    featurenet = FeatureNet()
    pulstmcnn = PULSTMCNN(dp, charcnn, wordnet, casenet, featurenet, 150, 200,
                          1, args.drop_out)

    if torch.cuda.is_available():
        charcnn.cuda()
        wordnet.cuda()
        casenet.cuda()
        featurenet.cuda()
        pulstmcnn.cuda()
        torch.cuda.manual_seed(1013)

    trainer = Trainer(pulstmcnn, prior, args.beta, args.gamma, args.lr, args.m)

    time = 0

    bar = ProgressBar(maxval=int((len(trainSet) - 1) / args.batch_size))

    train_sentences = dp.read_origin_file("data/" + args.dataset +
                                          "/train.txt")
    trainSize = int(len(train_sentences) * args.pert)
    train_sentences = train_sentences[:trainSize]
    train_words = []
    train_efs = []
    for s in train_sentences:
        temp = []
        temp2 = []
        for word, ef, lf in s:
            temp.append(word)
            temp2.append(ef)
        train_words.append(temp)
        train_efs.append(temp2)

    valid_sentences = dp.read_origin_file("data/" + args.dataset +
                                          "/valid.txt")
    valid_words = []
    valid_efs = []
    for s in valid_sentences:
        temp = []
        temp2 = []
        for word, ef, lf in s:
            temp.append(word)
            temp2.append(ef)
        valid_words.append(temp)
        valid_efs.append(temp2)

    test_sentences = dp.read_origin_file("data/" + args.dataset + "/test.txt")
    test_words = []
    test_efs = []
    for s in test_sentences:
        temp = []
        temp2 = []
        for word, ef, lf in s:
            temp.append(word)
            temp2.append(ef)
        test_words.append(temp)
        test_efs.append(temp2)

    for e in range(1, 1000):
        print("Epoch: {}".format(e))
        bar.start()
        risks = []
        prisks = []
        nrisks = []
        for step, (x_word_batch, x_case_batch, x_char_batch, x_feature_batch,
                   y_batch, flag_batch) in enumerate(
                       mutils.iterateSet(trainSet,
                                         batchSize=args.batch_size,
                                         mode="TRAIN")):
            bar.update(step)
            batch = [
                x_word_batch, x_case_batch, x_char_batch, x_feature_batch,
                y_batch, flag_batch
            ]
            acc, risk, prisk, nrisk = trainer.train_mini_batch(batch, args)
            risks.append(risk)
            prisks.append(prisk)
            nrisks.append(nrisk)
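        # risk is the overall PU objective for the batch; prisk / nrisk are
        # presumably its positive-labeled and unlabeled components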
        meanRisk = np.mean(np.array(risks))
        meanRisk2 = np.mean(np.array(prisks))
        meanRisk3 = np.mean(np.array(nrisks))
        print("risk: {}, prisk: {}, nrisk: {}".format(meanRisk, meanRisk2,
                                                      meanRisk3))

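        # decay the learning rate every 5 epochs and evaluate every
        # print_time epochs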
        if e % 5 == 0:
            trainer.decay_learning_rate(e, args.lr)
        if e % args.print_time == 0:
            # train set
            pred_train = []
            corr_train = []
            for step, (x_word_train_batch, x_case_train_batch,
                       x_char_train_batch, x_feature_train_batch,
                       y_train_batch) in enumerate(
                           mutils.iterateSet(trainSet,
                                             batchSize=100,
                                             mode="TEST",
                                             shuffle=False)):
                trainBatch = [
                    x_word_train_batch, x_case_train_batch, x_char_train_batch,
                    x_feature_train_batch
                ]
                correcLabels = []
                for x in y_train_batch:
                    for xi in x:
                        correcLabels.append(xi)
                lengths = [len(x) for x in x_word_train_batch]
                predLabels, _ = trainer.test(trainBatch, lengths)
                correcLabels = np.array(correcLabels)
                # print(predLabels)
                # print(correcLabels)
                assert len(predLabels) == len(correcLabels)
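                # trainer.test returns one flat label array per batch; slice it
                # back into per-sentence predictions using the sentence lengths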
                start = 0
                for i, l in enumerate(lengths):
                    end = start + l
                    p = predLabels[start:end]
                    c = correcLabels[start:end]
                    pred_train.append(p)
                    corr_train.append(c)
                    start = end

            # prec, rec, f1 = dp.evaluate_ner_tagging("data/" + args.dataset + "/train.txt",pred_train)

            newSentences = []
            for i, s in enumerate(train_words):
                sent = []
                assert len(s) == len(train_efs[i]) == len(pred_train[i])
                for j, item in enumerate(s):
                    sent.append([item, train_efs[i][j], pred_train[i][j]])
                newSentences.append(sent)

            newSentences_, newLabels, newPreds = dp.wordLevelGeneration(
                newSentences)
            print("newlabels : ", len(newLabels))
            print("newpreds : ", len(newPreds))
            print(newLabels[0])
            p_train, r_train, f1_train = dp.compute_precision_recall_f1(
                newLabels, newPreds, args.flag, 1)
            print("Precision: {}, Recall: {}, F1: {}".format(
                p_train, r_train, f1_train))
            print(trainer.bestResult)
            if f1_train > f1_best_train:
                f1_best_train = f1_train
                p_best_train = p_train
                r_best_train = r_train
                p_train_partial, r_train_partial, f1_train_partial, partial_entity_train, partial_ratio_train = dp.compute_precision_recall_f1_partial(
                    newLabels, newPreds, args.flag, 1, True)

            # valid set
            pred_valid = []
            corr_valid = []
            for step, (x_word_test_batch, x_case_test_batch, x_char_test_batch,
                       x_feature_test_batch, y_test_batch) in enumerate(
                           mutils.iterateSet(validSet,
                                             batchSize=100,
                                             mode="TEST",
                                             shuffle=False)):
                validBatch = [
                    x_word_test_batch, x_case_test_batch, x_char_test_batch,
                    x_feature_test_batch
                ]
                correcLabels = []
                for x in y_test_batch:
                    for xi in x:
                        correcLabels.append(xi)
                lengths = [len(x) for x in x_word_test_batch]
                predLabels, _ = trainer.test(validBatch, lengths)
                correcLabels = np.array(correcLabels)
                assert len(predLabels) == len(correcLabels)

                start = 0
                for i, l in enumerate(lengths):
                    end = start + l
                    p = predLabels[start:end]
                    c = correcLabels[start:end]
                    pred_valid.append(p)
                    corr_valid.append(c)
                    start = end

            newSentencesValid = []
            for i, s in enumerate(valid_words):
                sent = []
                assert len(s) == len(valid_efs[i]) == len(pred_valid[i])
                for j, item in enumerate(s):
                    sent.append([item, valid_efs[i][j], pred_valid[i][j]])
                newSentencesValid.append(sent)

            newSentencesValid_, newLabelsValid, newPredsValid = dp.wordLevelGeneration(
                newSentencesValid)
            p_valid, r_valid, f1_valid = dp.compute_precision_recall_f1(
                newLabelsValid, newPredsValid, args.flag, 1)
            print("Precision: {}, Recall: {}, F1: {}".format(
                p_valid, r_valid, f1_valid))

            if f1_valid <= trainer.bestResult:
                time += 1
            else:
                trainer.bestResult = f1_valid
                time = 0
                best_precision_valid = p_valid
                best_rappel_valid = r_valid
                p_valid_partial, r_valid_partial, f1_valid_partial, partial_entity_valid, partial_ratio_valid = dp.compute_precision_recall_f1_partial(
                    newLabelsValid, newPredsValid, args.flag, 1, True)
                trainer.save((
                    "saved_model/{}_{}_{}_{}_lr_{}_prior_{}_beta_{}_gamma_{}_percent_{}"
                ).format(args.num, args.type, args.dataset, args.flag,
                         trainer.learningRate, trainer.m, trainer.beta,
                         trainer.gamma, args.pert))
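            # early stopping: halt once validation F1 has failed to improve for
            # more than 5 evaluation rounds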
            if time > 5:
                with open('result_val.txt', 'a') as f_result_valid:
                    print(("BEST RESULT ON VALIDATE DATA:{}").format(
                        trainer.bestResult),
                          file=f_result_valid)
                break

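    # reload the checkpoint that achieved the best validation F1 before the
    # final test-set evaluation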
    pulstmcnn.load_state_dict(
        torch.load(
            "saved_model/{}_{}_{}_{}_lr_{}_prior_{}_beta_{}_gamma_{}_percent_{}"
            .format(args.num, args.type, args.dataset, args.flag,
                    trainer.learningRate, trainer.m, trainer.beta,
                    trainer.gamma, args.pert)))

    pred_test = []
    corr_test = []
    for step, (x_word_test_batch, x_case_test_batch, x_char_test_batch,
               x_feature_test_batch, y_test_batch) in enumerate(
                   mutils.iterateSet(testSet,
                                     batchSize=100,
                                     mode="TEST",
                                     shuffle=False)):
        testBatch = [
            x_word_test_batch, x_case_test_batch, x_char_test_batch,
            x_feature_test_batch
        ]
        correcLabels = []
        for x in y_test_batch:
            for xi in x:
                correcLabels.append(xi)
        lengths = [len(x) for x in x_word_test_batch]
        predLabels, _ = trainer.test(testBatch, lengths)
        correcLabels = np.array(correcLabels)
        assert len(predLabels) == len(correcLabels)
        start = 0
        for i, l in enumerate(lengths):
            end = start + l
            p = predLabels[start:end]
            c = correcLabels[start:end]
            pred_test.append(p)
            corr_test.append(c)
            start = end

    newSentencesTest = []
    for i, s in enumerate(test_words):
        sent = []
        assert len(s) == len(test_efs[i]) == len(pred_test[i])
        for j, item in enumerate(s):
            sent.append([item, test_efs[i][j], pred_test[i][j]])
        newSentencesTest.append(sent)

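    # note: the *Valid names below are reused here for the test-set evaluation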
    newSentencesValid_, newLabelsValid, newPredsValid = dp.wordLevelGeneration(
        newSentencesTest)
    p_valid, r_valid, f1_valid = dp.compute_precision_recall_f1(
        newLabelsValid, newPredsValid, args.flag, 1)
    p_test_partial, r_test_partial, f1_test_partial, partial_entity_test, partial_ratio_test = dp.compute_precision_recall_f1_partial(
        newLabelsValid, newPredsValid, args.flag, 1, True)
    dutils = utils.dict_utils.DictUtils()
    inDico = get_dico(dp, dutils, "data/" + args.dataset + "/test.txt",
                      "fulldico.txt", args.flag, "Train", args.dataset)
    p_test_gene, r_test_gene, f1_test_gene, partial_test_gene, rpartial_test_gene, f1partial_test_gene, nb_notindico = dp.compute_metrics_generalize(
        newLabelsValid, newPredsValid, args.flag, 1, inDico)

    result_file = "results/" + args.num + "_final_results.txt"
    with open(result_file, "a+") as fw:
        print(args.lr, args.beta, args.gamma, args.drop_out, args.m, file=fw)
        print(p_best_train, r_best_train, f1_best_train, file=fw)
        print(p_train_partial,
              r_train_partial,
              f1_train_partial,
              partial_entity_train,
              partial_ratio_train,
              file=fw)

        print(best_precision_valid,
              best_rappel_valid,
              trainer.bestResult,
              file=fw)
        print(p_valid_partial,
              r_valid_partial,
              f1_valid_partial,
              partial_entity_valid,
              partial_ratio_valid,
              file=fw)

        print(p_valid, r_valid, f1_valid, file=fw)
        print(p_test_partial,
              r_test_partial,
              f1_test_partial,
              partial_entity_test,
              partial_ratio_test,
              file=fw)

        print(p_test_gene, r_test_gene, f1_test_gene, file=fw)
        print(partial_test_gene,
              rpartial_test_gene,
              f1partial_test_gene,
              file=fw)
        print(nb_notindico, file=fw)

Example #5
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="PU NER EVL")
    parser.add_argument('--dataset', default="conll2003")
    parser.add_argument('--type', default="bnpu")
    args = parser.parse_args()

    filenames = [
        'result/' + args.type + '_feature_pu_' + args.dataset + '_PER_0.txt',
        'result/' + args.type + '_feature_pu_' + args.dataset + '_LOC_0.txt',
        'result/' + args.type + '_feature_pu_' + args.dataset + '_ORG_0.txt',
        'result/' + args.type + '_feature_pu_' + args.dataset + '_MISC_0.txt'
    ]

    origin_file = "data/" + args.dataset + "/test.txt"
    dp = DataPrepare(args.dataset)

    test_sentences = dp.read_origin_file(origin_file)
    test_words = []
    test_efs = []
    lens = []
    for s in test_sentences:
        temp = []
        temp2 = []
        for word, ef, lf in s:
            temp.append(word)
            temp2.append(ef)
        test_words.append(temp)
        test_efs.append(temp2)
        lens.append(len(s))
Example #6
import argparse

import numpy as np
import torch

from sub_model import CharCNN, CaseNet, WordNet
from utils.data_utils import DataPrepare
# assumed import path, by analogy with the repo's other utils modules
from utils.supervised_model_utils import SupervisedModelUtils

torch.manual_seed(10)

parser = argparse.ArgumentParser(description="PU NER")
parser.add_argument('--model', default="")
parser.add_argument('--output', type=int, default=0)
parser.add_argument('--set', type=int, default=0)
parser.add_argument('--flag', default="PER")
parser.add_argument('--lr_rate', type=float, default=1e-4)
parser.add_argument('--dataset', default="conll2003")
parser.add_argument('--pert', type=float, default=1.0)
args = parser.parse_args()

dp = DataPrepare(args.dataset)
mutils = SupervisedModelUtils(dp)

trainSet, validSet, testSet = mutils.load_dataset(args.flag, args.dataset,
                                                  args.pert)
setIter = [testSet, validSet]
detectionSet = setIter[args.set]
fileNameIter = [
    "data/" + args.dataset + "/test.txt", "data/" + args.dataset + "/valid.txt"
]
fileName = fileNameIter[args.set]

charcnn = CharCNN(dp.char2Idx)
wordnet = WordNet(dp.wordEmbeddings, dp.word2Idx)
casenet = CaseNet(dp.caseEmbeddings, dp.case2Idx)
Example #7
if __name__ == "__main__":
    from utils.add_unlabeled_data_model_utils import AddUnlabeledModelUtils
    from utils.data_utils import DataPrepare
    dp = DataPrepare("conll2003")
    temp = AddUnlabeledModelUtils(dp)
    temp.load_dataset_("PER", "conll2003", 10000)
    temp.load_dataset_("PER", "conll2003", 20000)
Example #8
    parser = argparse.ArgumentParser(description="PU NER")
    parser.add_argument('--gamma', type=float, default=1.0, help='gamma of pu learning (default 1.0)')
    parser.add_argument('--drop_out', type=float, default=0.5, help='dropout rate')
    parser.add_argument('--m', type=float, default=1.5, help='class balance rate')
    parser.add_argument('--p', type=float, default=0.1, help='estimate value of prior')
    parser.add_argument('--flag', default="Disease", help='entity type (e.g. PER/LOC/ORG/MISC/Disease)')
    parser.add_argument('--dataset', default="disease", help='name of the dataset')
    parser.add_argument('--lr', type=float, default=1e-4, help='learning rate')
    parser.add_argument('--batch_size', type=int, default=100, help='batch size for training and testing')
    parser.add_argument('--output', type=int, default=0, help='write the test result, set 1 for writing result to file')
    parser.add_argument('--num', type=str, default='', help='file number')
    parser.add_argument('--model', default="", help='saved model name')
    parser.add_argument('--iter', type=int, default=1, help='iteration time')

    args = parser.parse_args()

    dp = DataPrepare(args.dataset)
    mutils = AdaptivePUUtils(dp)
    dutils = DictUtils()

    trainSet, validSet, testSet, prior = mutils.load_new_dataset(args.flag, args.dataset, args.iter, args.p, args.num)
    print(prior)
    trainSize = len(trainSet)
    validSize = len(validSet)
    testSize = len(testSet)
    print(("train set size: {}, valid set size: {}, test set size: {}").format(trainSize, validSize, testSize))

    charcnn = CharCNN(dp.char2Idx)
    wordnet = WordNet(dp.wordEmbeddings, dp.word2Idx)
    casenet = CaseNet(dp.caseEmbeddings, dp.case2Idx)
    featurenet = FeatureNet()
    pulstmcnn = AdaPULSTMCNN2(dp, charcnn, wordnet, casenet, featurenet, 150, 200, 1, args.drop_out)
Example #9
def compute_all_prior(sentences):
    alltokens = 0
    labeledtokens = 0
    unlabeledtokens = 0
    for sentence in sentences:
        for word, ef, lf in sentence:
            alltokens += 1
            if ef == 1:
                labeledtokens += 1
            else:
                unlabeledtokens += 1

    prior = float(labeledtokens / alltokens)
    assert alltokens == labeledtokens + unlabeledtokens
    print(prior)


if __name__ == "__main__":
    dataset = "muc"
    flag = "PER"
    dp = DataPrepare(dataset)
    filename = "data/" + dataset + "/train." + flag + ".txt"
    sentences = dp.read_processed_file(filename, flag)
    compute_all_prior(sentences)
    flag = "LOC"
    filename = "data/" + dataset + "/train." + flag + ".txt"
    sentences = dp.read_processed_file(filename, flag)
    compute_all_prior(sentences)
    flag = "ORG"
    filename = "data/" + dataset + "/train." + flag + ".txt"
    sentences = dp.read_processed_file(filename, flag)
    compute_all_prior(sentences)
    # flag = "MISC"
    # filename = "data/" + dataset + "/train." + flag + ".txt"
    # sentences = dp.read_processed_file(filename, flag)
    # compute_all_prior(sentences)
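
    # toy illustration on hypothetical data (not from this repo): two labeled
    # tokens out of four gives a prior of 0.5
    toy = [[("John", 1, 0), ("lives", 0, 0)], [("Paris", 1, 0), ("!", 0, 0)]]
    compute_all_prior(toy)  # prints 0.5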