Example #1
import numpy as np
from allennlp.predictors import TextClassifierPredictor


def get_predictions(model, data_reader, data_path):
    predictor = TextClassifierPredictor(model=model,
                                        dataset_reader=data_reader)
    data = list(data_reader.read(data_path))

    size = len(data)
    bound = 4000
    preds = []

    if size > bound:
        times = size // bound
        print(f"Set is too big; total size: {size}. "
              f"Batching {times} times.")

        # Predict in chunks of `bound` instances to keep memory bounded.
        for i in range(times):
            print(f"Lower: {bound * i}, Upper: {bound * (i + 1)}")
            preds += predictor.predict_batch_instance(
                data[bound * i:bound * (i + 1)])

        # Handle the final partial batch, if any.
        if size % bound > 0:
            print(f"Lower: {bound * times}, Upper: {size}")
            preds += predictor.predict_batch_instance(data[bound * times:])
    else:
        preds = predictor.predict_batch_instance(data)

    labelmap = model.vocab.get_index_to_token_vocabulary('labels')

    predictions = [labelmap[np.argmax(pred['probs'])] for pred in preds]
    actuals = [str(instance['label'].label) for instance in data]
    labels = list(labelmap.values())

    return actuals, predictions, labels
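
A hypothetical call site for the function above (not in the original): the three return values line up directly with scikit-learn's classification_report, assuming a trained model, its reader, and a test-data path are already in hand.

from sklearn.metrics import classification_report

# Hypothetical usage; `model`, `reader`, and the data path are assumptions.
actuals, predictions, labels = get_predictions(model, reader, 'data/test.jsonl')
print(classification_report(actuals, predictions, labels=labels))
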
Example #2
    def __init__(self):
        self.root_path = '/home/cym/jwtech_sci_bert'
        self.sentence_predictor = SentenceTaggerPredictor.from_path(
            os.path.join(self.root_path, 'modelsave_ner/model.tar.gz'))
        self.relation_predictor = TextClassifierPredictor.from_path(
            os.path.join(self.root_path, 'modelsave_rel/model.tar.gz'),
            predictor_name='text_classifier')
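
A minimal usage sketch for the loader above (not in the original; the enclosing class name `Pipeline` and the input sentence are assumptions):

# Hypothetical usage; `Pipeline` is an assumed name for the enclosing class.
pipeline = Pipeline()
tags = pipeline.sentence_predictor.predict(sentence='BERT improves NER.')
relation = pipeline.relation_predictor.predict(sentence='BERT improves NER.')
print(tags['tags'], relation['probs'])
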
Example #3
class ClassifierPredictor:
    def __init__(self, model: Classifier) -> None:
        self.model = model
        self.reader = ClassificationReader(skip_start_end=True)
        self.predictor = TextClassifierPredictor(self.model, self.reader)

    def predict(self, sequences: List[str]) -> np.ndarray:
        probs = [self.predictor.predict(seq)['probs'] for seq in sequences]
        probs = np.array(probs)
        return probs
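
A short hypothetical usage of the wrapper above (`trained_model` is an assumption, standing in for a fitted Classifier):

# Hypothetical usage; `trained_model` is an assumed Classifier instance.
clf = ClassifierPredictor(trained_model)
probs = clf.predict(["good movie", "terrible plot"])
print(probs.shape)  # (2, num_classes)
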
Example #4
    def test_interpret_fails_when_embedding_layer_not_found(self):
        inputs = {"sentence": "It was the ending that I hated"}
        vocab = Vocabulary()
        vocab.add_tokens_to_namespace(
            [w for w in inputs["sentence"].split(" ")])
        model = FakeModelForTestingInterpret(
            vocab, max_tokens=len(inputs["sentence"].split(" ")))
        predictor = TextClassifierPredictor(model,
                                            TextClassificationJsonReader())

        interpreter = SmoothGradient(predictor)
        with raises(RuntimeError):
            interpreter.saliency_interpret_from_json(inputs)
Example #5
    def test_interpret_fails_when_embedding_layer_not_found(self):
        inputs = {"sentence": "I always write unit tests for my code."}
        vocab = Vocabulary()
        vocab.add_tokens_to_namespace(
            [w for w in inputs["sentence"].split(" ")])
        model = FakeModelForTestingInterpret(
            vocab, max_tokens=len(inputs["sentence"].split(" ")))
        predictor = TextClassifierPredictor(model,
                                            TextClassificationJsonReader())

        hotflipper = Hotflip(predictor)
        with raises(RuntimeError):
            hotflipper.initialize()
Example #6
    # Simple LSTM
    if simple_lstm:
        EMBEDDING_DIM = 128
        HIDDEN_DIM = 128
        reader = StanfordSentimentTreeBankDatasetReader()
        train_dataset = reader.read('data/stanfordSentimentTreebank/trees/train.txt')
        dev_dataset = reader.read('data/stanfordSentimentTreebank/trees/dev.txt')
        test_dataset = reader.read('data/stanfordSentimentTreebank/trees/test.txt')
        vocab = Vocabulary.from_instances(train_dataset + dev_dataset, min_count={'tokens': 3})
        token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'), embedding_dim=EMBEDDING_DIM)
        word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})
        lstm = PytorchSeq2VecWrapper(torch.nn.LSTM(EMBEDDING_DIM, HIDDEN_DIM, batch_first=True))
        model = LstmClassifier(word_embeddings, lstm, vocab)
        with open("models/simple_LSTM_sentiment_classifier.th", 'rb') as f:
            model.load_state_dict(torch.load(f))
        predictor = TextClassifierPredictor(model, dataset_reader=reader)
        test_results = predictor.predict_batch_instance(test_dataset)

    # ELMo LSTM
    if elmo_lstm:
        elmo_embedding_dim = 256
        HIDDEN_DIM = 128
        elmo_token_indexer = ELMoTokenCharactersIndexer()
        reader = StanfordSentimentTreeBankDatasetReader(token_indexers={'tokens': elmo_token_indexer})
        train_dataset = reader.read('data/stanfordSentimentTreebank/trees/train.txt')
        dev_dataset = reader.read('data/stanfordSentimentTreebank/trees/dev.txt')
        test_dataset = reader.read('data/stanfordSentimentTreebank/trees/test.txt')
        vocab = Vocabulary.from_instances(train_dataset + dev_dataset, min_count={'tokens': 3})
        options_file = 'data/elmo/elmo_2x1024_128_2048cnn_1xhighway_options.json'
        weight_file = 'data/elmo/elmo_2x1024_128_2048cnn_1xhighway_weights.hdf5'
        elmo_embedder = ElmoTokenEmbedder(options_file, weight_file)
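        # --- Hedged sketch: the original snippet breaks off here. The lines
        # --- below mirror the simple-LSTM branch; the checkpoint path is an
        # --- assumption, not part of the original example.
        word_embeddings = BasicTextFieldEmbedder({"tokens": elmo_embedder})
        lstm = PytorchSeq2VecWrapper(
            torch.nn.LSTM(elmo_embedding_dim, HIDDEN_DIM, batch_first=True))
        model = LstmClassifier(word_embeddings, lstm, vocab)
        # Assumed checkpoint path, by analogy with the simple-LSTM branch.
        with open("models/elmo_LSTM_sentiment_classifier.th", 'rb') as f:
            model.load_state_dict(torch.load(f))
        predictor = TextClassifierPredictor(model, dataset_reader=reader)
        test_results = predictor.predict_batch_instance(test_dataset)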
Example #7
# Training
trainer = Trainer(
    model=model,
    optimizer=optimizer,
    iterator=iterator,
    train_dataset=train_data,
    validation_dataset=validation_data,
    patience=2,  # early stopping if the validation metric is stuck for 2 epochs
    num_epochs=10,  # increase later; kept small for a quick first try
    cuda_device=cuda_device)
trainer.train()

# predictor
pre_example = 'Good morning'
# The predictor needs the trained model and the DatasetReader used to build
# train_data; `reader` is that reader's assumed variable name.
predictor = TextClassifierPredictor(model, dataset_reader=reader)
pre = predictor.predict(pre_example)

# save the model
with open('./tmp/classifier_biattention_model.th', 'wb') as f:
    torch.save(model.state_dict(), f)

# save the vocabulary
vocab.save_to_files('./tmp/vocabulary')

# reload the model
# vocab2 = Vocabulary.from_files('./tmp/vocabulary')
# model2 = BiattentiveClassificationNetwork(word_embeddings, encoder, vocab2)
# with open('./tmp/classifier_biattention_model.th', 'rb') as f:
#     model2.load_state_dict(torch.load(f))
# if cuda_device > -1:
Example #8
    def __init__(self, model: Classifier) -> None:
        self.model = model
        self.reader = ClassificationReader(skip_start_end=True)
        self.predictor = TextClassifierPredictor(self.model, self.reader)
Example #9
def _get_classifier_from_args(vocab: Vocabulary, path: str):
    with open(path) as file:
        args = json.load(file)
    num_classes = args['num_classes']
    return get_classification_model(vocab, int(num_classes))


if __name__ == '__main__':
    args = parser.parse_args()

    class_reader = ClassificationReader(skip_start_end=True)
    class_vocab = Vocabulary.from_files(Path(args.classifier_path) / 'vocab')
    class_model = _get_classifier_from_args(class_vocab, Path(args.classifier_path) / 'args.json')
    load_weights(class_model, Path(args.classifier_path) / 'best.th')

    predictor = TextClassifierPredictor(class_model, class_reader)
    max_tokens = args.max_tokens or class_vocab.get_vocab_size('tokens')
    attacker = HotFlipFixed(predictor, max_tokens=max_tokens)
    attacker.initialize()

    data = pd.read_csv(args.csv_path)
    sequences = data['sequences'].tolist()[:args.sample]
    labels = data['labels'].tolist()[:args.sample]

    results_path = Path(args.results_path) / datetime.now().strftime('%Y%m%d_%H%M%S')
    results_path.mkdir(exist_ok=True, parents=True)
    path_to_results_file = results_path / 'results.csv'
    dump_metrics(results_path / 'args.json', args.__dict__)
    with open(path_to_results_file, 'w', newline='') as csv_write:
        fieldnames = list(AttackerOutput.__annotations__.keys())
        writer = csv.DictWriter(csv_write, fieldnames=fieldnames)
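        writer.writeheader()
        # Hypothetical continuation (the snippet breaks off above): the
        # attack call and AttackerOutput's shape are assumptions about the
        # surrounding project, not AllenNLP API.
        for seq, label in zip(sequences, labels):
            output = attacker.attack(seq, label)  # assumed signature
            writer.writerow(vars(output))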