Example #1
import codecs
import collections
import sys


def read_file(filename,
              w2i,
              t2i,
              c2i,
              max_iter=sys.maxsize,
              processing_word=get_processing_word(lowercase=False)):
    """
    Read in a dataset and turn it into a list of instances.
    Modifies the w2i, t2is and c2i dicts, adding new words/attributes/tags/chars 
    as it sees them.
    """
    instances = []
    vocab_counter = collections.Counter()
    niter = 0
    with codecs.open(filename, "r", "utf-8") as f:
        words, tags = [], []
        for line in f:
            line = line.strip()
            if line == 'BMES_BREAK' or line.startswith("-DOCSTART-"):
                if len(words) != 0:
                    niter += 1
                    if max_iter is not None and niter > max_iter:
                        break
                    instances.append(Instance(words, tags))
                    words, tags = [], []
            else:
                word, tag = line.split()
                word = processing_word(word)
                vocab_counter[word] += 1
                if word not in w2i:
                    w2i[word] = len(w2i)
                if tag not in t2i:
                    t2i[tag] = len(t2i)
                if is_dataset_tag(word):
                    if word not in c2i:
                        c2i[word] = len(c2i)
                else:
                    for c in word:
                        if c not in c2i:
                            c2i[c] = len(c2i)
                words.append(w2i[word])
                tags.append(t2i[tag])
        # Flush the final sentence if the file doesn't end with a break marker
        if len(words) != 0:
            instances.append(Instance(words, tags))
    return instances, vocab_counter
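A minimal usage sketch: `read_file` expects a two-column `word tag` file with sentences separated by `BMES_BREAK` lines. The corpus path is hypothetical, and the dicts are assumed to start empty (in practice they may be pre-seeded with reserved symbols):

w2i, t2i, c2i = {}, {}, {}
instances, vocab = read_file("data/train.txt", w2i, t2i, c2i)
print(len(instances), "sentences,", len(w2i), "word types")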
Example #2
    # Model constructor (the enclosing class definition is not shown)
    def __init__(self,
                 tagset_size,
                 num_lstm_layers,
                 hidden_dim,
                 word_embeddings,
                 no_we_update,
                 use_char_rnn,
                 char_embeddings,
                 char_hidden_dim,
                 margins,
                 lowercase_words,
                 vocab_size=None,
                 word_embedding_dim=DEFAULT_WORD_EMBEDDING_SIZE,
                 charset_size=None,
                 char_embedding_dim=50,
                 tie_two_embeddings=False,
                 use_we=True):
        self.dropout = None
        self.model = dy.Model()
        self.tagset_size = tagset_size
        self.margins = margins
        self.we_update = not no_we_update
        self.lowercase_words = lowercase_words

        # Word embedding parameters
        self.use_we = use_we
        if use_we:
            if word_embeddings is not None:  # Use pretrained embeddings
                vocab_size = word_embeddings.shape[0]
                word_embedding_dim = word_embeddings.shape[1]
            self.words_lookup = self.model.add_lookup_parameters(
                (vocab_size, word_embedding_dim))
            if word_embeddings is not None:
                self.words_lookup.init_from_array(word_embeddings)
        else:
            self.words_lookup = None

        # Bigram embeddings. Note: `options`, `b2i` and `bigram_embeddings`
        # are module-level globals, not constructor arguments.
        if options.bigram:
            self.bigram_lookup = self.model.add_lookup_parameters(
                (len(b2i), word_embedding_dim))
            self.bigram_lookup.init_from_array(bigram_embeddings)

        # Char LSTM Parameters
        self.use_char_rnn = use_char_rnn
        if use_char_rnn:
            if char_embeddings is not None:
                charset_size = char_embeddings.shape[0]
                char_embedding_dim = char_embeddings.shape[1]
            self.char_embedding_dim = char_embedding_dim
            if tie_two_embeddings:
                self.char_lookup = self.words_lookup
            else:
                self.char_lookup = self.model.add_lookup_parameters(
                    (charset_size, self.char_embedding_dim))
                if char_embeddings is not None:
                    self.char_lookup.init_from_array(char_embeddings)
            self.char_bi_lstm = dy.BiRNNBuilder(1, self.char_embedding_dim,
                                                char_hidden_dim, self.model,
                                                dy.LSTMBuilder)

            # Cache char ids per word for speed. Note: `w2i` and `c2i` are
            # module-level vocabularies, not constructor arguments.
            self.word_to_char_ids = dict()
            for word, word_id in w2i.items():
                # Note: use original casing ("word") for characters
                if utils.is_dataset_tag(word):
                    char_ids = [c2i[word]]
                else:
                    char_ids = [c2i[c] for c in word]
                self.word_to_char_ids[word_id] = char_ids

        # Word LSTM parameters
        if use_char_rnn:
            if use_we:
                input_dim = word_embedding_dim + char_hidden_dim
            else:
                input_dim = char_hidden_dim
        else:
            input_dim = word_embedding_dim
        self.bi_lstm = dy.BiRNNBuilder(num_lstm_layers, input_dim, hidden_dim,
                                       self.model, dy.LSTMBuilder)
        # Matrix that maps from Bi-LSTM output to num tags
        if options.bigram:
            self.lstm_to_tags_params = self.model.add_parameters(
                (tagset_size, hidden_dim + word_embedding_dim * 2))
        else:
            self.lstm_to_tags_params = self.model.add_parameters(
                (tagset_size, hidden_dim))
        self.lstm_to_tags_bias = self.model.add_parameters(tagset_size)
        self.mlp_out = self.model.add_parameters((tagset_size, tagset_size))
        self.mlp_out_bias = self.model.add_parameters(tagset_size)

        # Transition matrix for tagging layer, [i,j] is score of transitioning to i from j
        self.transitions = self.model.add_lookup_parameters(
            (tagset_size, tagset_size))
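A hedged construction sketch: the class name `BiLSTM_CRF` is an assumption (only `__init__` is shown), all sizes are illustrative, and the module-level `options.bigram` flag is assumed to be off:

model = BiLSTM_CRF(            # hypothetical class name
    tagset_size=4,             # e.g. B/M/E/S tags
    num_lstm_layers=1,
    hidden_dim=100,
    word_embeddings=None,      # None -> randomly initialized lookup table
    no_we_update=False,
    use_char_rnn=False,        # skip the char-LSTM branch entirely
    char_embeddings=None,
    char_hidden_dim=50,
    margins=1.0,
    lowercase_words=False,
    vocab_size=10000)          # required when no pretrained embeddings are given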
Example #3
            # Dev-set evaluation loop (fragment of a larger function)
            for batch_id, batch in enumerate(
                    utils.minibatches(dev_instances, dev_batch_size)):
                for idx, instance in enumerate(batch):
                    sentence = instance.sentence
                    if len(sentence) == 0:
                        continue

                    gold_tags = instance.tags
                    losses = model.neg_log_loss(sentence, gold_tags)
                    total_loss = losses.scalar_value()
                    _, out_tags = model.viterbi_loss(sentence,
                                                     gold_tags,
                                                     use_margins=False)

                    sentence = utils.restore_sentence(sentence)
                    dataset_name = None
                    if utils.is_dataset_tag(i2w[sentence[0]]):
                        dataset_name = i2w[sentence[0]][1:-1]
                        if dataset_name not in prf_dataset:
                            prf_dataset[dataset_name] = utils.CWSEvaluator(t2i)
                        sentence = sentence[1:-1]
                        gold_tags = gold_tags[1:-1]
                        out_tags = out_tags[1:-1]
                        prf_dataset[dataset_name].add_instance(
                            gold_tags, out_tags)

                    prf.add_instance(gold_tags, out_tags)

                    gold_strings = utils.to_tag_strings(i2t, gold_tags)
                    obs_strings = utils.to_tag_strings(i2t, out_tags)

                    dev_total_instance += 1
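The loop above relies on a `utils.minibatches` helper; a plausible minimal version (an assumption, not necessarily the repo's implementation) simply yields fixed-size slices:

def minibatches(instances, batch_size):
    # Yield successive batch_size-sized slices of the instance list.
    for i in range(0, len(instances), batch_size):
        yield instances[i:i + batch_size]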
Example #4
import torch


# Relies on module-level globals: utils, logger, i2t, bigram_embedding, word_dic
def tester(model, test_batch, write_out=False):
    res = []
    prf = utils.CWSEvaluator(i2t)
    prf_dataset = {}
    oov_dataset = {}

    model.eval()
    for batch_x, batch_y in test_batch:
        with torch.no_grad():
            if bigram_embedding is not None:
                out = model(batch_x["task"], batch_x["uni"],
                            batch_x["seq_len"], batch_x["bi1"], batch_x["bi2"])
            else:
                out = model(batch_x["task"], batch_x["uni"],
                            batch_x["seq_len"])
        out = out["pred"]
        num = out.size(0)
        out = out.detach().cpu().numpy()
        for i in range(num):
            length = int(batch_x["seq_len"][i])

            out_tags = out[i, 1:length].tolist()
            sentence = batch_x["ori_words"][i]
            gold_tags = batch_y["tags"][i][1:length].numpy().tolist()
            dataset_name = sentence[0]
            sentence = sentence[1:]
            assert utils.is_dataset_tag(dataset_name)
            assert len(gold_tags) == len(out_tags) and len(gold_tags) == len(
                sentence)

            if dataset_name not in prf_dataset:
                prf_dataset[dataset_name] = utils.CWSEvaluator(i2t)
                oov_dataset[dataset_name] = utils.CWS_OOV(
                    word_dic[dataset_name[1:-1]])

            prf_dataset[dataset_name].add_instance(gold_tags, out_tags)
            prf.add_instance(gold_tags, out_tags)

            if write_out:
                gold_strings = utils.to_tag_strings(i2t, gold_tags)
                obs_strings = utils.to_tag_strings(i2t, out_tags)

                word_list = utils.bmes_to_words(sentence, obs_strings)
                oov_dataset[dataset_name].update(
                    utils.bmes_to_words(sentence, gold_strings), word_list)

                raw_string = ' '.join(word_list)
                # Wrap the decoded sentence with its dataset tag on both sides
                res.append(dataset_name + " " + raw_string + " " +
                           dataset_name)

    # Macro-average accumulators: precision, recall, F1, OOV recall, count
    Ap = 0.0
    Ar = 0.0
    Af = 0.0
    Aoov = 0.0
    tot = 0
    nw = 0.0
    for dataset_name, performance in sorted(prf_dataset.items()):
        p = performance.result()
        if write_out:
            nw = oov_dataset[dataset_name].oov()
            logger.info('{}\t{:04.2f}\t{:04.2f}\t{:04.2f}\t{:04.2f}'.format(
                dataset_name, p[0], p[1], p[2], nw))
        else:
            logger.info('{}\t{:04.2f}\t{:04.2f}\t{:04.2f}'.format(
                dataset_name, p[0], p[1], p[2]))
        Ap += p[0]
        Ar += p[1]
        Af += p[2]
        Aoov += nw
        tot += 1

    total = prf.result()
    logger.info('{}\t{:04.2f}\t{:04.2f}\t{:04.2f}'.format(
        'TOT', total[0], total[1], total[2]))
    if not write_out:
        logger.info('{}\t{:04.2f}\t{:04.2f}\t{:04.2f}'.format(
            'AVG', Ap / tot, Ar / tot, Af / tot))
    else:
        logger.info('{}\t{:04.2f}\t{:04.2f}\t{:04.2f}\t{:04.2f}'.format(
            'AVG', Ap / tot, Ar / tot, Af / tot, Aoov / tot))
    return total[-1], res
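A hypothetical call site: `tester` returns the last element of `CWSEvaluator.result()` (presumably the overall F1) together with the decoded lines when `write_out` is set; the output path is illustrative:

test_f1, decoded = tester(model, test_batch, write_out=True)
with open("decode.txt", "w", encoding="utf-8") as f:
    f.write("\n".join(decoded))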