def apply_model(model,
                outfilename,
                word_to_ix,
                all_tags=None,
                trainfile=TRAIN_FILE,
                testfile=DEV_FILE):
    """
    applies the model on the data and writes the best sequence of tags to the outfile
    """
    if all_tags is None:
        all_tags = set()

        # this is slow
        for words, tags in preproc.conll_seq_generator(trainfile):
            for tag in tags:
                all_tags.add(tag)

    with open(outfilename, 'w') as outfile:
        for words, _ in preproc.conll_seq_generator(testfile):
            seq_words = bilstm.prepare_sequence(words, word_to_ix)
            pred_tags = model.predict(seq_words)
            for tag in pred_tags:
                outfile.write(tag + '\n')
            outfile.write('\n')
def setup():
    global word_to_ix, tag_to_ix, X_tr, Y_tr, model
    
    vocab, word_to_ix = most_common.get_word_to_ix(TRAIN_FILE, max_size=6900)
    tag_to_ix = {}
    for words, tags in preproc.conll_seq_generator(TRAIN_FILE):
        for tag in tags:
            if tag not in tag_to_ix:
                tag_to_ix[tag] = len(tag_to_ix)

    if START_TAG not in tag_to_ix:
        tag_to_ix[START_TAG] = len(tag_to_ix)
    if END_TAG not in tag_to_ix:
        tag_to_ix[END_TAG] = len(tag_to_ix)

    X_tr = []
    Y_tr = []
    for words, tags in preproc.conll_seq_generator(TRAIN_FILE):
        X_tr.append(words)
        Y_tr.append(tags)

    torch.manual_seed(711)

    embedding_dim = 30
    hidden_dim = 30
    model = bilstm.BiLSTM_CRF(len(word_to_ix), tag_to_ix, embedding_dim, hidden_dim)
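A minimal usage sketch (an assumption, not part of the original): run setup() and then apply the model to the dev set. The output file name is hypothetical, and the model would normally be trained between the two calls.

setup()
# NOTE: train the model here before applying it; an untrained
# BiLSTM-CRF will run but produce essentially random tags
apply_model(model, 'bilstm-crf-dev.preds', word_to_ix, testfile=DEV_FILE)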
Example #3
def apply_tagger(tagger, outfilename, all_tags=None, trainfile=TRAIN_FILE, testfile=DEV_FILE):
    """
    applies the tagger to the test data and writes the predicted tag sequences to outfilename
    """
    if all_tags is None:
        all_tags = set()

        # this is slow
        for words, tags in preproc.conll_seq_generator(trainfile):
            for tag in tags:
                all_tags.add(tag)

    with open(outfilename, 'w') as outfile:
        for words, _ in preproc.conll_seq_generator(testfile):
            pred_tags = tagger(words, all_tags)
            for tag in pred_tags:
                outfile.write(tag + '\n')
            outfile.write('\n')
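Since apply_tagger accepts any callable mapping (words, all_tags) to a tag sequence, a trivial baseline can be plugged in directly. A hedged sketch, with a hypothetical constant tagger and output file name:

def noun_tagger(words, all_tags):
    # hypothetical baseline: tag every token with 'NOUN'
    return ['NOUN'] * len(words)

apply_tagger(noun_tagger, 'noun-baseline.preds')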
Example #4
def get_most_common_word_weights(trainfile):
    """
    Return a set of weights, so that each word is tagged by its most frequent tag in the training file.
    If the word does not appear in the training data,
    the weights should be set so that the tagger outputs the most common tag in the training data.
    For the out of vocabulary words, you need to think on how to set the weights so that you tag them by the most common tag.
    
    Parameters:
    trainfile: -- training file
    :returns: -- classification weights
    :rtype: -- defaultdict

    """
    all_counters = defaultdict(lambda: Counter())
    tag_counter = Counter()
    for (words, tags) in conll_seq_generator(trainfile):
        for word, tag in zip(words, tags):
            all_counters[word][tag] += 1
            tag_counter[tag] += 1

    temp = {}
    for word in all_counters.keys():
        # weight 1 on the (most frequent tag, word) feature
        best_tag = all_counters[word].most_common(1)[0][0]
        temp[(best_tag, word)] = 1

    # bias weight of 0.5 on the overall most common tag, so OOV words
    # (whose word features all score 0) fall back to it
    most_common_tag = tag_counter.most_common(1)[0][0]
    temp[(most_common_tag, OFFSET)] = 0.5
    weights = defaultdict(float, temp)

    return weights
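These weights are meant to be consumed by a linear scorer over (tag, feature) pairs. A minimal sketch of that scoring rule, assuming a tag set all_tags; the function name is hypothetical:

def tag_word(word, weights, all_tags):
    # for an OOV word every (tag, word) weight is 0.0, so only the
    # 0.5 bias on (most_common_tag, OFFSET) decides the argmax
    return max(all_tags, key=lambda tag: weights[(tag, word)] + weights[(tag, OFFSET)])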
Example #5
def get_most_common_word_weights(trainfile):
    """
    Return a set of weights, so that each word is tagged by its most frequent tag in the training file.
    If the word does not appear in the training data,
    the weights should be set so that the tagger outputs the most common tag in the training data.
    For the out of vocabulary words, you need to think on how to set the weights so that you tag them by the most common tag.
    
    Parameters:
    trainfile: -- training file
    :returns: -- classification weights
    :rtype: -- defaultdict

    """
    weights = defaultdict(float)

    # from 2.1: per-word counters of tag occurrences
    all_counters = defaultdict(lambda: Counter())
    tag_counter = Counter()

    for words, tags in conll_seq_generator(trainfile):
        for j in range(len(words)):
            # defaultdict creates the per-word Counter on first access
            all_counters[words[j]][tags[j]] += 1
            tag_counter[tags[j]] += 1

    for key in all_counters:
        for label, count in all_counters[key].items():
            weights[(label, key)] = count

    # bias on the overall most common tag, so OOV words fall back to it
    weights[(tag_counter.most_common(1)[0][0], OFFSET)] = 0.1

    return weights
Example #6
def get_tag_word_counts(trainfile):
    """
    Produce a Counter of occurences of word for each tag
    
    Parameters:
    trainfile: -- the filename to be passed as argument to conll_seq_generator
    :returns: -- a default dict of counters, where the keys are tags.
    """
    all_counters = defaultdict(lambda: Counter())
    all_tags = set([])
    for (words, tags) in conll_seq_generator(trainfile):
        for tag in tags:
            all_tags.add(tag)

    for tag in all_tags:
        myCounter = Counter()
        for (words, tk) in conll_seq_generator(trainfile):
            for i in range(len(words)):
                if tk[i] == tag:
                    myCounter[words[i]] += 1
        #print(myCounter)
        all_counters[tag] = myCounter
    #print(all_counters)
    return all_counters
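A quick usage sketch; the tag 'NOUN' is an assumption for illustration:

counts = get_tag_word_counts(TRAIN_FILE)
print(counts['NOUN'].most_common(3))  # the three words most frequently tagged NOUN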
Example #7
def get_tag_trans_counts(trainfile):
    """compute a dict of counters for tag transitions

    :param trainfile: name of file containing training data
    :returns: dict, in which keys are tags, and values are counters of succeeding tags
    :rtype: dict

    """
    total_counts = defaultdict(lambda: Counter())
    for (words, tags) in conll_seq_generator(trainfile):
        tags = [START_TAG] + tags + [END_TAG]
        for i in range(len(tags) - 1):
            total_counts[tags[i]][tags[i + 1]] += 1

    return dict(total_counts)
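These transition counts are typically normalized into smoothed log-probabilities for an HMM. A minimal sketch, assuming add-k smoothing over a given tag set; the function and its signature are illustrative, not from the original:

from math import log

def trans_log_probs(trans_counts, all_tags, smoothing=0.01):
    # P(next | prev) with add-k smoothing; a Counter returns 0 for unseen successors
    log_probs = {}
    for prev, counter in trans_counts.items():
        total = sum(counter.values()) + smoothing * len(all_tags)
        for tag in all_tags:
            log_probs[(prev, tag)] = log((counter[tag] + smoothing) / total)
    return log_probs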
Example #8
def get_nb_weights(trainfile, smoothing):
    """
    estimate_nb function assumes that the labels are one for each document, where as in POS tagging: we have labels for 
    each particular token. So, in order to calculate the emission score weights: P(w|y) for a particular word and a 
    token, we slightly modify the input such that we consider each token and its tag to be a document and a label. 
    The following helper code converts the dataset to token level bag-of-words feature vector and labels. 
    The weights obtained from here will be used later as emission scores for the viterbi tagger.
    
    inputs: train_file: input file to obtain the nb_weights from
    smoothing: value of smoothing for the naive_bayes weights
    
    :returns: nb_weights: naive bayes weights
    """
    token_level_docs = []
    token_level_tags = []
    for words, tags in preproc.conll_seq_generator(trainfile):
        token_level_docs += [{word: 1} for word in words]
        token_level_tags += tags
    nb_weights = estimate_nb(token_level_docs, token_level_tags, smoothing)
    
    return nb_weights
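To make the conversion concrete, here is what the loop above produces for one made-up sentence (purely illustrative data):

# words = ['the', 'dog', 'barks'], tags = ['DET', 'NOUN', 'VERB']
# token_level_docs -> [{'the': 1}, {'dog': 1}, {'barks': 1}]
# token_level_tags -> ['DET', 'NOUN', 'VERB']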
Example #9
def get_tag_trans_counts(trainfile):
    """compute a dict of counters for tag transitions

    :param trainfile: name of file containing training data
    :returns: dict, in which keys are tags, and values are counters of succeeding tags
    :rtype: dict

    """

    tot_counts = defaultdict(lambda: Counter())

    for words, tags in conll_seq_generator(trainfile):
        for index, tag in enumerate(tags):
            if index == 0:
                # transition from the start state into the first tag
                tot_counts[START_TAG].update([tag])
            if index == len(tags) - 1:
                # transition from the last tag into the end state; this is
                # deliberately not an elif, so a one-token sentence records
                # both its START and END transitions
                tot_counts[tag].update([END_TAG])
            else:
                tot_counts[tag].update([tags[index + 1]])

    return dict(tot_counts)
Example #10
def get_word_to_ix(input_file, max_size=100000):
    """
    creates a vocab that has the list of most frequent occuring words such that the size of the vocab <=max_size, 
    also adds an UNK token to the Vocab and then creates a dictionary that maps each word to a unique index, 
    :returns: vocab, dict
    vocab: list of words in the vocabulary
    dict: maps word to unique index
    """
    vocab_counter = Counter()
    for words, tags in conll_seq_generator(input_file):
        for word, tag in zip(words, tags):
            vocab_counter[word] += 1
    vocab = [word for word, val in vocab_counter.most_common(max_size - 1)]
    vocab.append(UNK)

    word_to_ix = {}
    ix = 0
    for word in vocab:
        word_to_ix[word] = ix
        ix += 1

    return vocab, word_to_ix
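A usage sketch; since the vocabulary always contains UNK, out-of-vocabulary lookups can fall back to its index (the example word is made up):

vocab, word_to_ix = get_word_to_ix(TRAIN_FILE, max_size=5000)
ix = word_to_ix.get('blargh', word_to_ix[UNK])  # OOV word maps to the UNK index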
Example #11
def get_tag_to_ix(input_file):
    """
    creates a dictionary that maps each tag (including the START_TAG and END_TAG to a unique index and vice-versa
    :returns: dict1, dict2
    dict1: maps tag to unique index
    dict2: maps each unique index to its own tag
    """
    tag_to_ix = {}
    for i, (words, tags) in enumerate(conll_seq_generator(input_file)):
        for tag in tags:
            if tag not in tag_to_ix:
                tag_to_ix[tag] = len(tag_to_ix)

    #adding START_TAG and END_TAG
    #if START_TAG not in tag_to_ix:
    #    tag_to_ix[START_TAG] = len(tag_to_ix)
    #if END_TAG not in tag_to_ix:
    #    tag_to_ix[END_TAG] = len(tag_to_ix)

    ix_to_tag = {v: k for k, v in tag_to_ix.items()}

    return tag_to_ix, ix_to_tag
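A round-trip sketch showing that the two returned dictionaries are inverses of each other:

tag_to_ix, ix_to_tag = get_tag_to_ix(TRAIN_FILE)
assert all(ix_to_tag[ix] == tag for tag, ix in tag_to_ix.items())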
Example #12
def get_tag_word_counts(trainfile):
    """
    Produce a Counter of occurences of word for each tag
    
    Parameters:
    trainfile: -- the filename to be passed as argument to conll_seq_generator
    :returns: -- a default dict of counters, where the keys are tags.
    """
    all_counters = defaultdict(lambda: Counter())

    for words, tags in conll_seq_generator(trainfile):
        for i in range(len(tags)):
            counter = all_counters.get(tags[i])
            if counter is None:
                counter = Counter()

            counter[words[i]] += 1
            all_counters[tags[i]] = counter


#             all_counters[tags[i]][words[i]] += 1

    return all_counters