Example #1
from collections import Counter, defaultdict

def get_tag_word_counts(trainfile):
    """
    Produce a Counter of word occurrences for each tag.

    :param trainfile: the filename to be passed as an argument to conll_seq_generator
    :returns: a defaultdict of Counters, where the keys are tags
    """
    all_counters = defaultdict(Counter)

    # Count each (tag, word) pair observed in the training data
    for words, tags in conll_seq_generator(trainfile):
        for word, tag in zip(words, tags):
            all_counters[tag][word] += 1

    return all_counters
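
A brief usage sketch (not from the source; the file name and tag are illustrative, and a CoNLL-format training file is assumed):

counts = get_tag_word_counts('train.conll')

# Five most frequent words observed with the NOUN tag
print(counts['NOUN'].most_common(5))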
Example #2
def setup():
    global word_to_ix, tag_to_ix, X_tr, Y_tr, model, embedding_dim

    # Build the word vocabulary and assign each tag a unique index
    vocab, word_to_ix = most_common.get_word_to_ix(TRAIN_FILE, max_size=6500)
    tag_to_ix = {}
    for words, tags in preprocessing.conll_seq_generator(TRAIN_FILE):
        for tag in tags:
            if tag not in tag_to_ix:
                tag_to_ix[tag] = len(tag_to_ix)

    torch.manual_seed(765)

    embedding_dim = 30
    hidden_dim = 30
    model = bilstm.BiLSTM(len(word_to_ix), tag_to_ix, embedding_dim, hidden_dim)

    # Collect the training sentences and their tag sequences
    X_tr = []
    Y_tr = []
    for words, tags in preprocessing.conll_seq_generator(TRAIN_FILE):
        X_tr.append(words)
        Y_tr.append(tags)
Example #3
def setup():
    global word_to_ix, tag_to_ix, X_tr, Y_tr, model, embedding_dim

    vocab, word_to_ix = most_common.get_word_to_ix(TRAIN_FILE, max_size=6500)

    # Collect the training sentences and their tag sequences
    X_tr = []
    Y_tr = []
    for words, tags in preprocessing.conll_seq_generator(TRAIN_FILE):
        X_tr.append(words)
        Y_tr.append(tags)

    torch.manual_seed(765)

    embedding_dim = 30
    hidden_dim = 30
    model = cbow.CBOW(len(vocab), embedding_dim)
Example #4
def get_tag_trans_counts(input_file):
    """Compute a dict of counters for tag transitions.

    :param input_file: name of file containing training data
    :returns: dict, in which keys are tags, and values are counters of succeeding tags
    :rtype: dict
    """
    tot_counts = defaultdict(Counter)

    for words, tags in conll_seq_generator(input_file):
        for index, tag in enumerate(tags):
            if index == 0:
                # The first tag in a sentence follows the START_TAG
                tot_counts[START_TAG].update([tag])
            if index == len(tags) - 1:
                # The last tag in a sentence transitions to the END_TAG
                tot_counts[tag].update([END_TAG])
            else:
                tot_counts[tag].update([tags[index + 1]])

    return dict(tot_counts)
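
A minimal sketch, not part of the source, of how these transition counts could be normalized into conditional probabilities P(next_tag | tag); the file name is illustrative:

trans_counts = get_tag_trans_counts('train.conll')

trans_probs = {}
for tag, counter in trans_counts.items():
    # Each counter becomes a distribution over successor tags
    total = sum(counter.values())
    trans_probs[tag] = {next_tag: count / total for next_tag, count in counter.items()}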
Example #5
def get_nb_weights(trainfile, smoothing):
    """
    The estimate_nb function assumes one label per document, whereas in POS tagging we have
    a label for each individual token. So, to compute the emission weights P(w|y) for a
    particular word and tag, we modify the input so that each token and its tag are treated
    as a one-word document and its label. This helper converts the dataset to token-level
    bag-of-words feature vectors and labels. The weights obtained here are used later as
    emission scores for the Viterbi tagger.

    :param trainfile: input file to obtain the nb_weights from
    :param smoothing: smoothing value for the naive Bayes weights
    :returns: nb_weights: naive Bayes weights
    """
    token_level_docs = []
    token_level_tags = []
    for words, tags in preprocessing.conll_seq_generator(trainfile):
        # Treat each token as a one-word "document" labeled with its tag
        token_level_docs += [{word: 1} for word in words]
        token_level_tags += tags
    nb_weights = estimate_nb(token_level_docs, token_level_tags, smoothing)

    return nb_weights
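
A worked example of the token-level conversion described in the docstring, using a made-up two-token sentence (the words and tags are illustrative):

words = ['the', 'dog']
tags = ['DET', 'NOUN']

token_level_docs = [{word: 1} for word in words]
token_level_tags = list(tags)

# token_level_docs == [{'the': 1}, {'dog': 1}]
# token_level_tags == ['DET', 'NOUN']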
Example #6
def get_word_to_ix(input_file, max_size=100000):
    """
    Creates a vocab containing the most frequently occurring words, such that the size of
    the vocab is <= max_size, and adds an UNK token to the vocab. Then creates a dictionary
    that maps each word to a unique index.

    :returns: vocab, word_to_ix
    vocab: list of words in the vocabulary
    word_to_ix: dict mapping each word to a unique index
    """
    vocab_counter = Counter()
    for words, tags in conll_seq_generator(input_file):
        for word in words:
            vocab_counter[word] += 1

    # Keep the max_size - 1 most frequent words, reserving one slot for UNK
    vocab = [word for word, count in vocab_counter.most_common(max_size - 1)]
    vocab.append(UNK)

    word_to_ix = {word: ix for ix, word in enumerate(vocab)}

    return vocab, word_to_ix
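
A short usage sketch (not from the source; the file name and sentence are illustrative, and UNK is assumed to be the module's unknown-word token):

vocab, word_to_ix = get_word_to_ix('train.conll', max_size=6500)

# Map a sentence to indices, falling back to UNK for out-of-vocabulary words
unk_ix = word_to_ix[UNK]
indices = [word_to_ix.get(word, unk_ix) for word in ['the', 'quixotic', 'dog']]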
Example #7
def get_tag_to_ix(input_file):
    """
    Creates a dictionary that maps each tag to a unique index, and a second dictionary
    for the reverse mapping.

    :returns: tag_to_ix, ix_to_tag
    tag_to_ix: dict mapping each tag to a unique index
    ix_to_tag: dict mapping each unique index back to its tag
    """
    tag_to_ix = {}
    for words, tags in conll_seq_generator(input_file):
        for tag in tags:
            if tag not in tag_to_ix:
                tag_to_ix[tag] = len(tag_to_ix)

    # Uncomment to also include START_TAG and END_TAG in the mapping:
    #if START_TAG not in tag_to_ix:
    #    tag_to_ix[START_TAG] = len(tag_to_ix)
    #if END_TAG not in tag_to_ix:
    #    tag_to_ix[END_TAG] = len(tag_to_ix)

    ix_to_tag = {v: k for k, v in tag_to_ix.items()}

    return tag_to_ix, ix_to_tag
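
A quick roundtrip check (not from the source; the file name is illustrative):

tag_to_ix, ix_to_tag = get_tag_to_ix('train.conll')

# The two mappings should be inverses of each other
assert all(ix_to_tag[ix] == tag for tag, ix in tag_to_ix.items())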