Пример #1
0
def augment_with_pretrained(dictionary, ext_emb_path, words):
    """
    Augment the dictionary with words that have a pretrained embedding.

    If `words` is None, every word found in the pretrained embedding file
    is added to the dictionary; otherwise only the entries of `words`
    (typically the words in the development and test sets) that can be
    matched to a pretrained embedding are added. A word matches when it
    appears in the file as-is, lowercased, or lowercased with every digit
    replaced by '0'. Added words get a count of 0; entries already in the
    dictionary are left untouched.

    Args:
        dictionary: word -> count dict; it is modified in place.
        ext_emb_path: path of a UTF-8 text file whose first
            whitespace-separated token on each line is a word.
        words: iterable of candidate words, or None.

    Returns:
        (dictionary, word_to_id, id_to_word), where the two mappings are
        whatever `create_mapping(dictionary)` returns.

    Raises:
        AssertionError: if `ext_emb_path` is not an existing file.
    """
    print('Loading pretrained embeddings from %s...' % ext_emb_path)
    assert os.path.isfile(ext_emb_path)

    # Collect the vocabulary of the embedding file. Blank lines have no
    # first token and are skipped; the `with` block closes the file.
    pretrained = set()
    with codecs.open(ext_emb_path, 'r', 'utf-8') as emb_file:
        for line in emb_file:
            fields = line.split()
            if fields:
                pretrained.add(fields[0])

    if words is None:
        # Add every pretrained word that is not known yet.
        for word in pretrained:
            if word not in dictionary:
                dictionary[word] = 0
    else:
        # Add only the given words to which an embedding can be assigned.
        for word in words:
            if word in dictionary:
                continue
            lowered = word.lower()
            if (word in pretrained
                    or lowered in pretrained
                    or re.sub(r'\d', '0', lowered) in pretrained):
                dictionary[word] = 0

    word_to_id, id_to_word = create_mapping(dictionary)
    return dictionary, word_to_id, id_to_word
Пример #2
0
def char_mapping(sentences):
    """
    Create a dictionary and mapping of characters.

    The first field of every token (``w[0]``) is concatenated per sentence
    into one string; the list of strings is passed to `create_dico` and the
    resulting dictionary to `create_mapping` (both defined elsewhere).

    Args:
        sentences: iterable of sentences, each an iterable of tokens whose
            first field is a string.

    Returns:
        (dico, char_to_id, id_to_char)
    """
    chars = ["".join([w[0] for w in s]) for s in sentences]
    dico = create_dico(chars)
    char_to_id, id_to_char = create_mapping(dico)
    # Parenthesized single-argument print is valid on Python 2 and 3.
    print("Found %i unique characters" % len(dico))
    return dico, char_to_id, id_to_char
Пример #3
0
def tag_mapping(sentences):
    """
    Create a dictionary and a mapping of tags.

    The last field of every token (``word[-1]``) is taken as its tag. The
    per-sentence tag lists are passed to `create_dico` and the resulting
    dictionary to `create_mapping` (both defined elsewhere).

    Args:
        sentences: iterable of sentences, each an iterable of indexable
            tokens.

    Returns:
        (dico, tag_to_id, id_to_tag)
    """
    tags = [[word[-1] for word in s] for s in sentences]
    dico = create_dico(tags)
    tag_to_id, id_to_tag = create_mapping(dico)
    # Parenthesized single-argument print is valid on Python 2 and 3.
    print("Found %i unique named entity tags" % len(dico))
    return dico, tag_to_id, id_to_tag
Пример #4
0
def cluster_mapping(sentences):
    """
    Create a dictionary and mapping of clusters.

    Field 4 of every token is taken as its cluster. The per-sentence lists
    are passed to `create_dico` and the resulting dictionary to
    `create_mapping` (both defined elsewhere).

    Args:
        sentences: iterable of sentences, each an iterable of tokens with
            at least five fields.

    Returns:
        (dico, cluster_to_id, id_to_cluster)
    """
    clusters = [[word[4] for word in s] for s in sentences]
    dico = create_dico(clusters)
    cluster_to_id, id_to_cluster = create_mapping(dico)
    # Parenthesized single-argument print is valid on Python 2 and 3.
    print("Found %i clusters" % len(dico))
    return dico, cluster_to_id, id_to_cluster
Пример #5
0
def POStag_mapping(sentences):
    """
    Create a dictionary and mapping of POS tags.

    Field 1 of every token is taken as its POS tag. The per-sentence lists
    are passed to `create_dico` and the resulting dictionary to
    `create_mapping` (both defined elsewhere).

    Args:
        sentences: iterable of sentences, each an iterable of tokens with
            at least two fields.

    Returns:
        (dico, POStag_to_id, id_to_POStag)
    """
    POStags = [[word[1] for word in s] for s in sentences]
    dico = create_dico(POStags)
    POStag_to_id, id_to_POStag = create_mapping(dico)
    # Parenthesized single-argument print is valid on Python 2 and 3.
    print("Found %i POS tags" % len(dico))
    return dico, POStag_to_id, id_to_POStag
Пример #6
0
def parse_sentence(sentence, tokenizer, encoder, use_cuda=True):
    """Implement the match part of MAMA.

    Runs `encoder` on the mapped sentence, turns the returned attention
    into a graph over merged tokens, searches that graph for every ordered
    pair of distinct noun chunks and filters the candidates found.

    Args:
        sentence: input forwarded to `create_mapping`.
        tokenizer: forwarded to `create_mapping`; when the text of
            ``str(tokenizer.__str__)`` contains 'GPT2', trimming is
            switched off for `process_matrix`.
        encoder: called as ``encoder(**inputs, output_attentions=True)``;
            element 2 of its output is used as the attention.
        use_cuda: when true, every entry of the inputs is moved with
            ``.cuda()``; the flag is also forwarded to `process_matrix`.

    Returns:
        List of the non-empty results of `filter_relation_sets`, in the
        order the worker pool yields them.
    """
    tokenizer_name = str(tokenizer.__str__)
    trim = 'GPT2' not in tokenizer_name

    inputs, tokenid2word_mapping, token2id, noun_chunks = create_mapping(
        sentence, return_pt=True, tokenizer=tokenizer)

    with torch.no_grad():
        if use_cuda:
            for name in inputs.keys():
                inputs[name] = inputs[name].cuda()
        outputs = encoder(**inputs, output_attentions=True)

    # Original note: use average of last layer attention
    # (page 6, section 3.1.2).
    attention = process_matrix(outputs[2],
                               avg_head=True,
                               trim=trim,
                               use_cuda=use_cuda)

    attn_graph = build_graph(
        compress_attention(attention, tokenid2word_mapping))

    # Every ordered pair of distinct noun chunks, as token ids.
    tail_head_pairs = [
        (token2id[head], token2id[tail])
        for head in noun_chunks
        for tail in noun_chunks
        if head != tail
    ]

    # Ids of the noun chunks themselves, handed to `bfs` as a black list.
    black_list_relation = set(token2id[chunk] for chunk in noun_chunks)

    id2token = dict((idx, token) for token, idx in token2id.items())

    all_relation_pairs = []
    with Pool(10) as pool:
        search_args = [(
            first,
            second,
            attn_graph,
            max(tokenid2word_mapping),
            black_list_relation,
        ) for first, second in tail_head_pairs]
        for found in pool.imap_unordered(bfs, search_args):
            if len(found):
                all_relation_pairs.extend(
                    [(candidate, id2token) for candidate in found])

    triplet_text = []
    with Pool(10) as pool:
        filtered = pool.imap_unordered(filter_relation_sets,
                                       all_relation_pairs)
        for triplet in filtered:
            if len(triplet) > 0:
                triplet_text.append(triplet)
    return triplet_text
Пример #7
0
def chunk_mapping(sentences, col=2):
    """
    Create a dictionary and a mapping of chunk tags.

    Field `col` of every token is taken as its chunk tag. The per-sentence
    lists are passed to `create_dico` and the resulting dictionary to
    `create_mapping` (both defined elsewhere).

    Args:
        sentences: iterable of sentences, each an iterable of indexable
            tokens.
        col: index of the chunk-tag field inside a token (default 2).

    Returns:
        (dico, chunk_to_id, id_to_chunk)
    """
    tags = [[word[col] for word in s] for s in sentences]
    dico = create_dico(tags)
    chunk_to_id, id_to_chunk = create_mapping(dico)
    # Parenthesized single-argument print is valid on Python 2 and 3.
    print("Found %i unique Chunk tags" % len(dico))
    return dico, chunk_to_id, id_to_chunk
def tag_mapping(sentences):
    """
    Build the tag dictionary and the two tag/id lookup tables.

    The last field of each token is its tag. The nested tag lists go
    through `create_dico`, and the dictionary it returns goes through
    `create_mapping`.

    Returns:
        (dico, tag_to_id, id_to_tag)
    """
    tag_rows = []
    for sentence in sentences:
        tag_rows.append([token[-1] for token in sentence])

    tag_counts = create_dico(tag_rows)
    to_id, from_id = create_mapping(tag_counts)
    print("Found %i unique named entity tags" % (len(tag_counts)))
    return tag_counts, to_id, from_id
Пример #9
0
def char_mapping(sentences):
    """
    Build the character dictionary and the two character/id lookup tables.

    For each sentence the first fields of its tokens are glued into one
    string; the list of these strings goes through `create_dico`, and the
    dictionary it returns goes through `create_mapping`.

    Returns:
        (dico, char_to_id, id_to_char)
    """
    joined_sentences = []
    for sentence in sentences:
        first_fields = [token[0] for token in sentence]
        joined_sentences.append("".join(first_fields))

    char_counts = create_dico(joined_sentences)
    to_id, from_id = create_mapping(char_counts)
    print("Found %i unique characters" % len(char_counts))
    return char_counts, to_id, from_id
Пример #10
0
def dep_mapping(sentences):
    """
    Create a dictionary and a mapping of dep tags.

    Field 4 of every token is taken as its dependency tag. The
    per-sentence lists are passed to `create_dico` and the resulting
    dictionary to `create_mapping` (both defined elsewhere). The whole
    dictionary is printed before the mapping is built.

    Args:
        sentences: iterable of sentences, each an iterable of tokens with
            at least five fields.

    Returns:
        (dico, tag_to_id, id_to_tag)
    """
    tags = [[word[4] for word in s] for s in sentences]
    dico = create_dico(tags)
    # Parenthesized single-argument print is valid on Python 2 and 3.
    print(dico)
    tag_to_id, id_to_tag = create_mapping(dico)
    print("Found %i unique Dependency Role tags" % len(dico))
    return dico, tag_to_id, id_to_tag
Пример #11
0
def mor_mapping(sentences):
    """
    Build the dictionary and lookup tables for field 2 of every token.

    The per-sentence value lists go through `create_dico`; an '<UNK>'
    entry is added to the result before it goes through `create_mapping`.

    Returns:
        (dico, mor_to_id, id_to_mor)
    """
    value_rows = []
    for sentence in sentences:
        value_rows.append([token[2] for token in sentence])

    counts = create_dico(value_rows)
    # Large artificial count for the unknown-item entry.
    counts['<UNK>'] = 10000000
    to_id, from_id = create_mapping(counts)
    print("Found %i unique causality pos" % len(counts))
    return counts, to_id, from_id
Пример #12
0
def dep_verb_mapping(sentences):
    """
    Create a dictionary and a mapping of verb dependency words.

    For every token, field 4 is split on '|' and its first part is kept.
    The per-sentence lists are passed to `create_dico`; an '<UNK>' entry
    is added before the dictionary is passed to `create_mapping`.

    Args:
        sentences: iterable of sentences, each an iterable of tokens whose
            field 4 is a string.

    Returns:
        (dico, pos_to_id, id_to_pos)
    """
    tags = [[word[4].split('|')[0] for word in s] for s in sentences]
    dico = create_dico(tags)
    # Large artificial count for the unknown-item entry.
    dico['<UNK>'] = 10000000
    pos_to_id, id_to_pos = create_mapping(dico)
    # Parenthesized single-argument print is valid on Python 2 and 3.
    print("Found %i unique verb dep words" % len(dico))
    return dico, pos_to_id, id_to_pos
Пример #13
0
def word_mapping(sentences, lower, vocabulary_size, pre_train=None):
    """
    Create a dictionary and a mapping of words.

    Args:
        sentences: iterable of sentences, each an iterable of tokens whose
            first field (``x[0]``) is the word.
        lower: if true, words are lowercased before being collected.
        vocabulary_size: forwarded to `create_mapping`.
        pre_train: optional argument for `read_pre_training`; when truthy,
            the count of every word missing from what that function
            returns is reset to 0.

    Returns:
        (dico, word_to_id, id_to_word); `dico` includes the '<UNK>' entry
        and the mappings are built after that entry was added.
    """
    words = [[x[0].lower() if lower else x[0] for x in s] for s in sentences]
    dico = create_dico(words)
    # NOTE(review): this first mapping is overwritten below; the call is
    # kept in case create_mapping has side effects -- confirm and drop.
    word_to_id, id_to_word = create_mapping(dico, vocabulary_size)
    print("Found %i unique words (%i in total)" %
          (len(dico), sum(len(x) for x in words)))

    if pre_train:
        emb_dictionary = read_pre_training(pre_train)
        # Iterate the dict itself: `iterkeys()` only exists on Python 2.
        # Only values of existing keys change, so iterating is safe.
        for word in dico:
            if word not in emb_dictionary:
                dico[word] = 0

    # Large artificial count for the unknown-word entry.
    dico['<UNK>'] = 10000000
    word_to_id, id_to_word = create_mapping(dico, vocabulary_size)
    return dico, word_to_id, id_to_word
Пример #14
0
def pos_mapping(sentences, position=1):
    """
    Create a dictionary and a mapping of POS tags.

    Field `position` of every token is taken as its tag. The per-sentence
    lists are passed to `create_dico` and the resulting dictionary to
    `create_mapping` (both defined elsewhere).

    Args:
        sentences: iterable of sentences, each an iterable of indexable
            tokens.
        position: index of the tag field inside a token (default 1).

    Returns:
        (dico, tag_to_id, id_to_tag)
    """
    # A disabled variant in the original kept only the first character of
    # the field (``word[position][0]``).
    tags = [[word[position] for word in s] for s in sentences]
    dico = create_dico(tags)
    tag_to_id, id_to_tag = create_mapping(dico)
    # Parenthesized single-argument print is valid on Python 2 and 3.
    print("Found %i unique named entity tags" % len(dico))
    return dico, tag_to_id, id_to_tag
Пример #15
0
def loc_mapping(sentences):
    """
    Build the location-label dictionary and its two lookup tables.

    Field 6 of every token is collected per sentence and the nested list
    goes through `create_dico`; an '<UNK>' entry is added to the result
    before it goes through `create_mapping`.

    Returns:
        (dico, loc_to_id, id_to_loc)
    """
    label_rows = []
    for sentence in sentences:
        label_rows.append([token[6] for token in sentence])

    counts = create_dico(label_rows)
    # Large artificial count for the unknown-label entry.
    counts['<UNK>'] = 10000000
    to_id, from_id = create_mapping(counts)

    total = sum(len(row) for row in label_rows)
    print("Found %i unique location (%i in total)" % (len(counts), total))
    return counts, to_id, from_id
Пример #16
0
def char_mapping(sentences):
    """
    Create a dictionary and a mapping of chars.

    The first field of every token (``x[0]``) is collected per sentence.
    The nested list is passed to `create_dico`; an '<UNK>' entry is added
    before the dictionary is passed to `create_mapping`.

    Args:
        sentences: iterable of sentences, each an iterable of indexable
            tokens.

    Returns:
        (dico, char_to_id, id_to_char)
    """
    chars = [[x[0] for x in s] for s in sentences]
    dico = create_dico(chars)
    # Large artificial count for the unknown-char entry.
    dico['<UNK>'] = 10000000
    char_to_id, id_to_char = create_mapping(dico)
    # Parenthesized single-argument print is valid on Python 2 and 3.
    print("Found %i unique chars (%i in total)" % (
        len(dico), sum(len(x) for x in chars)))
    return dico, char_to_id, id_to_char
Пример #17
0
def pos_mapping(sentences):
    """
    Create a dictionary and a mapping of pos tags.

    Field 1 of every token is taken as its POS tag. The per-sentence lists
    are passed to `create_dico`; an '<UNKPOS>' entry carrying the largest
    native integer is added before the dictionary is passed to
    `create_mapping`.

    Args:
        sentences: iterable of sentences, each an iterable of tokens with
            at least two fields.

    Returns:
        (dico, pos_tag_to_id, id_to_pos_tag)
    """
    pos_tags = [[x[1] for x in s] for s in sentences]
    dico = create_dico(pos_tags)
    # `sys.maxint` only exists on Python 2; use `sys.maxsize` elsewhere.
    dico['<UNKPOS>'] = getattr(sys, 'maxint', sys.maxsize)
    pos_tag_to_id, id_to_pos_tag = create_mapping(dico)
    # Parenthesized single-argument print is valid on Python 2 and 3.
    print("Found %i unique pos tags (%i in total)" % (
        len(dico), sum(len(x) for x in pos_tags)))
    return dico, pos_tag_to_id, id_to_pos_tag
Пример #18
0
def pos_mapping(sentences):
    """
    Create a dictionary and a mapping of POS tags.

    Field 1 of every token is collected per sentence. The nested list is
    passed to `create_dico`; an '<UNK>' entry is added before the
    dictionary is passed to `create_mapping`.

    Args:
        sentences: iterable of sentences, each an iterable of tokens with
            at least two fields.

    Returns:
        (dico, pos_to_id, id_to_pos)
    """
    pos = [[x[1] for x in s] for s in sentences]
    dico = create_dico(pos)
    # Large artificial count for the unknown-tag entry.
    dico['<UNK>'] = 10000000
    pos_to_id, id_to_pos = create_mapping(dico)
    # Parenthesized single-argument print is valid on Python 2 and 3.
    # NOTE(review): the message says "words" although POS fields are
    # counted; the text is left unchanged in case its output is parsed.
    print("Found %i unique words (%i in total)" % (
        len(dico), sum(len(x) for x in pos)))
    return dico, pos_to_id, id_to_pos
Пример #19
0
def char_mapping(sentences):
    """
    Create a dictionary and mapping of characters.

    The first field of every token (``w[0]``) is concatenated per sentence
    into one string; the list of strings is passed to `create_dico` and the
    resulting dictionary to `create_mapping`. The character -> id mapping
    is then wrapped so that unknown characters resolve to id 0.

    Args:
        sentences: iterable of sentences, each an iterable of tokens whose
            first field is a string.

    Returns:
        (dico, char_to_id, id_to_char); `char_to_id` is a `defaultdict`,
        so looking up a missing character also stores it with id 0.
    """
    chars = ["".join([w[0] for w in s]) for s in sentences]
    dico = create_dico(chars)
    char_to_id, id_to_char = create_mapping(dico)
    # SWM: replace unseen characters with special symbol (hopefully seen in
    # training)
    char_to_id = defaultdict(lambda: 0, char_to_id)
    # Parenthesized single-argument print is valid on Python 2 and 3.
    print("Found %i unique characters" % len(dico))
    return dico, char_to_id, id_to_char
Пример #20
0
def semroles_mapping(sentences):
    """
    Build the semantic-role dictionary and its two lookup tables.

    Field 4 of every token is collected per sentence and the nested list
    goes through `create_dico`; an '<UNK>' entry is added to the result
    before it goes through `create_mapping`.

    Returns:
        (dico, semroles_to_id, id_to_semroles)
    """
    role_rows = []
    for sentence in sentences:
        role_rows.append([token[4] for token in sentence])

    counts = create_dico(role_rows)
    # Large artificial count for the unknown-role entry.
    counts['<UNK>'] = 10000000
    to_id, from_id = create_mapping(counts)

    total = sum(len(row) for row in role_rows)
    print("Found %i unique semroles (%i in total)" % (len(counts), total))
    return counts, to_id, from_id
Пример #21
0
def depNode_mapping(sentences):
    """
    Build the dependency-node dictionary and its two lookup tables.

    Field 3 of every token is collected per sentence and the nested list
    goes through `create_dico`; an '<UNK>' entry is added to the result
    before it goes through `create_mapping`.

    Returns:
        (dico, depNode_to_id, id_to_depNode)
    """
    node_rows = []
    for sentence in sentences:
        node_rows.append([token[3] for token in sentence])

    counts = create_dico(node_rows)
    # Large artificial count for the unknown-node entry.
    counts['<UNK>'] = 10000000
    to_id, from_id = create_mapping(counts)

    total = sum(len(row) for row in node_rows)
    print("Found %i unique depNode (%i in total)" % (len(counts), total))
    return counts, to_id, from_id
Пример #22
0
def conNode_mapping(sentences):
    """
    Build the chunk-label (conNode) dictionary and its two lookup tables.

    Field 2 of every token is collected per sentence and the nested list
    goes through `create_dico`; an '<UNK>' entry is added to the result
    before it goes through `create_mapping`.

    Returns:
        (dico, conNode_to_id, id_to_conNode)
    """
    node_rows = []
    for sentence in sentences:
        node_rows.append([token[2] for token in sentence])

    counts = create_dico(node_rows)
    # Large artificial count for the unknown-node entry.
    counts['<UNK>'] = 10000000
    to_id, from_id = create_mapping(counts)

    total = sum(len(row) for row in node_rows)
    print("Found %i unique conNode (%i in total)" % (len(counts), total))
    return counts, to_id, from_id
Пример #23
0
def char_mapping(sentences):
    """
    Build the character dictionary and the two character/id lookup tables.

    For each sentence the first fields of its tokens are glued into one
    string; the list of strings goes through `create_dico`, a '<PAD>'
    entry is added, and the dictionary goes through `create_mapping`.

    Returns:
        (dico, char_to_id, id_to_char)
    """
    joined_sentences = []
    for sentence in sentences:
        joined_sentences.append(''.join([token[0] for token in sentence]))

    counts = create_dico(joined_sentences)
    # Large artificial count for the padding entry.
    counts['<PAD>'] = 1000000

    # NOTE(review): the first value returned by create_mapping is treated
    # as id -> char here, the reverse of how the other mapping helpers
    # unpack it -- confirm against this project's create_mapping.
    from_id, to_id = create_mapping(counts)
    return counts, to_id, from_id
Пример #24
0
 def _get_loc_var(self):
     """Rebuild ``self.loc_var_map`` from the headers of ``self.series_list``.

     For every series, the ``parameterId`` and ``locationId`` texts are
     read from its ``header`` child (tag names are prefixed with
     ``self.default_ns``) and the entry for the location id is set to
     ``create_mapping(loc_id, param, self.loc_var_map)``.
     """
     self.loc_var_map = {}
     for entry in self.series_list:
         head = entry.find(self.default_ns + "header")
         parameter_id = head.findtext(self.default_ns + "parameterId")
         location_id = head.findtext(self.default_ns + "locationId")
         # Presumably collects the parameters seen for each location, as
         # the append-to-list code this call replaced did -- confirm
         # against create_mapping.
         updated = create_mapping(location_id, parameter_id,
                                  self.loc_var_map)
         self.loc_var_map[location_id] = updated
Пример #25
0
 def _get_var_loc(self):
     """Rebuild ``self.var_loc_map`` from the headers of ``self.series_list``.

     For every series, the ``parameterId`` and ``locationId`` texts are
     read from its ``header`` child (tag names are prefixed with
     ``self.default_ns``) and the entry for the parameter id is set to
     ``create_mapping(param, loc_id, self.var_loc_map)``.
     """
     self.var_loc_map = {}
     for entry in self.series_list:
         head = entry.find(self.default_ns + "header")
         parameter_id = head.findtext(self.default_ns + "parameterId")
         location_id = head.findtext(self.default_ns + "locationId")
         # Presumably collects the locations seen for each parameter, as
         # the append-to-list code this call replaced did -- confirm
         # against create_mapping.
         updated = create_mapping(parameter_id, location_id,
                                  self.var_loc_map)
         self.var_loc_map[parameter_id] = updated
Пример #26
0
def word_mapping(sentences, lower):
    """
    Build the word dictionary and the two word/id lookup tables.

    The first field of every token is the word; it is lowercased when
    `lower` is true. The nested word lists go through `create_dico`; an
    '<UNK>' entry is added to the result before it goes through
    `create_mapping`.

    Returns:
        (dico, word_to_id, id_to_word)
    """
    word_rows = []
    for sentence in sentences:
        if lower:
            word_rows.append([token[0].lower() for token in sentence])
        else:
            word_rows.append([token[0] for token in sentence])

    counts = create_dico(word_rows)
    # Large artificial count for the unknown-word entry.
    counts['<UNK>'] = 10000000
    to_id, from_id = create_mapping(counts)

    total = sum(len(row) for row in word_rows)
    print("Found %i unique words (%i in total)" % (len(counts), total))
    return counts, to_id, from_id
Пример #27
0
def tag_mapping(sentences):
    """
    Build the tag dictionary and the two tag/id lookup tables.

    The last field of every token is counted with a `Counter`; entries for
    `model.START_TAG` (-1) and `model.STOP_TAG` (-2) are added before the
    dictionary goes through `create_mapping`.

    Returns:
        (dico, tag_to_id, id_to_tag)
    """
    tag_counter = Counter()
    for sentence in sentences:
        tag_counter.update(token[-1] for token in sentence)

    counts = dict(tag_counter)
    # The two boundary tags get negative counts.
    counts[model.START_TAG] = -1
    counts[model.STOP_TAG] = -2

    to_id, from_id = create_mapping(counts)
    print("Found %i unique named entity tags" % len(counts))
    return counts, to_id, from_id
Пример #28
0
def pos_mapping(sentences, col=1):
    """
    Create a dictionary and a mapping of pos tags.

    Field `col` of every token is taken as its POS tag. The per-sentence
    lists are passed to `create_dico` and the resulting dictionary to
    `create_mapping` (both defined elsewhere).

    Args:
        sentences: iterable of sentences, each an iterable of indexable
            tokens.
        col: index of the POS field inside a token (default 1).

    Returns:
        (dico, pos_to_id, id_to_pos)
    """
    tags = [[word[col] for word in s] for s in sentences]
    dico = create_dico(tags)
    pos_to_id, id_to_pos = create_mapping(dico)

    # Parenthesized single-argument print is valid on Python 2 and 3.
    print("Found %i unique POS tags" % len(dico))

    return dico, pos_to_id, id_to_pos
def word_mapping(sentences, vocabulary_size, pre_train=None):
    """
    Create a dictionary and a mapping of words.

    Args:
        sentences: iterable of sentences, each an iterable of tokens whose
            first field (``x[0]``) is the word.
        vocabulary_size: forwarded to `create_mapping`.
        pre_train: optional argument for `read_pre_training`; when truthy,
            the count of every word missing from what that function
            returns is reset to 0.

    Returns:
        (dico, word_to_id, id_to_word); `dico` includes the '<UNK>' entry
        and the mappings are built after that entry was added.
    """
    words = [[x[0] for x in s] for s in sentences]
    dico = create_dico(words)
    # NOTE(review): this first mapping is overwritten below; the call is
    # kept in case create_mapping has side effects -- confirm and drop.
    word_to_id, id_to_word = create_mapping(dico, vocabulary_size)
    print("Found %i unique words (%i in total)" %
          (len(dico), sum(len(x) for x in words)))

    if pre_train:
        emb_dictionary = read_pre_training(pre_train)
        # Iterate the dict itself: `iterkeys()` only exists on Python 2.
        # Only values of existing keys change, so iterating is safe.
        for word in dico:
            if word not in emb_dictionary:
                dico[word] = 0

    # Large artificial count for the unknown-word entry.
    dico['<UNK>'] = 10000000
    word_to_id, id_to_word = create_mapping(dico, vocabulary_size)
    return dico, word_to_id, id_to_word
Пример #30
0
def head_mapping(sentences):
    """
    Create a dictionary and a mapping of head tags.

    Field 3 of every token is taken as its head tag. The per-sentence
    lists are passed to `create_dico`; a 'MAX' entry is added and the
    whole dictionary is printed before it is passed to `create_mapping`.

    Args:
        sentences: iterable of sentences, each an iterable of tokens with
            at least four fields.

    Returns:
        (dico, tag_to_id, id_to_tag)
    """
    tags = [[word[3] for word in s] for s in sentences]
    dico = create_dico(tags)
    # Large artificial count for the extra 'MAX' entry.
    dico['MAX'] = 10000000
    # Parenthesized single-argument print is valid on Python 2 and 3.
    print(dico)
    tag_to_id, id_to_tag = create_mapping(dico)
    print("Found %i unique Head index tags" % len(dico))
    return dico, tag_to_id, id_to_tag
Пример #31
0
def word_mapping(sentences, lower):
    """
    Build the word dictionary and the two word/id lookup tables.

    The first field of every token is the word; it is lowercased when
    `lower` is true. The nested word lists go through `create_dico`; an
    '<UNK>' entry is added to the result before it goes through
    `create_mapping`.

    Returns:
        (dico, word_to_id, id_to_word)
    """
    word_rows = []
    for sentence in sentences:
        row = []
        for token in sentence:
            row.append(token[0].lower() if lower else token[0])
        word_rows.append(row)

    counts = create_dico(word_rows)
    # Large artificial count for the unknown-word entry.
    counts['<UNK>'] = 10000000
    to_id, from_id = create_mapping(counts)

    total = sum(len(row) for row in word_rows)
    print("Found %i unique words (%i in total)" % (len(counts), total))
    return counts, to_id, from_id
Пример #32
0
def char_mapping(sentences):
    """
    Build the character dictionary and the two character/id lookup tables.

    The first fields of all tokens of all sentences are glued into one
    string whose characters are counted with a `Counter`; '<PAD>' and
    '<UNK>' entries are added before the dictionary goes through
    `create_mapping`.

    Returns:
        (dico, char_to_id, id_to_char)
    """
    pieces = []
    for sentence in sentences:
        for token in sentence:
            pieces.append(token[0])
    all_text = ''.join(pieces)

    counts = dict(Counter(all_text))
    # Large artificial counts for the two special entries; '<PAD>' gets
    # the larger one.
    counts['<PAD>'] = 10000001
    counts['<UNK>'] = 10000000

    to_id, from_id = create_mapping(counts)
    print("Found %i unique characters" % len(counts))
    return counts, to_id, from_id
def tag_mapping(data_path, data_type):
    """
    Build the tag dictionary and lookup tables from a labels file.

    The file ``data_path + data_type + "_labels.txt"`` is read line by
    line; each line is split on single spaces and every field except the
    last one is kept. The rows go through `create_dico`; entries for
    `model.START_TAG` (-1) and `model.STOP_TAG` (-2) are added before the
    dictionary goes through `create_mapping`.

    Returns:
        (dico, tag_to_id, id_to_tag)
    """
    tag_rows = []
    with open(data_path+data_type+"_labels.txt", "r") as label_file:
        for line in label_file:
            fields = line.split(" ")
            tag_rows.append(fields[:-1])

    counts = create_dico(tag_rows)
    # The two boundary tags get negative counts.
    counts[model.START_TAG] = -1
    counts[model.STOP_TAG] = -2

    to_id, from_id = create_mapping(counts)
    print("Found %i unique named entity tags" % len(counts))
    return counts, to_id, from_id
Пример #34
0
def tag_mapping(sentences):
    """
    Build the tag dictionary and the two tag/id lookup tables.

    The last field of every token is its tag. The nested tag lists go
    through `create_dico`; entries for `model.START_TAG` (-1) and
    `model.STOP_TAG` (-2) are added before the dictionary goes through
    `create_mapping`.

    Returns:
        (dico, tag_to_id, id_to_tag)
    """
    tag_rows = []
    for sentence in sentences:
        tag_rows.append([token[-1] for token in sentence])

    counts = create_dico(tag_rows)
    # The two boundary tags get negative counts.
    counts[model.START_TAG] = -1
    counts[model.STOP_TAG] = -2

    # NOTE(review): the first value returned by create_mapping is treated
    # as id -> tag here, the reverse of how the other mapping helpers
    # unpack it -- confirm against this project's create_mapping.
    from_id, to_id = create_mapping(counts)
    return counts, to_id, from_id
Пример #35
0
def tag_mapping(sentences):
    """
    Build the tag dictionary and the two tag/id lookup tables.

    The last field of every item of every sentence is its tag. The nested
    tag lists go through `create_dico`; '[SEP]' and '[CLS]' entries are
    added before the dictionary goes through `create_mapping`.

    Returns:
        (dico, tag_to_id, id_to_tag)
    """
    tag_rows = []
    for sentence in sentences:
        tag_rows.append([item[-1] for item in sentence])

    counts = create_dico(tag_rows)
    # Each value is derived from the dictionary size at the moment of the
    # assignment, so the second one sees any growth caused by the first.
    counts['[SEP]'] = len(counts) + 1
    counts['[CLS]'] = len(counts) + 2

    to_id, from_id = create_mapping(counts)
    print("Found %i unique named entity tags" % len(counts))
    return counts, to_id, from_id