def augment_with_pretrained(dictionary, ext_emb_path, words):
    """
    Augment the dictionary with words that have a pretrained embedding.

    If `words` is None, we add every word that has a pretrained embedding
    to the dictionary; otherwise, we only add the words that are given by
    `words` (typically the words in the development and test sets).

    Returns (dictionary, word_to_id, id_to_word).
    """
    print('Loading pretrained embeddings from %s...' % ext_emb_path)
    assert os.path.isfile(ext_emb_path)

    # Load pretrained vocabulary from file (first token of each line).
    # BUG FIX: the original filtered on `len(ext_emb_path) > 0` (always
    # true), so an empty line would crash on the `[0]` index; the filter
    # must skip empty lines instead.
    pretrained = set(
        line.rstrip().split()[0].strip()
        for line in codecs.open(ext_emb_path, 'r', 'utf-8')
        if len(line.rstrip()) > 0
    )

    # We either add every word in the pretrained file,
    # or only words given in the `words` list to which
    # we can assign a pretrained embedding.
    if words is None:
        for word in pretrained:
            if word not in dictionary:
                dictionary[word] = 0
    else:
        for word in words:
            # Accept exact, lowercased, or digit-normalized matches.
            if any(x in pretrained for x in [
                word,
                word.lower(),
                re.sub(r'\d', '0', word.lower())  # raw string: avoid invalid-escape warning
            ]) and word not in dictionary:
                dictionary[word] = 0

    word_to_id, id_to_word = create_mapping(dictionary)
    return dictionary, word_to_id, id_to_word
def char_mapping(sentences): """ Create a dictionary and mapping of characters, sorted by frequency. """ chars = ["".join([w[0] for w in s]) for s in sentences] dico = create_dico(chars) char_to_id, id_to_char = create_mapping(dico) print "Found %i unique characters" % len(dico) return dico, char_to_id, id_to_char
def tag_mapping(sentences): """ Create a dictionary and a mapping of tags, sorted by frequency. """ tags = [[word[-1] for word in s] for s in sentences] dico = create_dico(tags) tag_to_id, id_to_tag = create_mapping(dico) print "Found %i unique named entity tags" % len(dico) return dico, tag_to_id, id_to_tag
def cluster_mapping(sentences):
    """
    Create a dictionary and mapping of clusters, sorted by frequency.

    The cluster label is field 4 of each token.
    Returns (frequency dict, cluster_to_id, id_to_cluster).
    """
    clusters = [[word[4] for word in s] for s in sentences]
    dico = create_dico(clusters)
    cluster_to_id, id_to_cluster = create_mapping(dico)
    # Fixed: Python 2 print statement -> print() call.
    print("Found %i clusters" % len(dico))
    return dico, cluster_to_id, id_to_cluster
def POStag_mapping(sentences):
    """
    Create a dictionary and mapping of POS tags, sorted by frequency.

    The POS tag is field 1 of each token.
    Returns (frequency dict, POStag_to_id, id_to_POStag).
    """
    POStags = [[word[1] for word in s] for s in sentences]
    dico = create_dico(POStags)
    POStag_to_id, id_to_POStag = create_mapping(dico)
    # Fixed: Python 2 print statement -> print() call.
    print("Found %i POS tags" % len(dico))
    return dico, POStag_to_id, id_to_POStag
def parse_sentence(sentence, tokenizer, encoder, use_cuda=True):
    '''Implement the match part of MAMA: extract candidate
    (head, relation, tail) triplets from one sentence using the
    encoder's attention matrices.

    NOTE(review): relies on project helpers (create_mapping,
    process_matrix, compress_attention, build_graph, bfs,
    filter_relation_sets) whose contracts are not visible here.
    Returns the list of triplets surviving filter_relation_sets.
    '''
    # str(tokenizer.__str__) stringifies the bound method; the resulting
    # text still contains the tokenizer class name, checked for 'GPT2' below.
    tokenizer_name = str(tokenizer.__str__)
    inputs, tokenid2word_mapping, token2id, noun_chunks = create_mapping(sentence, return_pt=True, tokenizer=tokenizer)
    with torch.no_grad():
        if use_cuda:
            # Move every input tensor to the GPU before encoding.
            for key in inputs.keys():
                inputs[key] = inputs[key].cuda()
        outputs = encoder(**inputs, output_attentions=True)
    # GPT2-style tokenizers skip the trimming step in process_matrix.
    trim = True
    if 'GPT2' in tokenizer_name:
        trim = False
    '''
    Use average of last layer attention : page 6, section 3.1.2
    '''
    attention = process_matrix(outputs[2], avg_head=True, trim=trim, use_cuda=use_cuda)
    merged_attention = compress_attention(attention, tokenid2word_mapping)
    attn_graph = build_graph(merged_attention)
    # Candidate pairs: all ordered pairs of distinct noun chunks.
    tail_head_pairs = []
    for head in noun_chunks:
        for tail in noun_chunks:
            if head != tail:
                tail_head_pairs.append((token2id[head], token2id[tail]))
    # Noun-chunk ids themselves are excluded as relation tokens.
    black_list_relation = set([token2id[n] for n in noun_chunks])
    all_relation_pairs = []
    id2token = {value: key for key, value in token2id.items()}
    # Search the attention graph for each pair in a worker pool.
    with Pool(10) as pool:
        params = [(pair[0], pair[1], attn_graph, max(tokenid2word_mapping), black_list_relation,) for pair in tail_head_pairs]
        for output in pool.imap_unordered(bfs, params):
            if len(output):
                # Pair each raw path with the id->token table for decoding.
                all_relation_pairs += [(o, id2token) for o in output]
    # Filter/decode the relation candidates, again in parallel.
    triplet_text = []
    with Pool(10) as pool:
        for triplet in pool.imap_unordered(filter_relation_sets, all_relation_pairs):
            if len(triplet) > 0:
                triplet_text.append(triplet)
    return triplet_text
def chunk_mapping(sentences, col=2):
    """
    Create a dictionary and a mapping of chunk tags, sorted by frequency.

    `col` selects which token field holds the chunk tag (default 2).
    Returns (frequency dict, chunk_to_id, id_to_chunk).
    """
    tags = [[word[col] for word in s] for s in sentences]
    dico = create_dico(tags)
    chunk_to_id, id_to_chunk = create_mapping(dico)
    # Fixed: Python 2 print statement -> print() call.
    print("Found %i unique Chunk tags" % len(dico))
    return dico, chunk_to_id, id_to_chunk
def tag_mapping(sentences): """ Create a dictionary and a mapping of tags, sorted by frequency. """ tags = [[word[-1] for word in s] for s in sentences] dico = create_dico(tags) tag_to_id, id_to_tag = create_mapping(dico) print("Found %i unique named entity tags" % (len(dico))) return dico, tag_to_id, id_to_tag
def char_mapping(sentences): """ Create a dictionary and mapping of characters, sorted by frequency. """ chars = ["".join([w[0] for w in s]) for s in sentences] dico = create_dico(chars) char_to_id, id_to_char = create_mapping(dico) print("Found %i unique characters" % len(dico)) return dico, char_to_id, id_to_char
def dep_mapping(sentences): """ Create a dictionary and a mapping of dep tags, sorted by frequency. """ tags = [[word[4] for word in s] for s in sentences] dico = create_dico(tags) print dico tag_to_id, id_to_tag = create_mapping(dico) print "Found %i unique Dependency Role tags" % len(dico) return dico, tag_to_id, id_to_tag
def mor_mapping(sentences): """ Create a dictionary and a mapping of pos, sorted by frequency. """ tags = [[word[2] for word in s] for s in sentences] dico = create_dico(tags) dico['<UNK>'] = 10000000 mor_to_id, id_to_mor = create_mapping(dico) print("Found %i unique causality pos" % len(dico)) return dico, mor_to_id, id_to_mor
def dep_verb_mapping(sentences):
    """
    Create a dictionary and a mapping of tags, sorted by frequency.

    Uses the part of field 4 before the first '|'; <UNK> gets a huge
    count so it sorts first. Returns (dict, pos_to_id, id_to_pos).
    """
    tags = [[word[4].split('|')[0] for word in s] for s in sentences]
    dico = create_dico(tags)
    dico['<UNK>'] = 10000000
    pos_to_id, id_to_pos = create_mapping(dico)
    # Fixed: Python 2 print statement -> print() call.
    print("Found %i unique verb dep words" % len(dico))
    return dico, pos_to_id, id_to_pos
def word_mapping(sentences, lower, vocabulary_size, pre_train=None):
    """
    Create a dictionary and a mapping of words, sorted by frequency.

    If `pre_train` is given, words without a pretrained embedding have
    their count zeroed and the mapping is rebuilt with an <UNK> entry.
    Returns (frequency dict, word_to_id, id_to_word).
    """
    words = [[x[0].lower() if lower else x[0] for x in s] for s in sentences]
    dico = create_dico(words)
    word_to_id, id_to_word = create_mapping(dico, vocabulary_size)
    print("Found %i unique words (%i in total)" % (len(dico), sum(len(x) for x in words)))
    if pre_train:
        emb_dictionary = read_pre_training(pre_train)
        # Fixed: dict.iterkeys() is Python 2 only; plain dict iteration works
        # on both versions (only values are reassigned, no keys inserted).
        for word in dico:
            if word not in emb_dictionary:
                dico[word] = 0
        dico['<UNK>'] = 10000000
        word_to_id, id_to_word = create_mapping(dico, vocabulary_size)
    return dico, word_to_id, id_to_word
def pos_mapping(sentences, position=1):
    """
    Create a dictionary and a mapping of poss, sorted by frequency.

    `position` selects the token field holding the POS tag (default 1).
    Returns (frequency dict, tag_to_id, id_to_tag).
    """
    tags = [[word[position] for word in s] for s in sentences]
    dico = create_dico(tags)
    tag_to_id, id_to_tag = create_mapping(dico)
    # Fixed: Python 2 print statement -> print() call.
    # NOTE(review): the message says "named entity tags" although this maps
    # POS tags — looks like a copy-paste; output kept unchanged.
    print("Found %i unique named entity tags" % len(dico))
    return dico, tag_to_id, id_to_tag
def loc_mapping(sentences): """ Create a dictionary and a mapping of location labels, sorted by frequency. """ loc = [[x[6] for x in s] for s in sentences] dico = create_dico(loc) dico['<UNK>'] = 10000000 loc_to_id, id_to_loc = create_mapping(dico) print("Found %i unique location (%i in total)" % (len(dico), sum(len(x) for x in loc))) return dico, loc_to_id, id_to_loc
def char_mapping(sentences): """ Create a dictionary and a mapping of chars, sorted by frequency. """ chars = [[x[0] for x in s] for s in sentences] dico = create_dico(chars) dico['<UNK>'] = 10000000 char_to_id, id_to_char = create_mapping(dico) print "Found %i unique chars (%i in total)" % (len(dico), sum(len(x) for x in chars)) return dico, char_to_id, id_to_char
def pos_mapping(sentences): """ Create a dictionary and a mapping of pos tags, sorted by frequency. """ pos_tags = [[x[1] for x in s] for s in sentences] dico = create_dico(pos_tags) dico['<UNKPOS>'] = sys.maxint pos_tag_to_id, id_to_pos_tag = create_mapping(dico) print "Found %i unique pos tags (%i in total)" % ( len(dico), sum(len(x) for x in pos_tags)) return dico, pos_tag_to_id, id_to_pos_tag
def pos_mapping(sentences): """ Create a dictionary and a mapping of words, sorted by frequency. """ pos = [[x[1] for x in s] for s in sentences] dico = create_dico(pos) dico['<UNK>'] = 10000000 pos_to_id, id_to_pos = create_mapping(dico) print "Found %i unique words (%i in total)" % (len(dico), sum(len(x) for x in pos)) return dico, pos_to_id, id_to_pos
def char_mapping(sentences): """ Create a dictionary and mapping of characters, sorted by frequency. """ chars = ["".join([w[0] for w in s]) for s in sentences] dico = create_dico(chars) char_to_id, id_to_char = create_mapping(dico) # SWM: replace unseen characters with special symbol (hopefully seen in training) char_to_id = defaultdict(lambda: 0, char_to_id) print "Found %i unique characters" % len(dico) return dico, char_to_id, id_to_char
def semroles_mapping(sentences):
    """
    Create a dictionary and a mapping of semantic roles labels, sorted by frequency.

    Field 4 of each token is the semantic-role label; <UNK> is ranked first.
    """
    semroles = []
    for sentence in sentences:
        semroles.append([token[4] for token in sentence])
    dico = create_dico(semroles)
    dico['<UNK>'] = 10000000
    semroles_to_id, id_to_semroles = create_mapping(dico)
    total = sum(len(labels) for labels in semroles)
    print("Found %i unique semroles (%i in total)" % (len(dico), total))
    return dico, semroles_to_id, id_to_semroles
def depNode_mapping(sentences):
    """
    Create a dictionary and a mapping of dependency node labels, sorted by frequency.

    Field 3 of each token is the dependency-node label; <UNK> is ranked first.
    """
    depNode = [[token[3] for token in sentence] for sentence in sentences]
    dico = create_dico(depNode)
    dico['<UNK>'] = 10000000
    depNode_to_id, id_to_depNode = create_mapping(dico)
    label_total = sum(len(sentence_labels) for sentence_labels in depNode)
    print("Found %i unique depNode (%i in total)" % (len(dico), label_total))
    return dico, depNode_to_id, id_to_depNode
def conNode_mapping(sentences):
    """
    Create a dictionary and a mapping of chunk labels, sorted by frequency.

    Field 2 of each token is the constituency/chunk label; <UNK> is ranked first.
    """
    conNode = [[token[2] for token in sentence] for sentence in sentences]
    dico = create_dico(conNode)
    dico['<UNK>'] = 10000000
    conNode_to_id, id_to_conNode = create_mapping(dico)
    label_total = sum(len(sentence_labels) for sentence_labels in conNode)
    print("Found %i unique conNode (%i in total)" % (len(dico), label_total))
    return dico, conNode_to_id, id_to_conNode
def char_mapping(sentences): """ Create a dictionary and mapping of characters, sorted by frequency. """ chars = [''.join([w[0] for w in sentence]) for sentence in sentences] dico = create_dico(chars) dico['<PAD>'] = 1000000 id_to_char, char_to_id = create_mapping(dico) return dico, char_to_id, id_to_char
def _get_loc_var(self):
    """Populate self.loc_var_map: locationId -> mapping of its parameterIds."""
    self.loc_var_map = {}
    for ts in self.series_list:
        hdr = ts.find(self.default_ns + "header")
        param_id = hdr.findtext(self.default_ns + "parameterId")
        location = hdr.findtext(self.default_ns + "locationId")
        self.loc_var_map[location] = create_mapping(location, param_id, self.loc_var_map)
def _get_var_loc(self):
    """Populate self.var_loc_map: parameterId -> mapping of its locationIds."""
    self.var_loc_map = {}
    for ts in self.series_list:
        hdr = ts.find(self.default_ns + "header")
        param_id = hdr.findtext(self.default_ns + "parameterId")
        location = hdr.findtext(self.default_ns + "locationId")
        self.var_loc_map[param_id] = create_mapping(param_id, location, self.var_loc_map)
def word_mapping(sentences, lower):
    """
    Create a dictionary and a mapping of words, sorted by frequency.

    Words are lowercased when `lower` is truthy; <UNK> is ranked first.
    """
    def normalize(token):
        return token.lower() if lower else token

    words = [[normalize(x[0]) for x in s] for s in sentences]
    dico = create_dico(words)
    dico['<UNK>'] = 10000000
    word_to_id, id_to_word = create_mapping(dico)
    word_total = sum(len(sentence_words) for sentence_words in words)
    print("Found %i unique words (%i in total)" % (len(dico), word_total))
    return dico, word_to_id, id_to_word
def tag_mapping(sentences): """ Create a dictionary and a mapping of tags, sorted by frequency. """ tags = [word[-1] for s in sentences for word in s] dico = dict(Counter(tags)) dico[model.START_TAG] = -1 dico[model.STOP_TAG] = -2 tag_to_id, id_to_tag = create_mapping(dico) print("Found %i unique named entity tags" % len(dico)) return dico, tag_to_id, id_to_tag
def pos_mapping(sentences, col=1):
    """
    Create a dictionary and a mapping of pos tags, sorted by frequency.

    `col` selects which token field holds the POS tag (default 1).
    Returns (frequency dict, pos_to_id, id_to_pos).
    """
    tags = [[word[col] for word in s] for s in sentences]
    dico = create_dico(tags)
    pos_to_id, id_to_pos = create_mapping(dico)
    # Fixed: Python 2 print statement -> print() call.
    print("Found %i unique POS tags" % len(dico))
    return dico, pos_to_id, id_to_pos
def word_mapping(sentences, vocabulary_size, pre_train=None):
    """
    Create a dictionary and a mapping of words, sorted by frequency.

    If `pre_train` is given, words without a pretrained embedding have
    their count zeroed and the mapping is rebuilt with an <UNK> entry.
    Returns (frequency dict, word_to_id, id_to_word).
    """
    words = [[x[0] for x in s] for s in sentences]
    dico = create_dico(words)
    word_to_id, id_to_word = create_mapping(dico, vocabulary_size)
    print("Found %i unique words (%i in total)" % (len(dico), sum(len(x) for x in words)))
    if pre_train:
        emb_dictionary = read_pre_training(pre_train)
        # Fixed: dict.iterkeys() is Python 2 only; plain dict iteration works
        # on both versions (only values are reassigned, no keys inserted).
        for word in dico:
            if word not in emb_dictionary:
                dico[word] = 0
        dico['<UNK>'] = 10000000
        word_to_id, id_to_word = create_mapping(dico, vocabulary_size)
    return dico, word_to_id, id_to_word
def head_mapping(sentences): """ Create a dictionary and a mapping of head tags, sorted by frequency. """ tags = [[word[3] for word in s] for s in sentences] dico = create_dico(tags) dico['MAX'] = 10000000 print dico tag_to_id, id_to_tag = create_mapping(dico) print "Found %i unique Head index tags" % len(dico) return dico, tag_to_id, id_to_tag
def word_mapping(sentences, lower):
    """
    Create a dictionary and a mapping of words, sorted by frequency.

    Words are lowercased when `lower` is truthy; <UNK> is ranked first.
    """
    words = []
    for sentence in sentences:
        if lower:
            words.append([token[0].lower() for token in sentence])
        else:
            words.append([token[0] for token in sentence])
    dico = create_dico(words)
    dico['<UNK>'] = 10000000
    word_to_id, id_to_word = create_mapping(dico)
    total = sum(len(sentence_words) for sentence_words in words)
    print("Found %i unique words (%i in total)" % (
        len(dico), total
    ))
    return dico, word_to_id, id_to_word
def char_mapping(sentences): """ Create a dictionary and mapping of characters, sorted by frequency. """ chars = ''.join([w[0] for s in sentences for w in s]) dico = dict(Counter(chars)) dico['<PAD>'] = 10000001 dico['<UNK>'] = 10000000 char_to_id, id_to_char = create_mapping(dico) print("Found %i unique characters" % len(dico)) return dico, char_to_id, id_to_char
def tag_mapping(data_path, data_type):
    """
    Create a dictionary and a mapping of tags, sorted by frequency.

    Reads space-separated tag lines from <data_path><data_type>_labels.txt,
    dropping the trailing token of each line.
    """
    labels_path = data_path + data_type + "_labels.txt"
    with open(labels_path, "r") as handle:
        tags = [line.split(" ")[:-1] for line in handle.readlines()]
    dico = create_dico(tags)
    dico[model.START_TAG] = -1
    dico[model.STOP_TAG] = -2
    tag_to_id, id_to_tag = create_mapping(dico)
    print("Found %i unique named entity tags" % len(dico))
    return dico, tag_to_id, id_to_tag
def tag_mapping(sentences): """ Create a dictionary and a mapping of tags, sorted by frequency. """ tags = [[word[-1] for word in s] for s in sentences] dico = create_dico(tags) dico[model.START_TAG] = -1 dico[model.STOP_TAG] = -2 id_to_tag, tag_to_id = create_mapping(dico) return dico, tag_to_id, id_to_tag
def tag_mapping(sentences): """ Create a dictionary and a mapping of tags, sorted by frequency. """ tags = [[char[-1] for char in s] for s in sentences] dico = create_dico(tags) dico['[SEP]'] = len(dico) + 1 dico['[CLS]'] = len(dico) + 2 tag_to_id, id_to_tag = create_mapping(dico) print("Found %i unique named entity tags" % len(dico)) return dico, tag_to_id, id_to_tag