def init_vocab(self, emb_configs, vocab_size):
  # Load several pretrained embeddings and concatenate them.
  pretrained = [self.load(c['path'], vocab_size, c['size'], c['skip_first'])
                for c in emb_configs]
  rev_vocab = common.flatten([list(e.keys()) for e in pretrained])
  rev_vocab = self.start_vocab + rev_vocab[:vocab_size]

  vocab = collections.OrderedDict()
  for i, t in enumerate(rev_vocab):
    vocab[t] = i

  # Merge pretrained embeddings.
  if self.normalize_embedding:
    # Normalize the pretrained embeddings for each of the embedding types.
    embeddings = [common.flatten([common.normalize_vector(emb[w])
                                  for emb in pretrained])
                  for w in vocab]
  else:
    embeddings = [common.flatten([emb[w] for emb in pretrained])
                  for w in vocab]

  # Tokens in START_VOCAB are randomly initialized.
  #rand_gen = random_embedding_generator(len(embeddings[0]))
  #for i in range(len(self.start_vocab)):
  #  embeddings[i] = rand_gen()

  embeddings = np.array(embeddings)
  sys.stderr.write("Done loading word embeddings.\n")
  return vocab, rev_vocab, embeddings
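# A minimal sketch of the helper referenced above via common.normalize_vector,
# assuming it performs plain L2 normalization of a single word vector; the
# repo's actual implementation may differ.
def _l2_normalize_sketch(vec):
  # Euclidean norm; all-zero vectors are returned unchanged to avoid division by zero.
  norm = sum(x * x for x in vec) ** 0.5
  return [x / norm for x in vec] if norm > 0 else list(vec)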
def read_text(text, tokenizer):
  if isinstance(text, list):
    assert type(text[0]) == str
    words = flatten(tokenizer(sent) for sent in text)
  else:
    words = tokenizer(text)
  return words
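# Minimal sketch of the flatten helper used above (and as common.flatten
# elsewhere in this code), assuming it concatenates an iterable of lists into
# one flat list; the repo's actual implementation may differ.
from itertools import chain

def _flatten_sketch(iterable_of_lists):
  # chain.from_iterable accepts both lists of lists and generator expressions,
  # matching the two call patterns in read_text above.
  return list(chain.from_iterable(iterable_of_lists))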
def merge(self, pretrained, vocab_size,
          vocab_merge_type='union', embedding_merge_type='first_found'):
  '''
  <Args>
  - pretrained: A list of dictionaries, {word: vector}.
  - vocab_merge_type: ['union', 'intersection']
  - embedding_merge_type: ['first_found', 'average', 'concat']
  '''
  if vocab_merge_type != 'union':
    raise NotImplementedError
  if embedding_merge_type != 'first_found':
    raise NotImplementedError

  # Union of the vocabularies, keeping the order of first appearance.
  rev_vocab = list(
    OrderedSet(common.flatten([list(v.keys()) for v in pretrained])))
  embedding_size = len(pretrained[0][rev_vocab[0]])
  embedding_dict = defaultdict(zero_embedding_generator(embedding_size))
  for w in rev_vocab:
    # 'first_found': take the vector from the first embedding that contains w.
    embedding_dict[w] = [vecs[w] for vecs in pretrained if w in vecs][0]
  return embedding_dict
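# Sketch of the default-factory helper assumed by the defaultdict in merge():
# zero_embedding_generator(size) must return a zero-argument callable that
# yields a zero vector of the given size. This contract is inferred from the
# call site; the repo's real helper may differ (e.g. random initialization).
def _zero_embedding_generator_sketch(embedding_size):
  def _gen():
    return [0.0] * embedding_size
  return _gen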
def decorate_text(self_class, example, vocab, prediction=None):
  '''
  Args:
  - example: A recDotDefaultDict, one example of a flattened batch.
             Refer to WikiP2DRelExDataset.article2entries.
  '''
  text = copy.deepcopy(example.text.flat)
  query = example.query

  # The decorated positions do not depend on the loop variable, so collect them once.
  query_positions = set([
    j for j in range(query.flat_position[0], query.flat_position[1] + 1)
  ])
  gold_mention_positions = set(
    flatten([[j for j in range(begin, end + 1)]
             for begin, end in example.mentions.flat_position]))
  if PAD_ID in gold_mention_positions:
    gold_mention_positions.remove(PAD_ID)

  for i, w in enumerate(text):
    if vocab.word.is_unk(w):
      text[i] = UNDERLINE + text[i]
    if i in query_positions:
      text[i] = MAGENTA + text[i]
    if i in gold_mention_positions:
      text[i] = BLUE + text[i]
    text[i] = text[i] + RESET
  return text  #'\n'.join([' '.join(sent) for sent in text])
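# Assumed ANSI escape sequences for the formatting constants used by
# decorate_text; the real values are defined elsewhere in the repo and may differ.
UNDERLINE = '\033[4m'
MAGENTA = '\033[35m'
BLUE = '\033[34m'
RESET = '\033[0m'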
def get_input_feed(self, batch, is_training):
  input_feed = {}

  ## Texts
  if self.encoder.cbase:
    input_feed[self.ph.text.char] = batch.text.char
  if self.encoder.wbase:
    input_feed[self.ph.text.word] = batch.text.word
  input_feed[self.ph.sentence_length] = batch.sentence_length

  ## Mention spans and their clusters
  gold_mentions = sorted(tuple(m) for m in flatten(batch.clusters))
  gold_mention_map = {m: i for i, m in enumerate(gold_mentions)}
  cluster_ids = np.zeros(len(gold_mentions))
  for cluster_id, cluster in enumerate(batch.clusters):
    for mention in cluster:
      cluster_ids[gold_mention_map[tuple(mention)]] = cluster_id
  gold_starts, gold_ends = self.tensorize_mentions(gold_mentions)
  input_feed[self.ph.gold_starts] = np.array(gold_starts)
  input_feed[self.ph.gold_ends] = np.array(gold_ends)
  input_feed[self.ph.cluster_ids] = np.array(cluster_ids)

  ## Metadata
  input_feed[self.is_training] = is_training
  input_feed[self.ph.speaker_ids] = batch.speakers
  input_feed[self.ph.genre] = batch.genre

  if is_training and batch.text.word.shape[0] > self.max_training_sentences:
    return self.truncate_example(input_feed)
  else:
    return input_feed
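# A sketch of the span helper called above, assuming tensorize_mentions simply
# splits a list of (start, end) tuples into parallel start/end arrays (it
# relies on the module-level numpy import already required by get_input_feed);
# the model's real method may handle dtypes differently.
def _tensorize_mentions_sketch(mentions):
  if len(mentions) > 0:
    starts, ends = zip(*mentions)
  else:
    starts, ends = [], []
  return np.array(starts), np.array(ends)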
def preprocess(self, article):
  raw_text = [s.split() for s in article.text]
  num_words = [len(s) for s in raw_text]
  article.text = raw_text
  article.flat_text = flatten(raw_text)
  article.desc = article.desc.split()
  article.num_words = sum(num_words)
  return article
def main(args):
  word_embs = read_embedding(args.source_emb)
  data = read_jsonlines(args.dataset_path, max_rows=0)
  tokenizer = word_tokenizer(args.lowercase, args.normalize_digits)
  words = flatten([read_text(d.text, tokenizer) for d in data])
  word_freq = sorted(Counter(words).items(), key=lambda x: -x[1])

  # Print only the embeddings of words that appear in the dataset,
  # ordered by descending frequency.
  for word, freq in word_freq:
    if word in word_embs:
      line = [word] + word_embs[word]
      line = ' '.join([str(x) for x in line])
      print(line)
def _tokenizer(sent, flatten=False):
  if normalize_digits:
    sent = re.sub(_DIGIT_RE, "0", sent)
  if lowercase:
    sent = sent.lower()

  def word2chars(word):
    return [c for c in word]

  words = sent.replace('\n', '').split()
  chars = [word2chars(w) for w in words]
  if flatten:
    chars = common.flatten(chars)
  return chars
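# Usage sketch for the character-level tokenizer above. `lowercase`,
# `normalize_digits`, and `_DIGIT_RE` are free variables captured from an
# enclosing factory; the factory name and output below are illustrative
# assumptions (a single-digit example sidesteps how _DIGIT_RE handles
# multi-digit runs).
#   char_tokenizer = make_char_tokenizer(lowercase=True, normalize_digits=True)
#   char_tokenizer("Tea 4 two")
#   # -> [['t', 'e', 'a'], ['0'], ['t', 'w', 'o']]
#   char_tokenizer("Tea 4 two", flatten=True)
#   # -> ['t', 'e', 'a', '0', 't', 'w', 'o']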
def main(args):
  tokenizer = word_tokenizer(args.lowercase, args.normalize_digits,
                             separative_tokens=['-', '/'])
  data = read_jsonlines(args.descdata_path)
  desc_words = flatten([tokenizer(d.desc) for d in data])
  word_freq = OrderedDict(
    sorted(Counter(desc_words).items(), key=lambda x: -x[1]))
  embedding_dict = read_pretrained_emb(word_freq, tokenizer)
  with open(args.emb_target_path, 'w') as f:
    for w, v in embedding_dict.items():
      if not v:
        continue
      line = "%s %s\n" % (w, ' '.join([str(x) for x in v]))
      f.write(line)
def setup_tokenizer(tokenizer_type=None):
  assert tokenizer_type is None or tokenizer_type in ['corenlp', 'nltk']
  if tokenizer_type == 'corenlp':
    #from core.utils.tokenizer import connect_to_corenlp_server, run_corenlp
    corenlp = connect_to_corenlp_server(host='http://localhost', port=9000)
    tokenizer = lambda uttr: flatten(run_corenlp(uttr, corenlp))
  elif tokenizer_type == 'nltk':
    from nltk import word_tokenize
    tokenizer = word_tokenize
  else:
    tokenizer = lambda uttr: uttr.split()
  # The tokenizer must return a list of words.
  return tokenizer
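# Usage sketch (assumptions: 'nltk' needs the NLTK package with its 'punkt'
# data installed; 'corenlp' needs a CoreNLP server on localhost:9000 plus the
# connect_to_corenlp_server/run_corenlp helpers available in this module).
#   tokenizer = setup_tokenizer('nltk')
#   tokenizer("The quick brown fox.")   # -> ['The', 'quick', 'brown', 'fox', '.']
#   setup_tokenizer()("a b  c")         # whitespace split -> ['a', 'b', 'c']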
def preprocess(self, article):
  raw_text = [s.split() for s in article.text]
  num_words = [len(s) for s in raw_text]
  links = {}
  # Convert sentence-level spans to positions in the flattened word sequence.
  for qid, link in article.link.items():
    (sent_id, (begin, end)) = link
    # The offset is the number of words in the preceding sentences.
    flatten_begin = begin + sum(num_words[:sent_id])
    flatten_end = end + sum(num_words[:sent_id])
    assert flatten_begin >= 0 and flatten_end >= 0
    links[qid] = (flatten_begin, flatten_end)
  article.link = links
  article.text = flatten(raw_text)
  article.desc = article.desc.split()
  return article
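# Worked example (illustrative numbers): with sentence lengths [5, 7] and a
# link of (sent_id=1, (begin=2, end=3)), the offset is 5, so the flattened
# span stored in links[qid] becomes (7, 8).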
def _tokenizer(sent, flatten=False):
  if split_quotation:
    sent = sent.replace("'", " ' ")
  if normalize_digits:
    sent = re.sub(_DIGIT_RE, "0", sent)
  if lowercase:
    sent = sent.lower()

  def word2chars(word):
    return [c for c in word]

  words = sent.replace('\n', '')
  words = words.split() if not use_nltk_tokenizer else word_tokenize(words)
  chars = [word2chars(w) for w in words]
  if flatten:
    chars = common.flatten(chars)
  return chars
def debug(self, model=None):
  if not model:
    model = self.create_model(self.config)
  self.output_variables_as_text(model)
  exit(1)
  # NOTE: the early exit above makes the remaining debugging code unreachable
  # unless it is removed.

  batch = common.recDotDefaultDict()
  state = [common.flatten([[1, 0, 0, 0] for _ in range(160)])]
  batch.state = state
  batch.is_sente = [[1, 0] for _ in state]
  batch.current_num_cards = [[1, 1] for s in state]
  batch.is_training = False
  res = model.step(batch, 0)
  print(res)

  batches = self.dataset.get_batches(self.config.batch_size, 0,
                                     is_training=True)
  for b in batches:
    b = common.flatten_recdict(b)
    for k in b:
      print(k, b[k])
  exit(1)
def test(self, batches, mode, logger, output_path):
  results = []
  used_batches = []
  for i, batch in enumerate(batches):
    input_feed = self.get_input_feed(batch, False)
    outputs = self.sess.run(self.predictions, input_feed)
    try:
      used_batches += flatten_batch(batch)
    except Exception as e:
      pprint(batch)
      print(e)
      exit(1)
    results.append(outputs[:, 0, :])
  results = flatten([r.tolist() for r in results])

  sys.stdout = open(output_path, 'w') if output_path else sys.stdout
  bleu = evaluate_and_print(used_batches, results, vocab=self.vocab)
  if output_path:
    sys.stderr.write(
      "Output the testing results to \'{}\'.\n".format(output_path))
    sys.stdout = sys.__stdout__

  summary_dict = {}
  summary_dict['desc/%s/BLEU' % mode] = bleu
  summary = make_summary(summary_dict)
  return bleu, summary
def load_data(self):
  sys.stderr.write("Loading wikiP2D dataset from \'%s\'... \n" % self.source_path)
  data = read_jsonlines(self.source_path, max_rows=self.max_rows)
  data = [self.preprocess(d) for d in data]
  self.data = flatten([self.article2entries(d) for d in data])
def article2entries(self, article):
  def qid2entity(qid, article):
    assert qid in article.link
    s_id, (begin, end) = article.link[qid]
    # The offset is the number of words in the previous sentences.
    offset = sum([len(sent) for sent in article.text[:s_id]])

    entity = recDotDefaultDict()
    # Replace the entity's name with its actual surface form in the article.
    entity.raw = ' '.join(article.text[s_id][begin:end + 1])
    entity.position = article.link[qid]
    entity.flat_position = (begin + offset, end + offset)
    return entity

  entry = recDotDefaultDict()
  entry.qid = article.qid

  entry.text.raw = article.text
  entry.text.flat = article.flat_text
  entry.text.word = [self.vocab.word.sent2ids(s) for s in article.text]
  entry.text.char = [self.vocab.char.sent2ids(s) for s in article.text]

  entry.query = qid2entity(article.qid, article)  # (begin, end)

  # Articles with fewer than self.min_triples triples are discarded,
  # since they can be incorrect.
  if len(article.triples.subjective.ids) + len(
      article.triples.objective.ids) < self.min_triples:
    return []

  entry.mentions.raw = []
  entry.mentions.flat_position = []

  for t_type in ['subjective', 'objective']:
    entry.triples[t_type] = []
    entry.target[t_type] = [[self.vocab.rel.UNK_ID
                             for j in range(self.max_mention_width)]
                            for i in range(article.num_words)]

    for triple_idx, triple in enumerate(article.triples[t_type].ids):
      # triple = [subj, rel, obj]
      is_subjective = triple[0] == article.qid
      query_qid, rel_pid, mention_qid = (
        triple if is_subjective else reversed(triple))

      # TODO: What if the same mention has different relations to the query?
      mention = qid2entity(mention_qid, article)
      #entry.mentions[t_type].raw.append(mention.raw)
      #entry.mentions[t_type].flat_position.append(mention.flat_position)
      entry.mentions.raw.append(mention.raw)
      entry.mentions.flat_position.append(mention.flat_position)

      rel = dotDict({
        'raw': rel_pid,
        'name': self.vocab.rel.token2name(rel_pid)
      })
      begin, end = mention.flat_position
      if end - begin < self.max_mention_width:
        entry.target[t_type][begin][end - begin] = self.vocab.rel.token2id(rel_pid)
      triple = ([entry.query, rel, mention] if is_subjective
                else [mention, rel, entry.query])
      entry.triples[t_type].append(triple)

  relation_freqs = Counter(flatten(entry.target.subjective))

  # TODO: For now, these experiments focus only on subjective relations.
  entry.triples.objective = []
  #####################

  entry.loss_weights_by_label = [1.0 for _ in range(self.vocab.rel.size)]
  entry.num_mentions = len(entry.mentions.flat_position)
  return [entry]
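# Shape note (derived from the loops above): entry.target[t_type] is an
# article.num_words x self.max_mention_width table in which cell
# [begin][end - begin] holds the relation id of a gold mention spanning
# (begin, end) relative to the query, and self.vocab.rel.UNK_ID elsewhere.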