def precision_recall(golds, predictions, N=1):
    # Column-wise micro precision/recall based on multiset n-gram overlap.
    assert len(golds) == len(predictions)
    golds = transpose(golds)
    predictions = transpose(predictions)
    precisions, recalls = [], []
    for golds_per_col, preds_per_col in zip(golds, predictions):
        TP = []
        FP = []
        FN = []
        for g, p in zip(golds_per_col, preds_per_col):
            tp = []
            g_ngrams = flatten(get_ngram(g, 1, N))
            p_ngrams = flatten(get_ngram(p, 1, N))
            # Each predicted n-gram can match at most one gold n-gram.
            for pn in copy.deepcopy(p_ngrams):
                if pn in g_ngrams:
                    tp.append(pn)
                    g_ngrams.pop(g_ngrams.index(pn))
                    p_ngrams.pop(p_ngrams.index(pn))
            fn = g_ngrams  # gold n-grams left unmatched
            fp = p_ngrams  # predicted n-grams left unmatched
            TP.append(tp)
            FP.append(fp)
            FN.append(fn)
        TP = len(flatten(TP))
        FP = len(flatten(FP))
        FN = len(flatten(FN))
        precisions.append(1.0 * TP / (TP + FP))
        recalls.append(1.0 * TP / (TP + FN))
    return precisions, recalls
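# Hedged usage sketch for precision_recall (not part of the original repo):
# `transpose`, `flatten` and `get_ngram` are assumed to be the repo's own
# helpers, and '__NUM__' is only an illustrative placeholder token.
def _demo_precision_recall():
    golds = [['at least $ __NUM__'.split()],
             ['$ __NUM__ or more'.split()]]
    preds = [['at $ __NUM__'.split()],
             ['$ __NUM__ or'.split()]]
    precisions, recalls = precision_recall(golds, preds, N=2)
    print precisions, recalls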
def test(self, data):
    inputs = []
    outputs = []
    e_predictions = []
    j_predictions = []
    num_steps = 0
    epoch_time = 0.0
    for i, batch in enumerate(data):
        feed_dict = self.get_input_feed(batch, False)
        t = time.time()
        inputs.append(batch.texts)
        batch_predictions = self.sess.run(self.e_predictions, feed_dict)
        batch_predictions = np.transpose(batch_predictions, (0, 2, 1))
        e_predictions.append(batch_predictions)
        batch_predictions = self.sess.run(self.j_predictions, feed_dict)
        batch_predictions = np.transpose(batch_predictions, (0, 2, 1))
        j_predictions.append(batch_predictions)
        epoch_time += time.time() - t
        num_steps += 1
    inputs = flatten(inputs)
    e_predictions = flatten(e_predictions)
    j_predictions = flatten(j_predictions)
    inputs = [self.vocab.e_word.id2sent(u, join=True) for u in inputs]
    outputs = inputs
    e_predictions = [[self.vocab.e_word.id2sent(r, join=True) for r in p]
                     for p in e_predictions]
    j_predictions = [[self.vocab.j_word.id2sent(r, join=True) for r in p]
                     for p in j_predictions]
    return (inputs, outputs, e_predictions, j_predictions), epoch_time
def create_vocab(self, texts, vocab_path, vocab_size=0):
    '''
    Args:
    - texts: List of words.
    - vocab_path: The path to which the vocabulary will be saved.
    '''
    start_vocab = self.start_vocab
    rev_vocab, freq = zip(*collections.Counter(texts).most_common())
    rev_vocab = common.flatten([self.tokenizer(w) for w in rev_vocab])
    if type(rev_vocab[0]) == list:
        rev_vocab = common.flatten(rev_vocab)
    rev_vocab = OrderedSet(start_vocab + rev_vocab)
    if vocab_size:
        rev_vocab = OrderedSet(
            [w for i, w in enumerate(rev_vocab) if i < vocab_size])
    freq = [0 for _ in start_vocab] + list(freq)
    freq = freq[:len(rev_vocab)]
    vocab = collections.OrderedDict()
    for i, t in enumerate(rev_vocab):
        vocab[t] = i

    # Save the vocabulary to a file.
    if vocab_path is not None:
        with open(vocab_path, 'w') as f:
            for k, v in zip(rev_vocab, freq):
                if type(k) == unicode:
                    k = k.encode('utf-8')
                f.write('%s\t%d\n' % (k, v))
    return vocab, rev_vocab
def test(self, data):
    inputs = []
    outputs = []
    speaker_changes = []
    predictions = []
    num_steps = 0
    epoch_time = 0.0
    for i, batch in enumerate(data):
        feed_dict = self.get_input_feed(batch, False)
        t = time.time()
        batch_predictions = self.sess.run(self.predictions, feed_dict)
        epoch_time += time.time() - t
        num_steps += 1
        inputs.append(batch.w_contexts)
        outputs.append(batch.responses)
        speaker_changes.append(batch.speaker_changes)
        predictions.append(batch_predictions)
    inputs = flatten(inputs)
    outputs = flatten(outputs)
    speaker_changes = flatten(speaker_changes)
    predictions = flatten(predictions)
    inputs = [[self.w_vocab.id2sent(u, join=True) for u in c] for c in inputs]
    outputs = [self.w_vocab.id2sent(r, join=True) for r in outputs]
    # [batch_size, utterance_max_len, beam_width]
    #   -> [batch_size, beam_width, utterance_max_len]
    predictions = [[self.w_vocab.id2sent(r, join=True) for r in zip(*p)]
                   for p in predictions]
    speaker_changes = [BooleanVocab.id2sent(sc) for sc in speaker_changes]
    return (inputs, outputs, speaker_changes, predictions), epoch_time
def oov_rate(self):
    if not self.load:
        return None
    context_tokens = common.flatten(self.symbolized.w_contexts, depth=2)
    response_tokens = common.flatten(self.symbolized.responses)
    context_tokens = Counter(context_tokens)
    response_tokens = Counter(response_tokens)
    context_unk_rate = 1.0 * context_tokens[UNK_ID] / sum(context_tokens.values())
    response_unk_rate = 1.0 * response_tokens[UNK_ID] / sum(response_tokens.values())
    return context_unk_rate, response_unk_rate
def _matching(g, p, N):
    p_ngrams = common.get_ngram(p, 1, N)
    g_ngrams = common.get_ngram(g, 1, N)
    TP = []
    FP = []
    FN = []
    for gn, pn in zip(g_ngrams, p_ngrams):
        tp, fp, fn = exact_matching(gn, pn)
        TP.extend(tp)
        FP.extend(fp)
        FN.extend(fn)
    assert len(TP + FN) == len(common.flatten(g_ngrams))
    assert len(TP + FP) == len(common.flatten(p_ngrams))
    return TP, common.flatten(g_ngrams), common.flatten(p_ngrams)  # FP, FN
def ngram_matching(gold, pred, N):
    # pred, gold: lists of token sequences; empty entries are dropped.
    gold = [x for x in gold if x]
    pred = [x for x in pred if x]

    def _matching(g, p, N):
        p_ngrams = common.get_ngram(p, 1, N)
        g_ngrams = common.get_ngram(g, 1, N)
        TP = []
        FP = []
        FN = []
        for gn, pn in zip(g_ngrams, p_ngrams):
            tp, fp, fn = exact_matching(gn, pn)
            TP.extend(tp)
            FP.extend(fp)
            FN.extend(fn)
        assert len(TP + FN) == len(common.flatten(g_ngrams))
        assert len(TP + FP) == len(common.flatten(p_ngrams))
        return TP, common.flatten(g_ngrams), common.flatten(p_ngrams)  # FP, FN

    # Example inputs for debugging:
    if args.debug:
        pred = ['at $ __NUM__'.split(), "$ __NUM__ or".split()]
        gold = ["$ __NUM__ or more".split(), 'less $ __NUM__'.split(),
                "at least $ __NUM__".split()]

    # Score every (gold, pred) pair, then find the best one-to-one alignment.
    # Note: only the average of the two overlap ratios is used as the
    # (symmetric) alignment score.
    f1 = np.zeros((len(gold), len(pred)))
    result_matrix = [[] for _ in xrange(len(gold))]
    for i in xrange(len(gold)):
        for j in xrange(len(pred)):
            tp, g_ngrams, p_ngrams = _matching(gold[i], pred[j], N)
            result_matrix[i].append(tp)
            prec = 1.0 * len(tp) / len(g_ngrams)
            recall = 1.0 * len(tp) / len(p_ngrams)
            f1[i][j] = 0.5 * (prec + recall)
    matching = linear_assignment(-f1)

    if args.debug:
        print pred
        print gold
        print
        for i, j in matching:
            print gold[i], pred[j]
            print common.get_ngram(gold[i], 1, N)
            print common.get_ngram(pred[j], 1, N)
        print matching
        exit(1)

    TP = common.flatten([result_matrix[i][j] for i, j in matching])
    gold_ngrams = common.flatten(
        [common.flatten(common.get_ngram(g, 1, N)) for g in gold])
    pred_ngrams = common.flatten(
        [common.flatten(common.get_ngram(p, 1, N)) for p in pred])
    return TP, gold_ngrams, pred_ngrams
def extract(self, indices, lines):
    # When indices are provided, len(lines) and len(indices) can differ,
    # since indices (and cluster_ids) are assigned to each NUM token
    # appearing in a line.
    patterns_with_scores = self.get_patterns_with_score()
    if indices is not None:
        # Align the token indices to their lines.
        idx_by_line = [[] for _ in xrange(len(lines))]
        for l_idx, t_idx in indices:
            idx_by_line[l_idx].append(t_idx)

        predictions = []
        for line, idxs in zip(lines, idx_by_line):
            spans = common.flatten(
                [extract_around_target(line, t_idx, patterns_with_scores)
                 for t_idx in idxs])
            spans = sorted(spans, key=lambda x: -x[1])
            # Greedily keep the highest-scored spans that do not overlap.
            accepted_spans = []
            for new_span, score in spans:
                existing_spans = [span for span, _ in accepted_spans]
                if common.no_overlaps(existing_spans, new_span):
                    accepted_spans.append((new_span, score))
            accepted_spans = sorted([span for span, _ in accepted_spans],
                                    key=lambda x: x[0])
            exprs = spans2exprs(accepted_spans, line)
            predictions.append(exprs)
    else:
        predictions = []
        for i, line in enumerate(lines):
            exprs = spans2exprs(get_ngram_matches(line, patterns_with_scores), line)
            predictions.append(exprs)
    return predictions, None
def contain_synonym_around_num(sentence, num_indices, window_width=4):
    '''
    sentence: List of string (a lemmatized and tokenized sentence).
    num_indices: List of integer (the indices where NUM appears).
    '''
    # TODO: remove words that can be irrelevant.
    synonyms = [
        'price', 'toll', 'cost', 'pay', 'worth', 'sell', 'charge', 'expend'
    ]
    # Whether the words on the left side of NUM contain one of the synonyms
    # (e.g. 'cost $ 30').
    words = common.flatten(
        [sentence[max(0, idx - window_width):idx] for idx in num_indices])
    return set(synonyms).intersection(set(words))
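# Hedged usage sketch (assumes common.flatten and a '__NUM__' placeholder
# token, as elsewhere in this repo): the return value is the set of matched
# synonyms, which is truthy when any of them occurs within `window_width`
# tokens to the left of a NUM.
def _demo_contain_synonym_around_num():
    sentence = 'the ticket cost $ __NUM__ last year'.split()
    num_indices = [i for i, w in enumerate(sentence) if w == '__NUM__']
    print contain_synonym_around_num(sentence, num_indices, window_width=4)
    # -> set(['cost'])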
def _get_ngram(s, ngram_range):
    stop_words = set(['.', ',', '!', '?'])
    # Keep only n-grams that contain NUM and no sentence punctuation.
    vocab_condition = lambda x: NUM in x and not stop_words.intersection(set(x))
    return flatten([[
        tuple(s[i:i + n]) for i in xrange(len(s) - n + 1)
        if vocab_condition(s[i:i + n])
    ] for n in xrange(ngram_range[0], ngram_range[1] + 1)])
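# Hedged usage sketch for _get_ngram (assumes NUM == '__NUM__' and the repo's
# flatten helper): only n-grams that include NUM and no punctuation survive.
def _demo_get_ngram():
    s = 'it costs $ __NUM__ .'.split()
    # e.g. ('__NUM__',), ('$', '__NUM__'), ('costs', '$', '__NUM__');
    # any n-gram containing '.' is filtered out by the stop-word check.
    print _get_ngram(s, (1, 3))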
def main(args):
    window_width = 3
    sents = [l.replace('\n', '').split(' ') for l in open(args.input_file)]
    indices_around_num = [[(max(0, i - window_width), min(len(l), i + window_width))
                           for i, x in enumerate(l) if x == NUM] for l in sents]

    # Count words within a window around NUM.
    words = []
    for i, (idx, s) in enumerate(zip(indices_around_num, sents)):
        w = common.flatten([s[x[0]:x[1]] for x in idx])
        words.append(w)
    words = common.flatten(words)
    for x in sorted(Counter(words).items(), key=lambda x: -x[1])[:2000]:
        print x
    exit(1)

    # NOTE: everything below the first exit(1) is unreachable exploratory code.
    #########################
    ### Count all tokens.
    sents = common.flatten(sents)
    for x in sorted(Counter(sents).items(), key=lambda x: -x[1])[:10000]:
        print x
    exit(1)

    #########################
    ### Count n-grams around NUM.
    vectorizer = NGramVectorizer(ngram_range=(1, 4), min_freq=5)
    ngrams = vectorizer.fit_transform(sents)

    def _get_ngram(s, ngram_range):
        stop_words = set(['.', ',', '!', '?'])
        vocab_condition = lambda x: NUM in x and not stop_words.intersection(set(x))
        return flatten([[
            tuple(s[i:i + n]) for i in xrange(len(s) - n + 1)
            if vocab_condition(s[i:i + n])
        ] for n in xrange(ngram_range[0], ngram_range[1] + 1)])

    ngram_range = (1, 4)
    ngrams = [_get_ngram(s, ngram_range) for s in sents]
    for ng, freq in Counter(flatten(ngrams)).most_common(10000):
        print ng, freq
def get_features(self, lines, input_filepath=None):
    return None, [
        common.flatten(
            common.get_ngram(s, self.ngram_range[0], self.ngram_range[1],
                             vocab_condition=self.vocab_condition))
        for s in lines
    ]
def output_training(self, features):
    counts = sorted(
        [(k, v) for k, v in collections.Counter(common.flatten(features)).items()
         if not self.config.min_freq or v >= self.config.min_freq],
        key=lambda x: -x[1])
    if self.config.vocab_size:
        counts = counts[:self.config.vocab_size]
    pickle.dump(counts, open(self.vocab_path, 'wb'))
    with open(self.vocab_path + '.txt', 'w') as f:
        for k, v in counts:
            # One tab-separated entry per line.
            l = '%s\t%s\n' % (" ".join(k), str(v))
            f.write(l)
def create_vocab(self, source, vocab_size=0):
    '''
    Args:
    - source: List of words.
    '''
    rev_vocab, freq = zip(
        *collections.Counter(source).most_common()) if source else ([], None)
    rev_vocab = common.flatten([self.tokenizer(w) for w in rev_vocab])
    if rev_vocab and type(rev_vocab[0]) == list:
        rev_vocab = common.flatten(rev_vocab)
    rev_vocab = OrderedSet(self.start_vocab + list(rev_vocab))
    if vocab_size:
        rev_vocab = OrderedSet(
            [w for i, w in enumerate(rev_vocab) if i < vocab_size])
    vocab = collections.OrderedDict()
    for i, t in enumerate(rev_vocab):
        vocab[t] = i
    return vocab, rev_vocab
def preprocess(self, df):
    data = []
    for x in df.values:
        d = self.preprocess_dialogue(x, context_max_len=self.context_max_len)
        if d:
            data.append(d)
    data = common.flatten(data)
    dialogues, acts, emotions, speaker_changes, topics = list(zip(*data))
    # Keep a dialogue as (context, response) only when the speaker changes
    # at the final utterance.
    contexts, responses, speaker_changes = zip(
        *[(d[:-1], d[-1], sc[:-1])
          for d, sc in zip(dialogues, speaker_changes) if sc[-1] == True])
    return contexts, responses, speaker_changes
def vec2tokens(self, vectors):
    tokens = []
    current_dim = 0
    for v in self.vectorizers:
        size = v.size
        vecs = vectors[:, current_dim:current_dim + size]
        current_dim += size
        tokens.append(v.vec2tokens(vecs))
    res = [common.flatten(t) for t in zip(*tokens)]
    return res
def init_vocab(self, emb_configs, vocab_size=0):
    start_vocab = START_VOCAB
    pretrained = [
        self.load_vocab(c['path'], c['format'] == 'vec') for c in emb_configs
    ]
    rev_vocab = common.flatten([e.keys() for e in pretrained])
    rev_vocab = OrderedSet(
        start_vocab + [self.tokenizer(w, flatten=True)[0] for w in rev_vocab])
    if vocab_size:
        rev_vocab = OrderedSet(
            [w for i, w in enumerate(rev_vocab) if i < vocab_size])
    vocab = collections.OrderedDict()
    for i, t in enumerate(rev_vocab):
        vocab[t] = i
    # Concatenate the pre-trained embeddings of each word.
    embeddings = [
        common.flatten([emb[w] for emb in pretrained]) for w in vocab
    ]
    embeddings = np.array(embeddings)
    return vocab, rev_vocab, embeddings
def create_vocab(self, ngrams):
    vocab = collections.Counter(common.flatten(ngrams))
    vocab = sorted(
        [(v, vocab[v]) for v in vocab
         if not self.min_freq or vocab[v] >= self.min_freq],
        key=lambda x: -x[1])
    if self.vocab_size:
        vocab = vocab[:self.vocab_size]
    self.vocab = [v[0] for v in vocab]
    self.rev_vocab = collections.OrderedDict([
        (v, i) for i, v in enumerate(self.vocab)
    ])
    self._save_vocab()
def preprocess_dialogue(self, line, context_max_len=0, split_turn=False):
    idx, dialogue, act, emotion, topic = line
    dialogue = [self.preprocess_turn(x.strip(), split_turn)
                for x in dialogue.split(_EOU) if x.strip()]
    act = [[int(a) for _ in xrange(len(d))]
           for a, d in zip(act.split(), dialogue)]
    emotion = [[int(e) for _ in xrange(len(d))]
               for e, d in zip(emotion.split(), dialogue)]
    # True at the first utterance of each speaker's turn, otherwise False.
    speaker_change = [[True if i == 0 else False for i in xrange(len(d))]
                      for d in dialogue]
    dialogue = common.flatten(dialogue)
    act = common.flatten(act)
    emotion = common.flatten(emotion)
    speaker_change = common.flatten(speaker_change)

    # The lengths of the dialogue and its labels must be the same.
    if len(set([len(dialogue), len(act), len(emotion)])) == 1:
        # The maximum length of a dialogue is context_max_len + 1 (response).
        dialogue_max_len = context_max_len + 1 if context_max_len else 0
        if not dialogue_max_len or len(dialogue) < dialogue_max_len:
            return [(dialogue, act, emotion, speaker_change, topic)]
        else:
            # Slice the dialogue into shorter overlapping windows.
            res = common.flatten(
                [[(dialogue[i:i+dlen], act[i:i+dlen], emotion[i:i+dlen],
                   speaker_change[i:i+dlen], topic)
                  for i in xrange(len(dialogue) + 1 - dlen)]
                 for dlen in range(2, dialogue_max_len + 1)])
            return res
    else:
        return None
def extract(input_texts):
    # Deprecated.
    # Code for expression extraction (this is to be done after clustering?).
    ins_count = 0
    showed_list = []
    # NOTE: the original body referenced `doc` and `i` without defining them;
    # iterating over the (presumably parsed) input texts is assumed here.
    for i, doc in enumerate(input_texts):
        idx_expression = extract_expression(doc)
        if idx_expression and idx_expression not in showed_list:
            print "<L%d>\t" % i
            flattened_indice = list(set(common.flatten(idx_expression)))
            print 'Original sentence:\t', common.print_colored(
                [t.text for t in doc], flattened_indice, 'red')
            print 'POS list         :\t', common.print_colored(
                [t.pos_ for t in doc], flattened_indice, 'blue')
            print 'Expressions      :\t',
            print [(" ".join([doc[k].text for k in indices]), indices[0], indices[-1])
                   for indices in idx_expression]
            showed_list.append(idx_expression)
            ins_count += 1
    return ins_count
def get_features(self, lines, input_filepath=None):
    docs = create_spacy(lines, input_filepath)
    indices = []
    features = []
    feature_f = self.subtree2str
    for i, d in enumerate(docs):
        # Get features per NUM token in a line: [(token_idx0, subtrees0), ...]
        offset = 0
        feature = []
        for s in d.sents:
            feature.append([(offset + idx, [feature_f(st) for st in sts])
                            for idx, sts in self.trace(s)])
            offset += len(s)
        feature = common.flatten(feature)
        idx, feature = zip(*feature) if feature else ((-1,), ([],))
        indices += [(i, j) for j in idx]
        features += list(feature)
    assert len(indices) == len(features)
    return indices, features
def get_ngram_matches(line, feature_scores):
    # feature_scores: defaultdict mapping ngram -> score.
    ngram_length = set([len(k) for k in feature_scores.keys()])
    min_n = min(ngram_length)
    max_n = max(ngram_length)
    if type(line) == str:
        line = line.split(' ')
    test_sent_ngrams = common.flatten(
        common.get_ngram(line, min_n, max_n, vocab_condition=VOCAB_CONDITION))
    possible_expr = list(set(feature_scores.keys()).intersection(test_sent_ngrams))
    possible_expr = sorted([(e, feature_scores[e]) for e in possible_expr],
                           key=lambda x: -x[1])
    possible_expr = [e[0] for e in possible_expr]
    spans = []
    for expr in possible_expr:
        new_spans = common.get_ngram_match(line, expr)
        # Keep a newly acquired span only if it does not overlap with a span
        # of higher priority.
        new_spans = [ns for ns in new_spans if common.no_overlaps(spans, ns)]
        spans.extend(new_spans)
    return spans
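# Hedged usage sketch for get_ngram_matches (relies on the repo's common.*
# helpers and VOCAB_CONDITION; the pattern scores below are made up):
# higher-scored patterns claim their spans first, and overlapping
# lower-scored matches are discarded.
def _demo_get_ngram_matches():
    feature_scores = collections.defaultdict(float)
    feature_scores[('$', '__NUM__')] = 2.0
    feature_scores[('at', 'least', '$', '__NUM__')] = 3.0
    line = 'they charge at least $ __NUM__ per night'
    print get_ngram_matches(line, feature_scores)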
def load_data(self):
    self.load = True
    sys.stderr.write('Loading dataset from %s ...\n' % (self.path))
    df = pd.read_csv(self.path, nrows=self.max_lines)
    sys.stderr.write('Preprocessing ...\n')
    contexts, responses, speaker_changes = self.preprocess(df)

    if not self.wbase and not self.cbase:
        raise ValueError('Either \'wbase\' or \'cbase\' must be True.')

    self.speaker_changes = [self.sc_vocab.sent2id(sc) for sc in speaker_changes]

    # Separate contexts and responses into words (or chars),
    # and convert them into their IDs.
    self.original = common.dotDict({})
    self.symbolized = common.dotDict({})

    if self.wbase:
        self.original.w_contexts = [[self.w_vocab.tokenizer(u) for u in context]
                                    for context in contexts]
        self.symbolized.w_contexts = [[self.w_vocab.sent2id(u) for u in context]
                                      for context in self.original.w_contexts]
    else:
        self.original.w_contexts = [None for context in contexts]
        self.symbolized.w_contexts = [None for context in contexts]

    if self.cbase:
        self.original.c_contexts = [[self.c_vocab.tokenizer(u) for u in context]
                                    for context in contexts]
        self.symbolized.c_contexts = [[self.c_vocab.sent2id(u) for u in context]
                                      for context in self.original.c_contexts]
    else:
        self.original.c_contexts = [None for context in contexts]
        self.symbolized.c_contexts = [None for context in contexts]

    self.original.responses = [self.w_vocab.tokenizer(r) for r in responses]
    self.symbolized.responses = [self.w_vocab.sent2id(r) for r in responses]

    responses = self.symbolized.responses
    w_contexts = self.symbolized.w_contexts
    self.texts = common.flatten(w_contexts) + list(responses)
def init_vocab(self, emb_configs, vocab_size=0):
    # Combine the specified pre-trained embeddings.
    pretrained = [
        self.load_vocab(c['path'], skip_first=c['skip_first'],
                        vocab_size=vocab_size) for c in emb_configs
    ]
    rev_vocab = common.flatten([e.keys() for e in pretrained])
    rev_vocab = OrderedSet(self.start_vocab + list(rev_vocab))
    if vocab_size:
        rev_vocab = OrderedSet([
            w for i, w in enumerate(rev_vocab)
            if i < vocab_size + len(self.start_vocab)
        ])
    vocab = collections.OrderedDict()
    for i, t in enumerate(rev_vocab):
        vocab[t] = i
    # Concatenate the embeddings from each source along the last axis.
    embeddings = [np.array([emb[w] for w in vocab]) for emb in pretrained]
    embeddings = np.concatenate(embeddings, axis=-1)
    return vocab, rev_vocab, embeddings
def get_words(self, train_data_path):
    df = pd.read_csv(train_data_path)
    data = self.dataset_type.preprocess(df, context_max_len=0)
    dialogues, _, _, _ = list(zip(*data))
    words = common.flatten([utterance.split()
                            for utterance in common.flatten(dialogues)])
    return words
def contain_currency_symbol_around_num(sentence, num_indices, window_width=4):
    words = common.flatten(
        [sentence[max(0, idx - window_width):idx] for idx in num_indices])
    return set(c_symbols).intersection(set(words))
def contain_currency_name_around_num(sentence, num_indices, window_width=4):
    # Whether the words on the right side of NUM contain one of the currency
    # names (e.g. '30 dollars').
    words = common.flatten(
        [sentence[idx + 1:idx + 1 + window_width] for idx in num_indices])
    return set(c_names).intersection(set(words))
def separate_concatenated_tokens(tokens):
    return common.flatten([x.split('|') for x in tokens])
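# Minimal sketch (assumes common.flatten as in the rest of the repo):
# tokens joined with '|' are split back into separate tokens.
def _demo_separate_concatenated_tokens():
    print separate_concatenated_tokens(['new|york', 'city'])
    # -> ['new', 'york', 'city']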
def create_vocab(self, vocab_path, texts, vocab_size=0):
    texts = common.flatten(
        [self.tokenizer.word2chars(word) for word in texts])
    # Pass the arguments by keyword to avoid depending on the parent's
    # positional order.
    return WordVocabulary.create_vocab(self, texts=texts, vocab_path=vocab_path,
                                       vocab_size=vocab_size)
def _get_weighted_frequency(feats):
    # Sum the counts per feature, then weight each total by its n-gram length.
    scores = collections.defaultdict(int)
    for k, v in common.flatten(feats):
        scores[k] += v
    return sorted([(k, v * len(k)) for k, v in scores.items()],
                  key=lambda x: -x[1])
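# Hedged worked example for _get_weighted_frequency (the data are made up):
# per-line (ngram, count) pairs are summed and then weighted by n-gram
# length, so longer patterns can outrank more frequent short ones.
def _demo_get_weighted_frequency():
    feats = [[(('$', '__NUM__'), 2)],
             [(('$', '__NUM__'), 1), (('at', 'least', '$', '__NUM__'), 2)]]
    # ('$', '__NUM__'): 3 occurrences * length 2 = 6
    # ('at', 'least', '$', '__NUM__'): 2 occurrences * length 4 = 8
    print _get_weighted_frequency(feats)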