def load_set(self, fname, cache_dir=None):
    # TODO: Make the cache-handling generic,
    # and offer a way to actually pass cache_dir
    save_cache = False
    if cache_dir:
        import os.path
        fname_abs = os.path.abspath(fname)
        from hashlib import md5
        cache_filename = "%s/%s.p" % (cache_dir, md5(fname_abs.encode("utf-8")).hexdigest())
        try:
            with open(cache_filename, "rb") as f:
                return pickle.load(f)
        except (IOError, TypeError, KeyError):
            save_cache = True

    s0, s1, y = loader.load_hypev(fname)
    if self.vocab is None:
        vocab = Vocabulary(s0 + s1)  # FIXME: lower?
    else:
        vocab = self.vocab
    si0 = vocab.vectorize(s0, spad=self.s0pad)
    si1 = vocab.vectorize(s1, spad=self.s1pad)
    f0, f1 = nlp.sentence_flags(s0, s1, self.s0pad, self.s1pad)
    gr = graph_input_anssel(si0, si1, y, f0, f1, s0, s1)
    gr, y = self.merge_questions(gr)

    if save_cache:
        with open(cache_filename, "wb") as f:
            pickle.dump((s0, s1, y, vocab, gr), f)
            print("save")
    return (gr, y, vocab)

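# A minimal sketch of the generic cache handling the TODO above asks for:
# one helper that wraps any loader callback, so each load_set variant could
# become e.g. `return _cached_load(fname, cache_dir, lambda: self._build_set(fname))`.
# _cached_load, compute, and _build_set are hypothetical names, not part of
# the existing codebase.
def _cached_load(fname, cache_dir, compute):
    """Return compute(), memoized in cache_dir as a pickle keyed by a hash of fname."""
    import os.path
    import pickle
    from hashlib import md5
    if not cache_dir:
        return compute()
    cache_filename = "%s/%s.p" % (cache_dir, md5(os.path.abspath(fname).encode("utf-8")).hexdigest())
    try:
        with open(cache_filename, "rb") as f:
            return pickle.load(f)
    except (IOError, TypeError, KeyError):
        result = compute()
        with open(cache_filename, "wb") as f:
            pickle.dump(result, f)
        return result
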
def load_set(fname, vocab=None, s0pad=s0pad, s1pad=s1pad, cache_dir=None, skip_oneclass=True):
    """Load a dataset, with caching: if cache_dir is set, try to load the
    finished dataset from it (the cache filename is a hash of fname); if
    that fails, compute the dataset and try to save it there."""
    save_cache = False
    if cache_dir:
        fname_abs = os.path.abspath(fname)
        from hashlib import md5
        cache_filename = "%s/%s.p" % (cache_dir, md5(fname_abs.encode("utf-8")).hexdigest())
        try:
            with open(cache_filename, "rb") as f:
                return pickle.load(f)
        except (IOError, TypeError, KeyError):
            save_cache = True

    s0, s1, y, t = loader.load_anssel(fname, skip_oneclass=skip_oneclass)
    # TODO: Make use of the t-annotations
    if vocab is None:
        vocab = Vocabulary(s0 + s1)
    si0 = vocab.vectorize(s0, spad=s0pad)
    si1 = vocab.vectorize(s1, spad=s1pad)
    f0, f1 = nlp.sentence_flags(s0, s1, s0pad, s1pad)
    gr = graph_input_anssel(si0, si1, y, f0, f1, s0, s1)

    if save_cache:
        with open(cache_filename, "wb") as f:
            pickle.dump((s0, s1, y, vocab, gr), f)
            print("save")
    return (s0, s1, y, vocab, gr)

def load_set(files, vocab=None, skip_unlabeled=True, spad=spad):
    def load_file(fname, skip_unlabeled=True):
        # XXX: ugly logic
        if 'sick2014' in fname:
            return loader.load_sick2014(fname)
        else:
            return loader.load_sts(fname, skip_unlabeled=skip_unlabeled)

    try:
        strtype = basestring  # Python 2
    except NameError:
        strtype = str  # Python 3
    if isinstance(files, strtype):
        s0, s1, y = load_file(files, skip_unlabeled=skip_unlabeled)
    else:
        s0, s1, y = loader.concat_datasets([load_file(d, skip_unlabeled=skip_unlabeled) for d in files])

    if vocab is None:
        vocab = Vocabulary(s0 + s1)
    si0 = vocab.vectorize(s0, spad=spad)
    si1 = vocab.vectorize(s1, spad=spad)
    f0, f1 = nlp.sentence_flags(s0, s1, spad, spad)
    gr = graph_input_sts(si0, si1, y, f0, f1, s0, s1)
    return (s0, s1, y, vocab, gr)

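# Hypothetical usage of the STS load_set above: `files` may be a single
# filename or a list of files whose datasets are concatenated (the paths
# below are illustrative, not guaranteed to exist).
# s0, s1, y, vocab, gr = load_set('sts/semeval-sts/all/2015.train.tsv')
# s0, s1, y, vocab, gr = load_set(['sts/semeval-sts/all/2015.train.tsv',
#                                  'sts/sick2014/SICK_train.txt'], vocab=vocab)
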
def load_set(fname, vocab):
    s0, s1, y = loader.load_snli(fname, vocab)
    si0 = vocab.vectorize(s0, spad=spad)
    si1 = vocab.vectorize(s1, spad=spad)
    f0, f1 = nlp.sentence_flags(s0, s1, spad, spad)
    gr = graph_input_anssel(si0, si1, y, f0, f1, s0, s1)
    return (si0, si1, y, gr)

def load_set(self, fname, cache_dir=None):
    # TODO: Make the cache-handling generic,
    # and offer a way to actually pass cache_dir
    save_cache = False
    if cache_dir:
        import os.path
        fname_abs = os.path.abspath(fname)
        from hashlib import md5
        cache_filename = "%s/%s.p" % (cache_dir, md5(fname_abs.encode("utf-8")).hexdigest())
        try:
            with open(cache_filename, "rb") as f:
                return pickle.load(f)
        except (IOError, TypeError, KeyError):
            save_cache = True

    skip_oneclass = self.c.get('skip_oneclass', True)
    s0, s1, y, kw, akw, t = loader.load_anssel(fname, skip_oneclass=skip_oneclass)
    # TODO: Make use of the t-annotations
    if self.vocab is None:
        vocab = Vocabulary(s0 + s1)
    else:
        vocab = self.vocab
    si0 = vocab.vectorize(s0, spad=self.s0pad)
    si1 = vocab.vectorize(s1, spad=self.s1pad)
    f0, f1 = nlp.sentence_flags(s0, s1, self.s0pad, self.s1pad)
    gr = graph_input_anssel(si0, si1, y, f0, f1, s0, s1, kw=kw, akw=akw)

    if save_cache:
        with open(cache_filename, "wb") as f:
            pickle.dump((s0, s1, y, vocab, gr), f)
            print("save")
    return (gr, y, vocab)

def load_set(dsfile, vocab):
    s0i = []
    s1i = []
    f0 = []
    f1 = []
    labels = []
    i = 0
    with open(dsfile) as f:
        c = csv.reader(f, delimiter=',')
        for qtext, atext, label in c:
            if i % 10000 == 0:
                print('%d samples' % (i,))
            try:
                qtext = qtext.decode('utf8')
                atext = atext.decode('utf8')
            except AttributeError:
                pass  # Python 3 str has no .decode()
            s0 = qtext.replace('</s>', '__EOS__').split(' ')
            s1 = atext.replace('</s>', '__EOS__').split(' ')
            si0 = vocab.vectorize([s0], spad=None)
            si1 = vocab.vectorize([s1], spad=None)
            f0_, f1_ = nlp.sentence_flags([s0], [s1], len(s0), len(s1))
            s0i.append(si0[0])
            s1i.append(si1[0])
            f0.append(f0_[0])
            f1.append(f1_[0])
            labels.append(int(label))
            i += 1
            if i > MAX_SAMPLES:
                break
    return (s0i, s1i, f0, f1, labels)

def load_set(fname, vocab=None):
    s0, s1, y, t = loader.load_anssel(fname)
    if vocab is None:
        vocab = Vocabulary(s0 + s1)
    si0 = vocab.vectorize(s0)
    si1 = vocab.vectorize(s1)
    f0, f1 = nlp.sentence_flags(s0, s1, s0pad, s1pad)
    gr = graph_input_anssel(si0, si1, y, f0, f1)
    return (s0, s1, y, vocab, gr)

def load_set(fname, vocab=None):
    s0, s1, y, _, _, _ = loader.load_anssel(fname)
    if vocab is None:
        vocab = Vocabulary(s0 + s1)
    si0 = vocab.vectorize(s0)
    si1 = vocab.vectorize(s1)
    f0, f1 = nlp.sentence_flags(s0, s1, s0pad, s1pad)
    gr = graph_input_anssel(si0, si1, y, f0, f1)
    return (s0, s1, y, vocab, gr)

def load_sent(q, a, vocab=None):
    s0, s1, y = [q], [a], 1  # s0=questions, s1=answers
    if vocab is None:
        vocab = Vocabulary(s0 + s1)
    si0 = vocab.vectorize(s0, spad=s0pad)
    si1 = vocab.vectorize(s1, spad=s1pad)
    f0, f1 = nlp.sentence_flags(s0, s1, s0pad, s1pad)
    gr = graph_input_anssel(si0, si1, y, f0, f1)
    return gr

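# Hypothetical usage of load_sent above, scoring a single question/answer
# pair; the token lists, `vocab`, and `model` (with a Keras-style predict())
# are illustrative stand-ins, not names defined in this file.
# q = 'what is the capital of france'.split(' ')
# a = 'paris is the capital of france'.split(' ')
# gr = load_sent(q, a, vocab=vocab)
# score = model.predict(gr)
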
def load_set(fname, vocab=None):
    s0, s1, y = loader.load_hypev(fname)  # s0=questions, s1=answers
    if vocab is None:
        vocab = Vocabulary(s0 + s1)
    si0 = vocab.vectorize(s0, spad=s0pad)
    si1 = vocab.vectorize(s1, spad=s1pad)
    f0, f1 = nlp.sentence_flags(s0, s1, s0pad, s1pad)
    gr = graph_input_anssel(si0, si1, y, f0, f1, s0, s1)
    return s0, s1, y, vocab, gr

def load_set(fname, vocab=None, s0pad=s0pad, s1pad=s1pad):
    s0, s1, y, t = loader.load_anssel(fname)
    # TODO: Make use of the t-annotations
    if vocab is None:
        vocab = Vocabulary(s0 + s1)
    si0 = vocab.vectorize(s0, spad=s0pad)
    si1 = vocab.vectorize(s1, spad=s1pad)
    f0, f1 = nlp.sentence_flags(s0, s1, s0pad, s1pad)
    gr = graph_input_anssel(si0, si1, y, f0, f1, s0, s1)
    return (s0, s1, y, vocab, gr)

def load_set(self, fname, cache_dir=None):
    # TODO: Make the cache-handling generic,
    # and offer a way to actually pass cache_dir
    save_cache = False
    if cache_dir:
        import os.path
        fname_abs = os.path.abspath(fname)
        from hashlib import md5
        cache_filename = "%s/%s.p" % (cache_dir, md5(fname_abs.encode("utf-8")).hexdigest())
        try:
            with open(cache_filename, "rb") as f:
                return pickle.load(f)
        except (IOError, TypeError, KeyError):
            save_cache = True

    skip_oneclass = self.c.get('skip_oneclass', True)
    s0, s1, y, kw, akw, t = loader.load_anssel(fname, skip_oneclass=skip_oneclass)
    # TODO: Make use of the t-annotations
    if self.vocab is None:
        vocab = Vocabulary(s0 + s1, prune_N=self.c['embprune'], icase=self.c['embicase'])
    else:
        vocab = self.vocab
    si0, sj0 = vocab.vectorize(s0, self.emb, spad=self.s0pad)
    si1, sj1 = vocab.vectorize(s1, self.emb, spad=self.s1pad)
    f0, f1 = nlp.sentence_flags(s0, s1, self.s0pad, self.s1pad)
    gr = graph_input_anssel(si0, si1, sj0, sj1, None, None, y, f0, f1, s0, s1, kw=kw, akw=akw)

    if save_cache:
        with open(cache_filename, "wb") as f:
            pickle.dump((s0, s1, y, vocab, gr), f)
            print("save")
    return (gr, y, vocab)

def load_set(self, fname):
    s0, s1, y = loader.load_msrpara(fname)
    if self.vocab is None:
        vocab = Vocabulary(s0 + s1)
    else:
        vocab = self.vocab
    si0 = vocab.vectorize(s0, spad=self.s0pad)
    si1 = vocab.vectorize(s1, spad=self.s1pad)
    f0, f1 = nlp.sentence_flags(s0, s1, self.s0pad, self.s1pad)
    gr = graph_input_anssel(si0, si1, y, f0, f1, s0, s1)
    return (gr, y, vocab)

def load_set(self, fname):
    s0, s1, y = loader.load_sick2014(fname, mode='entailment')
    if self.vocab is None:
        vocab = Vocabulary(s0 + s1, prune_N=self.c['embprune'], icase=self.c['embicase'])
    else:
        vocab = self.vocab
    si0, sj0 = vocab.vectorize(s0, self.emb, spad=self.s0pad)
    si1, sj1 = vocab.vectorize(s1, self.emb, spad=self.s1pad)
    f0, f1 = nlp.sentence_flags(s0, s1, self.s0pad, self.s1pad)
    gr = graph_input_anssel(si0, si1, sj0, sj1, None, None, y, f0, f1, s0, s1)
    return (gr, y, vocab)

def load_set(self, fname, cache_dir=None):
    s0, s1, y = loader.load_hypev(fname)
    if self.vocab is None:
        vocab = Vocabulary(s0 + s1)
    else:
        vocab = self.vocab
    si0 = vocab.vectorize(s0, spad=self.s0pad)
    si1 = vocab.vectorize(s1, spad=self.s1pad)
    f0, f1 = nlp.sentence_flags(s0, s1, self.s0pad, self.s1pad)
    gr = graph_input_anssel(si0, si1, y, f0, f1, s0, s1)
    return (gr, y, vocab)

def links_to_graph(self, links):
    s0 = []
    s1 = []
    labels = []
    for link in links:
        s0l, s1l, labelsl = self.link_to_s(link)
        s0 += s0l
        s1 += s1l
        labels += labelsl
    si0 = self.vocab.vectorize(s0, spad=self.s0pad)
    si1 = self.vocab.vectorize(s1, spad=self.s1pad)
    f0, f1 = nlp.sentence_flags(s0, s1, self.s0pad, self.s1pad)
    gr = graph_input_anssel(si0, si1, np.array(labels), f0, f1)
    return gr

def load_set(fname, emb, vocab=None):
    s0, s1, y, _, _, _ = loader.load_anssel(fname)
    if vocab is None:
        vocab = Vocabulary(s0 + s1)
    si0, sj0 = vocab.vectorize(s0, emb)
    si1, sj1 = vocab.vectorize(s1, emb)
    se0 = emb.map_jset(sj0)
    se1 = emb.map_jset(sj1)
    f0, f1 = nlp.sentence_flags(s0, s1, s0pad, s1pad)
    gr = graph_input_anssel(si0, si1, sj0, sj1, se0, se1, y, f0, f1)
    # XXX: Pre-generating the whole (se0, se1) produces a *big* memory footprint
    # for the dataset.  In KeraSTS, we solve this by using fit_generator (also
    # because of epoch_fract) and embed just per-batch.
    return (s0, s1, y, vocab, gr)

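# A minimal sketch of the per-batch embedding strategy described in the XXX
# comment above: keep only the integer (sj0, sj1) indices in memory and map
# them to dense (se0, se1) matrices one batch at a time, e.g. as a Keras
# fit_generator() feed.  batch_generator and batch_size are hypothetical
# names, and the 'sj0'/'sj1' keys are assumed from graph_input_anssel.
def batch_generator(gr, emb, batch_size=32):
    n = len(gr['sj0'])
    while True:  # fit_generator expects an endless generator
        for i in range(0, n, batch_size):
            batch = {k: v[i:i + batch_size] for k, v in gr.items()}
            batch['se0'] = emb.map_jset(batch['sj0'])  # embed just this batch
            batch['se1'] = emb.map_jset(batch['sj1'])
            yield batch
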
def load_set(files, vocab=None, skip_unlabeled=True):
    def load_file(fname, skip_unlabeled=True):
        # XXX: ugly logic
        if 'sick2014' in fname:
            return loader.load_sick2014(fname)
        else:
            return loader.load_sts(fname, skip_unlabeled=skip_unlabeled)
    s0, s1, y = loader.concat_datasets([load_file(d, skip_unlabeled=skip_unlabeled) for d in files])

    if vocab is None:
        vocab = Vocabulary(s0 + s1)
    si0 = vocab.vectorize(s0, spad=spad)
    si1 = vocab.vectorize(s1, spad=spad)
    f0, f1 = nlp.sentence_flags(s0, s1, spad, spad)
    gr = graph_input_sts(si0, si1, y, f0, f1)
    return (s0, s1, y, vocab, gr)

def load_set(self, fname):
    def load_file(fname, skip_unlabeled=True):
        # XXX: ugly logic
        if 'sick2014' in fname:
            return loader.load_sick2014(fname)
        else:
            return loader.load_sts(fname, skip_unlabeled=skip_unlabeled)
    s0, s1, y = load_file(fname)

    if self.vocab is None:
        vocab = Vocabulary(s0 + s1)
    else:
        vocab = self.vocab
    si0 = vocab.vectorize(s0, spad=self.s0pad)
    si1 = vocab.vectorize(s1, spad=self.s1pad)
    f0, f1 = nlp.sentence_flags(s0, s1, self.s0pad, self.s1pad)
    gr = graph_input_sts(si0, si1, y, f0, f1, s0, s1)
    return (gr, y, vocab)

def load_set(self, fname):
    def load_file(fname, skip_unlabeled=True):
        # XXX: ugly logic
        if 'sick2014' in fname:
            return loader.load_sick2014(fname)
        else:
            return loader.load_sts(fname, skip_unlabeled=skip_unlabeled)
    s0, s1, y = load_file(fname)

    if self.vocab is None:
        vocab = Vocabulary(s0 + s1, prune_N=self.c['embprune'], icase=self.c['embicase'])
    else:
        vocab = self.vocab
    si0, sj0 = vocab.vectorize(s0, self.emb, spad=self.s0pad)
    si1, sj1 = vocab.vectorize(s1, self.emb, spad=self.s1pad)
    f0, f1 = nlp.sentence_flags(s0, s1, self.s0pad, self.s1pad)
    gr = graph_input_sts(si0, si1, sj0, sj1, y, f0, f1, s0, s1)
    return (gr, y, vocab)

def load_set(self, fname, lists=None):
    if lists:
        s0, s1, y = lists
    else:
        # s0, s1, y = loader.load_msrpara(fname)  # re-enable if we decide not to use the quora dataset
        s0, s1, y = loader.load_quora(fname)

    if self.vocab is None:
        vocab = Vocabulary(s0 + s1, prune_N=self.c['embprune'], icase=self.c['embicase'])
    else:
        vocab = self.vocab
    si0, sj0 = vocab.vectorize(s0, self.emb, spad=self.s0pad)
    si1, sj1 = vocab.vectorize(s1, self.emb, spad=self.s1pad)
    f0, f1 = nlp.sentence_flags(s0, s1, self.s0pad, self.s1pad)
    gr = graph_nparray_anssel(graph_input_anssel(si0, si1, sj0, sj1, None, None, y, f0, f1))
    return (gr, y, vocab)

def load_set(self, fname, cache_dir=None, lists=None):
    # TODO: Make the cache-handling generic,
    # and offer a way to actually pass cache_dir
    save_cache = False
    if cache_dir:
        import os.path
        fname_abs = os.path.abspath(fname)
        from hashlib import md5
        cache_filename = "%s/%s.p" % (cache_dir, md5(fname_abs.encode("utf-8")).hexdigest())
        try:
            with open(cache_filename, "rb") as f:
                return pickle.load(f)
        except (IOError, TypeError, KeyError):
            save_cache = True

    if lists is not None:
        s0, s1, y, qids, xtra, types = lists
    else:
        xtra = None
        if '/mc' in fname:
            s0, s1, y, qids, types = loader.load_mctest(fname)
        else:
            s0, s1, y, qids = loader.load_hypev(fname)
            try:
                dsfile = re.sub(r'\.([^.]*)$', '_aux.tsv', fname)  # train.tsv -> train_aux.tsv
                with open(dsfile) as f:
                    rows = csv.DictReader(f, delimiter='\t')
                    xtra = loader.load_hypev_xtra(rows)
                    print(dsfile + ' loaded and available')
            except Exception as e:
                if self.c['aux_r'] or self.c['aux_c']:
                    raise e
            types = None

    if self.vocab is None:
        vocab = Vocabulary(s0 + s1, prune_N=self.c['embprune'], icase=self.c['embicase'])
    else:
        vocab = self.vocab

    # mcqtypes pruning must happen *after* Vocabulary has been constructed!
    if types is not None:
        s0 = [x for x, t in zip(s0, types) if t in self.c['mcqtypes']]
        s1 = [x for x, t in zip(s1, types) if t in self.c['mcqtypes']]
        y = [x for x, t in zip(y, types) if t in self.c['mcqtypes']]
        qids = [x for x, t in zip(qids, types) if t in self.c['mcqtypes']]
        print('Retained %d questions, %d hypotheses (%s types)'
              % (len(set(qids)), len(set([' '.join(s) for s in s0])), self.c['mcqtypes']))

    si0, sj0 = vocab.vectorize(s0, self.emb, spad=self.s0pad)
    si1, sj1 = vocab.vectorize(s1, self.emb, spad=self.s1pad)
    f0, f1 = nlp.sentence_flags(s0, s1, self.s0pad, self.s1pad)
    gr = graph_input_anssel(si0, si1, sj0, sj1, None, None, y, f0, f1, s0, s1)
    if qids is not None:
        gr['qids'] = qids
    if xtra is not None:
        gr['#'] = xtra['#']
        gr['@'] = xtra['@']
    gr, y = self.merge_questions(gr)

    if save_cache:
        with open(cache_filename, "wb") as f:
            pickle.dump((s0, s1, y, vocab, gr), f)
            print("save")
    return (gr, y, vocab)

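# Illustration of the _aux.tsv sidecar convention used above: auxiliary
# per-sample features for a dataset file are read from a tab-separated file
# (with a header row) derived from its name; the path is illustrative.
# re.sub(r'\.([^.]*)$', '_aux.tsv', 'hypev/argus/train.tsv')
# -> 'hypev/argus/train_aux.tsv'
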
def load_set(fname, vocab):
    s0, s1, labels = loader.load_snli(fname, vocab)
    si0 = vocab.vectorize(s0, spad)
    si1 = vocab.vectorize(s1, spad)
    f0, f1 = nlp.sentence_flags(s0, s1, spad, spad)
    return (si0, si1, f0, f1, labels)

def load_set(fname, vocab, glove):
    s0, s1, labels = loader.load_snli(fname, vocab)
    si0, sj0 = vocab.vectorize(s0, glove, spad)
    si1, sj1 = vocab.vectorize(s1, glove, spad)
    f0_, f1_ = nlp.sentence_flags(s0, s1, spad, spad)
    return (si0, si1, sj0, sj1, f0_, f1_, labels)