# The load_set variants below assume module-level imports along these lines
# (the pysts module paths are an assumption; only the names loader, nlp,
# Vocabulary and graph_input_anssel are actually used in the code):
#   import csv, pickle, re
#   import pysts.loader as loader
#   import pysts.nlp as nlp
#   from pysts.vocab import Vocabulary
#   from pysts.kerasts import graph_input_anssel

def load_set(self, fname, cache_dir=None):
    # TODO: Make the cache-handling generic,
    # and offer a way to actually pass cache_dir
    save_cache = False
    if cache_dir:
        import os.path
        fname_abs = os.path.abspath(fname)
        from hashlib import md5
        cache_filename = "%s/%s.p" % (cache_dir, md5(fname_abs.encode("utf-8")).hexdigest())
        try:
            with open(cache_filename, "rb") as f:
                # the cache stores the tuple pickled below; unpack it so a
                # cache hit returns the same shape as the cold path
                s0, s1, y, vocab, gr = pickle.load(f)
            return (gr, y, vocab)
        except (IOError, TypeError, KeyError):
            save_cache = True

    s0, s1, y = loader.load_hypev(fname)
    if self.vocab is None:
        vocab = Vocabulary(s0 + s1)  # FIXME: lower?
    else:
        vocab = self.vocab
    si0 = vocab.vectorize(s0, spad=self.s0pad)
    si1 = vocab.vectorize(s1, spad=self.s1pad)
    f0, f1 = nlp.sentence_flags(s0, s1, self.s0pad, self.s1pad)
    gr = graph_input_anssel(si0, si1, y, f0, f1, s0, s1)
    gr, y = self.merge_questions(gr)

    if save_cache:
        with open(cache_filename, "wb") as f:
            pickle.dump((s0, s1, y, vocab, gr), f)
            print("saved cache %s" % cache_filename)
    return (gr, y, vocab)
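# The TODO above asks for generic cache handling. A minimal sketch of what
# that could look like, factored out of the function above; the helper name
# `cached` is an assumption, and the md5-of-absolute-path cache key simply
# mirrors the existing code. Note the key covers only the path, not the file
# contents, so an edited dataset needs its cache cleared by hand (the same
# limitation as the original code).
import os.path
import pickle
from hashlib import md5

def cached(cache_dir, fname, compute):
    """Return compute(), memoized in cache_dir and keyed by fname's abspath."""
    if not cache_dir:
        return compute()
    key = md5(os.path.abspath(fname).encode("utf-8")).hexdigest()
    cache_filename = "%s/%s.p" % (cache_dir, key)
    try:
        with open(cache_filename, "rb") as f:
            return pickle.load(f)
    except (IOError, TypeError, KeyError):
        pass
    result = compute()
    with open(cache_filename, "wb") as f:
        pickle.dump(result, f)
    return result

# With such a helper, a load_set body could reduce to
#   return cached(cache_dir, fname, lambda: self._load_set_uncached(fname))
# where _load_set_uncached is a hypothetical method holding the cold path.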
def load_set(fname, vocab=None):
    s0, s1, y = loader.load_hypev(fname)  # s0=questions, s1=answers
    if vocab is None:
        vocab = Vocabulary(s0 + s1)
    si0 = vocab.vectorize(s0, spad=s0pad)
    si1 = vocab.vectorize(s1, spad=s1pad)
    f0, f1 = nlp.sentence_flags(s0, s1, s0pad, s1pad)
    gr = graph_input_anssel(si0, si1, y, f0, f1, s0, s1)
    return s0, s1, y, vocab, gr
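# Hypothetical usage of the module-level variant above: build the vocabulary
# on the training split, then pass it in for the validation split so both use
# the same word indices (file names are placeholders, and s0pad/s1pad are
# assumed module-level pad lengths).
s0, s1, y, vocab, gr_train = load_set('hypev-train.tsv')
_, _, y_val, _, gr_val = load_set('hypev-val.tsv', vocab=vocab)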
def load_set(self, fname, cache_dir=None):
    # cache_dir is accepted for interface compatibility with the cached
    # variants, but this variant never uses it
    s0, s1, y = loader.load_hypev(fname)
    if self.vocab is None:
        vocab = Vocabulary(s0 + s1)
    else:
        vocab = self.vocab
    si0 = vocab.vectorize(s0, spad=self.s0pad)
    si1 = vocab.vectorize(s1, spad=self.s1pad)
    f0, f1 = nlp.sentence_flags(s0, s1, self.s0pad, self.s1pad)
    gr = graph_input_anssel(si0, si1, y, f0, f1, s0, s1)
    return (gr, y, vocab)
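# A hedged sketch of driving the method above from a task object; the class
# name HypEvTask is hypothetical, not taken from the source.
task = HypEvTask()
gr_train, y_train, vocab = task.load_set('hypev-train.tsv')
task.vocab = vocab  # later calls then reuse the training vocabulary
gr_val, y_val, _ = task.load_set('hypev-val.tsv')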
def load_set(self, fname, cache_dir=None, lists=None):
    # TODO: Make the cache-handling generic,
    # and offer a way to actually pass cache_dir
    save_cache = False
    if cache_dir:
        import os.path
        fname_abs = os.path.abspath(fname)
        from hashlib import md5
        cache_filename = "%s/%s.p" % (cache_dir, md5(fname_abs.encode("utf-8")).hexdigest())
        try:
            with open(cache_filename, "rb") as f:
                # unpack the cached tuple so a cache hit returns the same
                # shape as the cold path
                s0, s1, y, vocab, gr = pickle.load(f)
            return (gr, y, vocab)
        except (IOError, TypeError, KeyError):
            save_cache = True

    if lists is not None:
        s0, s1, y, qids, xtra, types = lists
    else:
        xtra = None
        if '/mc' in fname:
            s0, s1, y, qids, types = loader.load_mctest(fname)
        else:
            s0, s1, y, qids = loader.load_hypev(fname)
            try:
                dsfile = re.sub(r'\.([^.]*)$', '_aux.tsv', fname)  # train.tsv -> train_aux.tsv
                with open(dsfile) as f:
                    rows = csv.DictReader(f, delimiter='\t')
                    xtra = loader.load_hypev_xtra(rows)
                    print(dsfile + ' loaded and available')
            except Exception as e:
                # auxiliary features are optional unless the config demands them
                if self.c['aux_r'] or self.c['aux_c']:
                    raise e
            types = None

    if self.vocab is None:
        vocab = Vocabulary(s0 + s1, prune_N=self.c['embprune'], icase=self.c['embicase'])
    else:
        vocab = self.vocab

    # mcqtypes pruning must happen *after* Vocabulary has been constructed!
    if types is not None:
        s0 = [x for x, t in zip(s0, types) if t in self.c['mcqtypes']]
        s1 = [x for x, t in zip(s1, types) if t in self.c['mcqtypes']]
        y = [x for x, t in zip(y, types) if t in self.c['mcqtypes']]
        qids = [x for x, t in zip(qids, types) if t in self.c['mcqtypes']]
        print('Retained %d questions, %d hypotheses (%s types)'
              % (len(set(qids)), len(set([' '.join(s) for s in s0])), self.c['mcqtypes']))

    si0, sj0 = vocab.vectorize(s0, self.emb, spad=self.s0pad)
    si1, sj1 = vocab.vectorize(s1, self.emb, spad=self.s1pad)
    f0, f1 = nlp.sentence_flags(s0, s1, self.s0pad, self.s1pad)
    gr = graph_input_anssel(si0, si1, sj0, sj1, None, None, y, f0, f1, s0, s1)
    if qids is not None:
        gr['qids'] = qids
    if xtra is not None:
        gr['#'] = xtra['#']
        gr['@'] = xtra['@']
    gr, y = self.merge_questions(gr)

    if save_cache:
        with open(cache_filename, "wb") as f:
            pickle.dump((s0, s1, y, vocab, gr), f)
            print("saved cache %s" % cache_filename)
    return (gr, y, vocab)
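# Hypothetical call to the full variant above: caching enabled, MCTest files
# dispatched on the '/mc' path fragment, and auxiliary features picked up
# automatically from train_aux.tsv when it sits next to train.tsv (all paths
# here are placeholders).
gr, y, vocab = task.load_set('data/hypev/train.tsv', cache_dir='.cache')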