Пример #1
0
    def load_set(self, fname, cache_dir=None):
        # TODO: Make the cache-handling generic,
        # and offer a way to actually pass cache_dir
        save_cache = False
        if cache_dir:
            import os.path
            fname_abs = os.path.abspath(fname)
            from hashlib import md5
            cache_filename = "%s/%s.p" % (
                cache_dir, md5(fname_abs.encode("utf-8")).hexdigest())
            try:
                with open(cache_filename, "rb") as f:
                    return pickle.load(f)
            except (IOError, TypeError, KeyError):
                save_cache = True

        s0, s1, y = loader.load_hypev(fname)

        if self.vocab is None:
            vocab = Vocabulary(s0 + s1)  # FIXME: lower?
        else:
            vocab = self.vocab

        si0 = vocab.vectorize(s0, spad=self.s0pad)
        si1 = vocab.vectorize(s1, spad=self.s1pad)
        f0, f1 = nlp.sentence_flags(s0, s1, self.s0pad, self.s1pad)
        gr = graph_input_anssel(si0, si1, y, f0, f1, s0, s1)
        gr, y = self.merge_questions(gr)
        if save_cache:
            with open(cache_filename, "wb") as f:
                pickle.dump((s0, s1, y, vocab, gr), f)
                print("save")

        return (gr, y, vocab)
Пример #2
0
def load_set(fname, vocab=None, s0pad=s0pad, s1pad=s1pad, cache_dir=None, skip_oneclass=True):
    """ Caching: If cache_dir is set: it tries to load finished dataset from it 
        (filename of cache is hash of fname), and if that fails, it will compute 
        dataset and try to save it."""
    save_cache = False
    if cache_dir:
        fname_abs = os.path.abspath(fname)
        from hashlib import md5
        cache_filename = "%s/%s.p" % (cache_dir, md5(fname_abs.encode("utf-8")).hexdigest())

        try:
            with open(cache_filename, "rb") as f:
                return pickle.load(f)
        except (IOError, TypeError, KeyError):
            save_cache=True

    s0, s1, y, t = loader.load_anssel(fname, skip_oneclass=skip_oneclass)
    # TODO: Make use of the t-annotations

    if vocab is None:
        vocab = Vocabulary(s0 + s1)

    si0 = vocab.vectorize(s0, spad=s0pad)
    si1 = vocab.vectorize(s1, spad=s1pad)
    f0, f1 = nlp.sentence_flags(s0, s1, s0pad, s1pad)
    gr = graph_input_anssel(si0, si1, y, f0, f1, s0, s1)

    if save_cache:
        with open(cache_filename, "wb") as f:
            pickle.dump((s0, s1, y, vocab, gr), f)
            print("save")

    return (s0, s1, y, vocab, gr)
Пример #3
0
def load_set(files, vocab=None, skip_unlabeled=True, spad=spad):
    def load_file(fname, skip_unlabeled=True):
        # XXX: ugly logic
        if 'sick2014' in fname:
            return loader.load_sick2014(fname)
        else:
            return loader.load_sts(fname, skip_unlabeled=skip_unlabeled)

    try:
        strtype = basestring
    except NameError:
        strtype = str
    if isinstance(files, strtype):
        s0, s1, y = load_file(files, skip_unlabeled=skip_unlabeled)
    else:
        s0, s1, y = loader.concat_datasets(
            [load_file(d, skip_unlabeled=skip_unlabeled) for d in files])

    if vocab is None:
        vocab = Vocabulary(s0 + s1)

    si0 = vocab.vectorize(s0, spad=spad)
    si1 = vocab.vectorize(s1, spad=spad)
    f0, f1 = nlp.sentence_flags(s0, s1, spad, spad)
    gr = graph_input_sts(si0, si1, y, f0, f1, s0, s1)

    return (s0, s1, y, vocab, gr)
Пример #4
0
def load_set(fname, vocab):
    s0, s1, y = loader.load_snli(fname, vocab)
    si0 = vocab.vectorize(s0, spad=spad)
    si1 = vocab.vectorize(s1, spad=spad)
    f0, f1 = nlp.sentence_flags(s0, s1, spad, spad)
    gr = graph_input_anssel(si0, si1, y, f0, f1, s0, s1)
    return (si0, si1, y, gr)
Пример #5
0
    def load_set(self, fname, cache_dir=None):
        # TODO: Make the cache-handling generic,
        # and offer a way to actually pass cache_dir
        save_cache = False
        if cache_dir:
            import os.path
            fname_abs = os.path.abspath(fname)
            from hashlib import md5
            cache_filename = "%s/%s.p" % (cache_dir, md5(fname_abs.encode("utf-8")).hexdigest())
            try:
                with open(cache_filename, "rb") as f:
                    return pickle.load(f)
            except (IOError, TypeError, KeyError):
                save_cache = True

        skip_oneclass = self.c.get('skip_oneclass', True)
        s0, s1, y, kw, akw, t = loader.load_anssel(fname, skip_oneclass=skip_oneclass)
        # TODO: Make use of the t-annotations

        if self.vocab is None:
            vocab = Vocabulary(s0 + s1)
        else:
            vocab = self.vocab

        si0 = vocab.vectorize(s0, spad=self.s0pad)
        si1 = vocab.vectorize(s1, spad=self.s1pad)
        f0, f1 = nlp.sentence_flags(s0, s1, self.s0pad, self.s1pad)
        gr = graph_input_anssel(si0, si1, y, f0, f1, s0, s1, kw=kw, akw=akw)

        if save_cache:
            with open(cache_filename, "wb") as f:
                pickle.dump((s0, s1, y, vocab, gr), f)
                print("save")

        return (gr, y, vocab)
Пример #6
0
def load_set(dsfile, vocab):
    s0i = []
    s1i = []
    f0 = []
    f1 = []
    labels = []

    i = 0
    with open(dsfile) as f:
        c = csv.reader(f, delimiter=',')
        for qtext, atext, label in c:
            if i % 10000 == 0:
                print('%d samples' % (i,))
            try:
                qtext = qtext.decode('utf8')
                atext = atext.decode('utf8')
            except AttributeError:  # python3 has no .decode()
                qtext = qtext
                atext = atext
            s0 = qtext.replace('</s>', '__EOS__').split(' ')
            s1 = atext.replace('</s>', '__EOS__').split(' ')
            si0 = vocab.vectorize([s0], spad=None)
            si1 = vocab.vectorize([s1], spad=None)
            f0_, f1_ = nlp.sentence_flags([s0], [s1], len(s0), len(s1))

            s0i.append(si0[0])
            s1i.append(si1[0])
            f0.append(f0_[0])
            f1.append(f1_[0])
            labels.append(int(label))
            i += 1
            if i > MAX_SAMPLES:
                break

    return (s0i, s1i, f0, f1, labels)
Пример #7
0
def load_set(dsfile, vocab):
    s0i = []
    s1i = []
    f0 = []
    f1 = []
    labels = []

    i = 0
    with open(dsfile) as f:
        c = csv.reader(f, delimiter=',')
        for qtext, atext, label in c:
            if i % 10000 == 0:
                print('%d samples' % (i, ))
            try:
                qtext = qtext.decode('utf8')
                atext = atext.decode('utf8')
            except AttributeError:  # python3 has no .decode()
                qtext = qtext
                atext = atext
            s0 = qtext.replace('</s>', '__EOS__').split(' ')
            s1 = atext.replace('</s>', '__EOS__').split(' ')
            si0 = vocab.vectorize([s0], spad=None)
            si1 = vocab.vectorize([s1], spad=None)
            f0_, f1_ = nlp.sentence_flags([s0], [s1], len(s0), len(s1))

            s0i.append(si0[0])
            s1i.append(si1[0])
            f0.append(f0_[0])
            f1.append(f1_[0])
            labels.append(int(label))
            i += 1
            if i > MAX_SAMPLES:
                break

    return (s0i, s1i, f0, f1, labels)
Пример #8
0
def load_set(fname, vocab=None):
    s0, s1, y, t = loader.load_anssel(fname)

    if vocab is None:
        vocab = Vocabulary(s0 + s1)

    si0 = vocab.vectorize(s0)
    si1 = vocab.vectorize(s1)
    f0, f1 = nlp.sentence_flags(s0, s1, s0pad, s1pad)
    gr = graph_input_anssel(si0, si1, y, f0, f1)

    return (s0, s1, y, vocab, gr)
Пример #9
0
def load_set(fname, vocab=None):
    s0, s1, y, _, _, _ = loader.load_anssel(fname)

    if vocab is None:
        vocab = Vocabulary(s0 + s1)

    si0 = vocab.vectorize(s0)
    si1 = vocab.vectorize(s1)
    f0, f1 = nlp.sentence_flags(s0, s1, s0pad, s1pad)
    gr = graph_input_anssel(si0, si1, y, f0, f1)

    return (s0, s1, y, vocab, gr)
Пример #10
0
def load_sent(q, a, vocab=None):
    s0, s1, y = [q], [a], 1
    # s0=questions, s1=answers

    if vocab is None:
        vocab = Vocabulary(s0 + s1)

    si0 = vocab.vectorize(s0, spad=s0pad)
    si1 = vocab.vectorize(s1, spad=s1pad)
    f0, f1 = nlp.sentence_flags(s0, s1, s0pad, s1pad)
    gr = graph_input_anssel(si0, si1, y, f0, f1)

    return gr
Пример #11
0
def load_set(fname, vocab=None):
    s0, s1, y = loader.load_hypev(fname)
    # s0=questions, s1=answers

    if vocab is None:
        vocab = Vocabulary(s0 + s1)

    si0 = vocab.vectorize(s0, spad=s0pad)
    si1 = vocab.vectorize(s1, spad=s1pad)
    f0, f1 = nlp.sentence_flags(s0, s1, s0pad, s1pad)
    gr = graph_input_anssel(si0, si1, y, f0, f1, s0, s1)

    return s0, s1, y, vocab, gr
Пример #12
0
def load_set(fname, vocab=None):
    s0, s1, y = loader.load_hypev(fname)
    # s0=questions, s1=answers

    if vocab is None:
        vocab = Vocabulary(s0 + s1)

    si0 = vocab.vectorize(s0, spad=s0pad)
    si1 = vocab.vectorize(s1, spad=s1pad)
    f0, f1 = nlp.sentence_flags(s0, s1, s0pad, s1pad)
    gr = graph_input_anssel(si0, si1, y, f0, f1, s0, s1)

    return s0, s1, y, vocab, gr
Пример #13
0
def load_set(fname, vocab=None, s0pad=s0pad, s1pad=s1pad):
    s0, s1, y, t = loader.load_anssel(fname)
    # TODO: Make use of the t-annotations

    if vocab is None:
        vocab = Vocabulary(s0 + s1)

    si0 = vocab.vectorize(s0, spad=s0pad)
    si1 = vocab.vectorize(s1, spad=s1pad)
    f0, f1 = nlp.sentence_flags(s0, s1, s0pad, s1pad)
    gr = graph_input_anssel(si0, si1, y, f0, f1, s0, s1)

    return (s0, s1, y, vocab, gr)
Пример #14
0
    def load_set(self, fname, cache_dir=None):
        # TODO: Make the cache-handling generic,
        # and offer a way to actually pass cache_dir
        save_cache = False
        if cache_dir:
            import os.path
            fname_abs = os.path.abspath(fname)
            from hashlib import md5
            cache_filename = "%s/%s.p" % (
                cache_dir, md5(fname_abs.encode("utf-8")).hexdigest())
            try:
                with open(cache_filename, "rb") as f:
                    return pickle.load(f)
            except (IOError, TypeError, KeyError):
                save_cache = True

        skip_oneclass = self.c.get('skip_oneclass', True)
        s0, s1, y, kw, akw, t = loader.load_anssel(fname,
                                                   skip_oneclass=skip_oneclass)
        # TODO: Make use of the t-annotations

        if self.vocab is None:
            vocab = Vocabulary(s0 + s1,
                               prune_N=self.c['embprune'],
                               icase=self.c['embicase'])
        else:
            vocab = self.vocab

        si0, sj0 = vocab.vectorize(s0, self.emb, spad=self.s0pad)
        si1, sj1 = vocab.vectorize(s1, self.emb, spad=self.s1pad)
        f0, f1 = nlp.sentence_flags(s0, s1, self.s0pad, self.s1pad)
        gr = graph_input_anssel(si0,
                                si1,
                                sj0,
                                sj1,
                                None,
                                None,
                                y,
                                f0,
                                f1,
                                s0,
                                s1,
                                kw=kw,
                                akw=akw)

        if save_cache:
            with open(cache_filename, "wb") as f:
                pickle.dump((s0, s1, y, vocab, gr), f)
                print("save")

        return (gr, y, vocab)
Пример #15
0
    def load_set(self, fname):
        s0, s1, y = loader.load_msrpara(fname)

        if self.vocab is None:
            vocab = Vocabulary(s0 + s1)
        else:
            vocab = self.vocab

        si0 = vocab.vectorize(s0, spad=self.s0pad)
        si1 = vocab.vectorize(s1, spad=self.s1pad)
        f0, f1 = nlp.sentence_flags(s0, s1, self.s0pad, self.s1pad)
        gr = graph_input_anssel(si0, si1, y, f0, f1, s0, s1)

        return (gr, y, vocab)
Пример #16
0
    def load_set(self, fname):
        s0, s1, y = loader.load_sick2014(fname, mode='entailment')

        if self.vocab is None:
            vocab = Vocabulary(s0 + s1, prune_N=self.c['embprune'], icase=self.c['embicase'])
        else:
            vocab = self.vocab

        si0, sj0 = vocab.vectorize(s0, self.emb, spad=self.s0pad)
        si1, sj1 = vocab.vectorize(s1, self.emb, spad=self.s1pad)
        f0, f1 = nlp.sentence_flags(s0, s1, self.s0pad, self.s1pad)
        gr = graph_input_anssel(si0, si1, sj0, sj1, None, None, y, f0, f1, s0, s1)

        return (gr, y, vocab)
Пример #17
0
    def load_set(self, fname, cache_dir=None):
        s0, s1, y = loader.load_hypev(fname)

        if self.vocab is None:
            vocab = Vocabulary(s0 + s1)
        else:
            vocab = self.vocab

        si0 = vocab.vectorize(s0, spad=self.s0pad)
        si1 = vocab.vectorize(s1, spad=self.s1pad)
        f0, f1 = nlp.sentence_flags(s0, s1, self.s0pad, self.s1pad)
        gr = graph_input_anssel(si0, si1, y, f0, f1, s0, s1)

        return (gr, y, vocab)
Пример #18
0
    def links_to_graph(self, links):
        s0 = []
        s1 = []
        labels = []
        for link in links:
            s0l, s1l, labelsl = self.link_to_s(link)
            s0 += s0l
            s1 += s1l
            labels += labelsl

        si0 = self.vocab.vectorize(s0, spad=self.s0pad)
        si1 = self.vocab.vectorize(s1, spad=self.s1pad)
        f0, f1 = nlp.sentence_flags(s0, s1, self.s0pad, self.s1pad)

        gr = graph_input_anssel(si0, si1, np.array(labels), f0, f1)
        return gr
Пример #19
0
    def links_to_graph(self, links):
        s0 = []
        s1 = []
        labels = []
        for link in links:
            s0l, s1l, labelsl = self.link_to_s(link)
            s0 += s0l
            s1 += s1l
            labels += labelsl

        si0 = self.vocab.vectorize(s0, spad=self.s0pad)
        si1 = self.vocab.vectorize(s1, spad=self.s1pad)
        f0, f1 = nlp.sentence_flags(s0, s1, self.s0pad, self.s1pad)

        gr = graph_input_anssel(si0, si1, np.array(labels), f0, f1)
        return gr
Пример #20
0
    def load_set(self, fname):
        s0, s1, y = loader.load_sick2014(fname, mode='entailment')

        if self.vocab is None:
            vocab = Vocabulary(s0 + s1,
                               prune_N=self.c['embprune'],
                               icase=self.c['embicase'])
        else:
            vocab = self.vocab

        si0, sj0 = vocab.vectorize(s0, self.emb, spad=self.s0pad)
        si1, sj1 = vocab.vectorize(s1, self.emb, spad=self.s1pad)
        f0, f1 = nlp.sentence_flags(s0, s1, self.s0pad, self.s1pad)
        gr = graph_input_anssel(si0, si1, sj0, sj1, None, None, y, f0, f1, s0,
                                s1)

        return (gr, y, vocab)
Пример #21
0
def load_set(fname, emb, vocab=None):
    s0, s1, y, _, _, _ = loader.load_anssel(fname)

    if vocab is None:
        vocab = Vocabulary(s0 + s1)

    si0, sj0 = vocab.vectorize(s0, emb)
    si1, sj1 = vocab.vectorize(s1, emb)
    se0 = emb.map_jset(sj0)
    se1 = emb.map_jset(sj1)
    f0, f1 = nlp.sentence_flags(s0, s1, s0pad, s1pad)
    gr = graph_input_anssel(si0, si1, sj0, sj1, se0, se1, y, f0, f1)

    # XXX: Pre-generating the whole (se0, se1) produces a *big* memory footprint
    # for the dataset.  In KeraSTS, we solve this by using fit_generator (also
    # because of epoch_fract) and embed just per-batch.

    return (s0, s1, y, vocab, gr)
Пример #22
0
def load_set(files, vocab=None, skip_unlabeled=True):
    def load_file(fname, skip_unlabeled=True):
        # XXX: ugly logic
        if 'sick2014' in fname:
            return loader.load_sick2014(fname)
        else:
            return loader.load_sts(fname, skip_unlabeled=skip_unlabeled)
    s0, s1, y = loader.concat_datasets([load_file(d, skip_unlabeled=skip_unlabeled) for d in files])

    if vocab is None:
        vocab = Vocabulary(s0 + s1)

    si0 = vocab.vectorize(s0, spad=spad)
    si1 = vocab.vectorize(s1, spad=spad)
    f0, f1 = nlp.sentence_flags(s0, s1, spad, spad)
    gr = graph_input_sts(si0, si1, y, f0, f1)

    return (s0, s1, y, vocab, gr)
Пример #23
0
def load_set(fname, emb, vocab=None):
    s0, s1, y, _, _, _ = loader.load_anssel(fname)

    if vocab is None:
        vocab = Vocabulary(s0 + s1)

    si0, sj0 = vocab.vectorize(s0, emb)
    si1, sj1 = vocab.vectorize(s1, emb)
    se0 = emb.map_jset(sj0)
    se1 = emb.map_jset(sj1)
    f0, f1 = nlp.sentence_flags(s0, s1, s0pad, s1pad)
    gr = graph_input_anssel(si0, si1, sj0, sj1, se0, se1, y, f0, f1)

    # XXX: Pre-generating the whole (se0, se1) produces a *big* memory footprint
    # for the dataset.  In KeraSTS, we solve this by using fit_generator (also
    # because of epoch_fract) and embed just per-batch.

    return (s0, s1, y, vocab, gr)
Пример #24
0
    def load_set(self, fname):
        def load_file(fname, skip_unlabeled=True):
            # XXX: ugly logic
            if 'sick2014' in fname:
                return loader.load_sick2014(fname)
            else:
                return loader.load_sts(fname, skip_unlabeled=skip_unlabeled)
        s0, s1, y = load_file(fname)

        if self.vocab is None:
            vocab = Vocabulary(s0 + s1)
        else:
            vocab = self.vocab

        si0 = vocab.vectorize(s0, spad=self.s0pad)
        si1 = vocab.vectorize(s1, spad=self.s1pad)
        f0, f1 = nlp.sentence_flags(s0, s1, self.s0pad, self.s1pad)
        gr = graph_input_sts(si0, si1, y, f0, f1, s0, s1)

        return (gr, y, vocab)
Пример #25
0
    def load_set(self, fname):
        def load_file(fname, skip_unlabeled=True):
            # XXX: ugly logic
            if 'sick2014' in fname:
                return loader.load_sick2014(fname)
            else:
                return loader.load_sts(fname, skip_unlabeled=skip_unlabeled)
        s0, s1, y = load_file(fname)

        if self.vocab is None:
            vocab = Vocabulary(s0 + s1, prune_N=self.c['embprune'], icase=self.c['embicase'])
        else:
            vocab = self.vocab

        si0, sj0 = vocab.vectorize(s0, self.emb, spad=self.s0pad)
        si1, sj1 = vocab.vectorize(s1, self.emb, spad=self.s1pad)
        f0, f1 = nlp.sentence_flags(s0, s1, self.s0pad, self.s1pad)
        gr = graph_input_sts(si0, si1, sj0, sj1, y, f0, f1, s0, s1)

        return (gr, y, vocab)
Пример #26
0
    def load_set(self, fname, lists=None):
        if lists:
            s0, s1, y = lists
        else:
            #            s0, s1, y = loader.load_msrpara(fname)   #set it free is we decide not to use quora dataset
            s0, s1, y = loader.load_quora(fname)

        if self.vocab is None:
            vocab = Vocabulary(s0 + s1,
                               prune_N=self.c['embprune'],
                               icase=self.c['embicase'])
        else:
            vocab = self.vocab

        si0, sj0 = vocab.vectorize(s0, self.emb, spad=self.s0pad)
        si1, sj1 = vocab.vectorize(s1, self.emb, spad=self.s1pad)
        f0, f1 = nlp.sentence_flags(s0, s1, self.s0pad, self.s1pad)
        gr = graph_nparray_anssel(
            graph_input_anssel(si0, si1, sj0, sj1, None, None, y, f0, f1))

        return (gr, y, vocab)
Пример #27
0
    def load_set(self, fname, cache_dir=None, lists=None):
        # TODO: Make the cache-handling generic,
        # and offer a way to actually pass cache_dir
        save_cache = False
        if cache_dir:
            import os.path
            fname_abs = os.path.abspath(fname)
            from hashlib import md5
            cache_filename = "%s/%s.p" % (cache_dir, md5(fname_abs.encode("utf-8")).hexdigest())
            try:
                with open(cache_filename, "rb") as f:
                    return pickle.load(f)
            except (IOError, TypeError, KeyError):
                save_cache = True

        if lists is not None:
            s0, s1, y, qids, xtra, types = lists
        else:
            xtra = None
            if '/mc' in fname:
                s0, s1, y, qids, types = loader.load_mctest(fname)
            else:
                s0, s1, y, qids = loader.load_hypev(fname)
                try:
                    dsfile = re.sub('\.([^.]*)$', '_aux.tsv', fname)  # train.tsv -> train_aux.tsv
                    with open(dsfile) as f:
                        rows = csv.DictReader(f, delimiter='\t')
                        xtra = loader.load_hypev_xtra(rows)
                        print(dsfile + ' loaded and available')
                except Exception as e:
                    if self.c['aux_r'] or self.c['aux_c']:
                        raise e
                types = None

        if self.vocab is None:
            vocab = Vocabulary(s0 + s1, prune_N=self.c['embprune'], icase=self.c['embicase'])
        else:
            vocab = self.vocab

        # mcqtypes pruning must happen *after* Vocabulary has been constructed!
        if types is not None:
            s0 = [x for x, t in zip(s0, types) if t in self.c['mcqtypes']]
            s1 = [x for x, t in zip(s1, types) if t in self.c['mcqtypes']]
            y = [x for x, t in zip(y, types) if t in self.c['mcqtypes']]
            qids = [x for x, t in zip(qids, types) if t in self.c['mcqtypes']]
            print('Retained %d questions, %d hypotheses (%s types)' % (len(set(qids)), len(set([' '.join(s) for s in s0])), self.c['mcqtypes']))

        si0, sj0 = vocab.vectorize(s0, self.emb, spad=self.s0pad)
        si1, sj1 = vocab.vectorize(s1, self.emb, spad=self.s1pad)
        f0, f1 = nlp.sentence_flags(s0, s1, self.s0pad, self.s1pad)
        gr = graph_input_anssel(si0, si1, sj0, sj1, None, None, y, f0, f1, s0, s1)
        if qids is not None:
            gr['qids'] = qids
        if xtra is not None:
            gr['#'] = xtra['#']
            gr['@'] = xtra['@']
        gr, y = self.merge_questions(gr)
        if save_cache:
            with open(cache_filename, "wb") as f:
                pickle.dump((s0, s1, y, vocab, gr), f)
                print("save")

        return (gr, y, vocab)
Пример #28
0
def load_set(fname,vocab):
    s0, s1, labels = loader.load_snli(fname, vocab)
    si0 = vocab.vectorize(s0, spad)
    si1 = vocab.vectorize(s1, spad)
    f0, f1 = nlp.sentence_flags(s0, s1, spad, spad)
    return (si0, si1, f0, f1, labels)
Пример #29
0
def load_set(fname,vocab,glove):
    s0, s1, labels = loader.load_snli(fname, vocab)
    si0,sj0 = vocab.vectorize(s0, glove, spad)
    si1,sj1 = vocab.vectorize(s1, glove, spad)
    f0_, f1_ = nlp.sentence_flags(s0, s1, spad, spad)
    return (si0, si1, sj0, sj1, f0_, f1_, labels)
Пример #30
0
    def load_set(self, fname, cache_dir=None, lists=None):
        # TODO: Make the cache-handling generic,
        # and offer a way to actually pass cache_dir
        save_cache = False
        if cache_dir:
            import os.path
            fname_abs = os.path.abspath(fname)
            from hashlib import md5
            cache_filename = "%s/%s.p" % (
                cache_dir, md5(fname_abs.encode("utf-8")).hexdigest())
            try:
                with open(cache_filename, "rb") as f:
                    return pickle.load(f)
            except (IOError, TypeError, KeyError):
                save_cache = True

        if lists is not None:
            s0, s1, y, qids, xtra, types = lists
        else:
            xtra = None
            if '/mc' in fname:
                s0, s1, y, qids, types = loader.load_mctest(fname)
            else:
                s0, s1, y, qids = loader.load_hypev(fname)
                try:
                    dsfile = re.sub('\.([^.]*)$', '_aux.tsv',
                                    fname)  # train.tsv -> train_aux.tsv
                    with open(dsfile) as f:
                        rows = csv.DictReader(f, delimiter='\t')
                        xtra = loader.load_hypev_xtra(rows)
                        print(dsfile + ' loaded and available')
                except Exception as e:
                    if self.c['aux_r'] or self.c['aux_c']:
                        raise e
                types = None

        if self.vocab is None:
            vocab = Vocabulary(s0 + s1,
                               prune_N=self.c['embprune'],
                               icase=self.c['embicase'])
        else:
            vocab = self.vocab

        # mcqtypes pruning must happen *after* Vocabulary has been constructed!
        if types is not None:
            s0 = [x for x, t in zip(s0, types) if t in self.c['mcqtypes']]
            s1 = [x for x, t in zip(s1, types) if t in self.c['mcqtypes']]
            y = [x for x, t in zip(y, types) if t in self.c['mcqtypes']]
            qids = [x for x, t in zip(qids, types) if t in self.c['mcqtypes']]
            print(
                'Retained %d questions, %d hypotheses (%s types)' %
                (len(set(qids)), len(set([' '.join(s)
                                          for s in s0])), self.c['mcqtypes']))

        si0, sj0 = vocab.vectorize(s0, self.emb, spad=self.s0pad)
        si1, sj1 = vocab.vectorize(s1, self.emb, spad=self.s1pad)
        f0, f1 = nlp.sentence_flags(s0, s1, self.s0pad, self.s1pad)
        gr = graph_input_anssel(si0, si1, sj0, sj1, None, None, y, f0, f1, s0,
                                s1)
        if qids is not None:
            gr['qids'] = qids
        if xtra is not None:
            gr['#'] = xtra['#']
            gr['@'] = xtra['@']
        gr, y = self.merge_questions(gr)
        if save_cache:
            with open(cache_filename, "wb") as f:
                pickle.dump((s0, s1, y, vocab, gr), f)
                print("save")

        return (gr, y, vocab)
Пример #31
0
def load_set(fname, vocab, glove):
    s0, s1, labels = loader.load_snli(fname, vocab)
    si0, sj0 = vocab.vectorize(s0, glove, spad)
    si1, sj1 = vocab.vectorize(s1, glove, spad)
    f0_, f1_ = nlp.sentence_flags(s0, s1, spad, spad)
    return (si0, si1, sj0, sj1, f0_, f1_, labels)