示例#1
0
def load_set(fname, emb, cache_dir=None):
    """Load an answer-selection dataset and embed it, with optional caching.

    When ``cache_dir`` is truthy, the result is looked up in the pickle file
    ``<cache_dir>/<md5 of the absolute path of fname>.p``.  If that file is
    missing or cannot be unpickled, the dataset is computed and the cache
    file is (re)written.  The cache key depends on ``fname`` only, not on
    ``emb``.

    Args:
        fname: Path of the dataset file handed to ``loader.load_anssel``.
        emb: Embedding object handed to ``loader.load_embedded``.
        cache_dir: Directory holding cache files; falsy disables caching.

    Returns:
        The tuple ``(e0, e1, y)`` from ``loader.load_embedded``, or the
        object stored in the cache file on a cache hit.
    """
    save_cache = False
    if cache_dir:
        from hashlib import md5
        fname_abs = os.path.abspath(fname)
        cache_filename = "%s/%s.p" % (cache_dir, md5(
            fname_abs.encode("utf-8")).hexdigest())

        # NOTE(security): pickle executes arbitrary code on load; cache_dir
        # must only ever contain files written by this function.
        try:
            with open(cache_filename, "rb") as f:
                return pickle.load(f)
        except (IOError, TypeError, KeyError, EOFError,
                pickle.UnpicklingError):
            # Missing, truncated or corrupt cache file: treat it as a miss
            # and overwrite it below instead of failing on every run.
            save_cache = True

    s0, s1, y, _, _, _ = loader.load_anssel(fname)
    e0, e1, s0, s1, y = loader.load_embedded(emb,
                                             s0,
                                             s1,
                                             y,
                                             balance=True,
                                             ndim=1)

    if save_cache:
        with open(cache_filename, "wb") as f:
            pickle.dump((e0, e1, y), f)
    return (e0, e1, y)
示例#2
0
    def load_set(self, fname, cache_dir=None):
        """Load an answer-selection dataset and build its graph input.

        When ``cache_dir`` is truthy, the result is looked up in the pickle
        file ``<cache_dir>/<md5 of the absolute path of fname>.p``.  If the
        file is missing or cannot be unpickled, the dataset is computed and
        the cache file is (re)written.

        Reads ``self.c`` (for ``skip_oneclass``), ``self.vocab``,
        ``self.s0pad`` and ``self.s1pad``.

        Args:
            fname: Path of the dataset file handed to ``loader.load_anssel``.
            cache_dir: Directory holding cache files; falsy disables caching.

        Returns:
            The tuple ``(gr, y, vocab)``, both on a cache hit and on a miss.
        """
        # TODO: Make the cache-handling generic,
        # and offer a way to actually pass cache_dir
        save_cache = False
        if cache_dir:
            import os.path
            from hashlib import md5
            fname_abs = os.path.abspath(fname)
            cache_filename = "%s/%s.p" % (cache_dir, md5(fname_abs.encode("utf-8")).hexdigest())
            # NOTE(security): pickle executes arbitrary code on load;
            # cache_dir must only contain files written by this method.
            try:
                with open(cache_filename, "rb") as f:
                    cached = pickle.load(f)
            except (IOError, TypeError, KeyError, EOFError,
                    pickle.UnpicklingError):
                # Missing, truncated or corrupt cache: recompute and rewrite.
                save_cache = True
            else:
                if isinstance(cached, tuple) and len(cached) == 5:
                    # Cache files written by the previous version of this
                    # method hold (s0, s1, y, vocab, gr); convert them to
                    # the shape this method actually returns.
                    _, _, y, vocab, gr = cached
                    return (gr, y, vocab)
                return cached

        skip_oneclass = self.c.get('skip_oneclass', True)
        s0, s1, y, kw, akw, t = loader.load_anssel(fname, skip_oneclass=skip_oneclass)
        # TODO: Make use of the t-annotations

        # Build a vocabulary from this dataset only when none was supplied.
        if self.vocab is None:
            vocab = Vocabulary(s0 + s1)
        else:
            vocab = self.vocab

        si0 = vocab.vectorize(s0, spad=self.s0pad)
        si1 = vocab.vectorize(s1, spad=self.s1pad)
        f0, f1 = nlp.sentence_flags(s0, s1, self.s0pad, self.s1pad)
        gr = graph_input_anssel(si0, si1, y, f0, f1, s0, s1, kw=kw, akw=akw)

        if save_cache:
            with open(cache_filename, "wb") as f:
                # Store exactly what is returned so that a cache hit and a
                # cache miss hand the caller the same tuple shape.
                pickle.dump((gr, y, vocab), f)
                print("save")

        return (gr, y, vocab)
示例#3
0
def load_set(fname, vocab=None, s0pad=s0pad, s1pad=s1pad, cache_dir=None, skip_oneclass=True):
    """Load an answer-selection dataset, vectorize it and build graph input.

    Caching: if ``cache_dir`` is set, the finished dataset is first looked up
    in ``<cache_dir>/<md5 of the absolute path of fname>.p``.  If that file
    is missing or cannot be unpickled, the dataset is computed and saved
    there.  The cache key depends on ``fname`` only; ``vocab``, the padding
    lengths and ``skip_oneclass`` are not part of it.

    Args:
        fname: Path of the dataset file handed to ``loader.load_anssel``.
        vocab: Existing vocabulary to reuse; when ``None`` a new
            ``Vocabulary`` is built from ``s0 + s1``.
        s0pad: ``spad`` value used when vectorizing ``s0`` and passed to
            ``nlp.sentence_flags``.
        s1pad: Same as ``s0pad``, for ``s1``.
        cache_dir: Directory holding cache files; falsy disables caching.
        skip_oneclass: Forwarded to ``loader.load_anssel``.

    Returns:
        The tuple ``(s0, s1, y, vocab, gr)``, or the object stored in the
        cache file on a cache hit.
    """
    save_cache = False
    if cache_dir:
        from hashlib import md5
        fname_abs = os.path.abspath(fname)
        cache_filename = "%s/%s.p" % (cache_dir, md5(fname_abs.encode("utf-8")).hexdigest())

        # NOTE(security): pickle executes arbitrary code on load; cache_dir
        # must only ever contain files written by this function.
        try:
            with open(cache_filename, "rb") as f:
                return pickle.load(f)
        except (IOError, TypeError, KeyError, EOFError,
                pickle.UnpicklingError):
            # Missing, truncated or corrupt cache file: treat it as a miss
            # and overwrite it below instead of failing on every run.
            save_cache = True

    s0, s1, y, t = loader.load_anssel(fname, skip_oneclass=skip_oneclass)
    # TODO: Make use of the t-annotations

    if vocab is None:
        vocab = Vocabulary(s0 + s1)

    si0 = vocab.vectorize(s0, spad=s0pad)
    si1 = vocab.vectorize(s1, spad=s1pad)
    f0, f1 = nlp.sentence_flags(s0, s1, s0pad, s1pad)
    gr = graph_input_anssel(si0, si1, y, f0, f1, s0, s1)

    if save_cache:
        with open(cache_filename, "wb") as f:
            pickle.dump((s0, s1, y, vocab, gr), f)
            print("save")

    return (s0, s1, y, vocab, gr)
示例#4
0
def load_set(glove, fname, balance=False, subsample0=3):
    """Load an answer-selection dataset and return it in embedded form.

    The file is read with ``loader.load_anssel`` (forwarding ``subsample0``),
    the number of loaded items is printed, and the sentences are embedded
    with ``loader.load_embedded`` (forwarding ``balance``).

    Returns:
        A pair ``([e0, e1], labels)`` taken from ``loader.load_embedded``.
    """
    loaded = loader.load_anssel(fname, subsample0=subsample0)
    sents0, sents1, labels, toklabels = loaded
    print('(%s) Loaded dataset: %d' % (fname, len(sents0)))

    embedded = loader.load_embedded(
        glove, sents0, sents1, labels, balance=balance)
    # load_embedded also hands back the sentence lists; only the embeddings
    # and the labels are part of the result.
    emb0, emb1, sents0, sents1, labels = embedded
    return ([emb0, emb1], labels)
示例#5
0
def load_set(fname, vocab=None):
    """Load an answer-selection dataset and build its graph input.

    Args:
        fname: Path of the dataset file handed to ``loader.load_anssel``.
        vocab: Vocabulary to reuse; when ``None`` a new ``Vocabulary`` is
            built from the concatenation of both sentence lists.

    Returns:
        The tuple ``(s0, s1, y, vocab, gr)``.
    """
    # Only the first three of the six values returned by the loader are used.
    sents0, sents1, labels, _, _, _ = loader.load_anssel(fname)

    vocab = Vocabulary(sents0 + sents1) if vocab is None else vocab

    idx0 = vocab.vectorize(sents0)
    idx1 = vocab.vectorize(sents1)
    # s0pad / s1pad are module-level padding settings.
    flags0, flags1 = nlp.sentence_flags(sents0, sents1, s0pad, s1pad)
    graph = graph_input_anssel(idx0, idx1, labels, flags0, flags1)

    return (sents0, sents1, labels, vocab, graph)
示例#6
0
def load_set(fname, vocab=None):
    """Read an answer-selection file and turn it into graph input.

    Args:
        fname: Path of the dataset file handed to ``loader.load_anssel``.
        vocab: Vocabulary to reuse; a new ``Vocabulary`` over both sentence
            lists is created when this is ``None``.

    Returns:
        The tuple ``(s0, s1, y, vocab, gr)``.
    """
    # The fourth value returned by the loader is not used here.
    left, right, targets, _ = loader.load_anssel(fname)

    if vocab is None:
        vocab = Vocabulary(left + right)

    left_idx = vocab.vectorize(left)
    right_idx = vocab.vectorize(right)
    # s0pad / s1pad are module-level padding settings.
    left_flags, right_flags = nlp.sentence_flags(left, right, s0pad, s1pad)

    graph = graph_input_anssel(
        left_idx, right_idx, targets, left_flags, right_flags)
    return (left, right, targets, vocab, graph)
示例#7
0
def load_set(fname, vocab=None):
    """Load an answer-selection dataset without skipping one-class items.

    ``loader.load_anssel`` is called with ``skip_oneclass=False``.

    Args:
        fname: Path of the dataset file.
        vocab: Vocabulary to reuse; when ``None`` a new ``Vocabulary`` is
            built from questions plus answers.

    Returns:
        The tuple ``(s0, s1, y, vocab, gr)`` where ``s0`` are the questions
        and ``s1`` the answers.
    """
    # The fourth value returned by the loader is not used here.
    questions, answers, labels, _ = loader.load_anssel(
        fname, skip_oneclass=False)

    vocab = Vocabulary(questions + answers) if vocab is None else vocab

    # s0pad / s1pad are module-level padding settings.
    q_idx = vocab.vectorize(questions, spad=s0pad)
    a_idx = vocab.vectorize(answers, spad=s1pad)
    q_flags, a_flags = nlp.sentence_flags(questions, answers, s0pad, s1pad)
    graph = graph_input_anssel(q_idx, a_idx, labels, q_flags, a_flags)

    return questions, answers, labels, vocab, graph
示例#8
0
def load_set(fname, vocab=None, s0pad=s0pad, s1pad=s1pad):
    """Load an answer-selection dataset and build its graph input.

    Args:
        fname: Path of the dataset file handed to ``loader.load_anssel``.
        vocab: Vocabulary to reuse; when ``None`` a new ``Vocabulary`` is
            built from the concatenation of both sentence lists.
        s0pad: ``spad`` value used when vectorizing ``s0`` and passed to
            ``nlp.sentence_flags``.
        s1pad: Same as ``s0pad``, for ``s1``.

    Returns:
        The tuple ``(s0, s1, y, vocab, gr)``.
    """
    # TODO: Make use of the t-annotations (fourth value, currently dropped)
    sents0, sents1, labels, _ = loader.load_anssel(fname)

    vocab = Vocabulary(sents0 + sents1) if vocab is None else vocab

    idx0 = vocab.vectorize(sents0, spad=s0pad)
    idx1 = vocab.vectorize(sents1, spad=s1pad)
    flags0, flags1 = nlp.sentence_flags(sents0, sents1, s0pad, s1pad)

    # The raw sentence lists are passed along with the vectorized form.
    graph = graph_input_anssel(
        idx0, idx1, labels, flags0, flags1, sents0, sents1)
    return (sents0, sents1, labels, vocab, graph)
示例#9
0
    def load_set(self, fname, cache_dir=None):
        """Load an answer-selection dataset and build its graph input.

        When ``cache_dir`` is truthy, the result is looked up in the pickle
        file ``<cache_dir>/<md5 of the absolute path of fname>.p``.  If the
        file is missing or cannot be unpickled, the dataset is computed and
        the cache file is (re)written.

        Reads ``self.c`` (``skip_oneclass``, ``embprune``, ``embicase``),
        ``self.vocab``, ``self.emb``, ``self.s0pad`` and ``self.s1pad``.

        Args:
            fname: Path of the dataset file handed to ``loader.load_anssel``.
            cache_dir: Directory holding cache files; falsy disables caching.

        Returns:
            The tuple ``(gr, y, vocab)``, both on a cache hit and on a miss.
        """
        # TODO: Make the cache-handling generic,
        # and offer a way to actually pass cache_dir
        save_cache = False
        if cache_dir:
            import os.path
            from hashlib import md5
            fname_abs = os.path.abspath(fname)
            cache_filename = "%s/%s.p" % (
                cache_dir, md5(fname_abs.encode("utf-8")).hexdigest())
            # NOTE(security): pickle executes arbitrary code on load;
            # cache_dir must only contain files written by this method.
            try:
                with open(cache_filename, "rb") as f:
                    cached = pickle.load(f)
            except (IOError, TypeError, KeyError, EOFError,
                    pickle.UnpicklingError):
                # Missing, truncated or corrupt cache: recompute and rewrite.
                save_cache = True
            else:
                if isinstance(cached, tuple) and len(cached) == 5:
                    # Cache files written by the previous version of this
                    # method hold (s0, s1, y, vocab, gr); convert them to
                    # the shape this method actually returns.
                    _, _, y, vocab, gr = cached
                    return (gr, y, vocab)
                return cached

        skip_oneclass = self.c.get('skip_oneclass', True)
        s0, s1, y, kw, akw, t = loader.load_anssel(fname,
                                                   skip_oneclass=skip_oneclass)
        # TODO: Make use of the t-annotations

        # Build a vocabulary from this dataset only when none was supplied.
        if self.vocab is None:
            vocab = Vocabulary(s0 + s1,
                               prune_N=self.c['embprune'],
                               icase=self.c['embicase'])
        else:
            vocab = self.vocab

        # vectorize() yields two index sequences per sentence list here.
        si0, sj0 = vocab.vectorize(s0, self.emb, spad=self.s0pad)
        si1, sj1 = vocab.vectorize(s1, self.emb, spad=self.s1pad)
        f0, f1 = nlp.sentence_flags(s0, s1, self.s0pad, self.s1pad)
        gr = graph_input_anssel(si0,
                                si1,
                                sj0,
                                sj1,
                                None,
                                None,
                                y,
                                f0,
                                f1,
                                s0,
                                s1,
                                kw=kw,
                                akw=akw)

        if save_cache:
            with open(cache_filename, "wb") as f:
                # Store exactly what is returned so that a cache hit and a
                # cache miss hand the caller the same tuple shape.
                pickle.dump((gr, y, vocab), f)
                print("save")

        return (gr, y, vocab)
示例#10
0
def load_set(fname, emb, vocab=None):
    """Load an answer-selection dataset with pre-computed embeddings.

    Args:
        fname: Path of the dataset file handed to ``loader.load_anssel``.
        emb: Embedding object; passed to ``vocab.vectorize`` and used via
            ``emb.map_jset`` to embed the whole dataset up front.
        vocab: Vocabulary to reuse; when ``None`` a new ``Vocabulary`` is
            built from the concatenation of both sentence lists.

    Returns:
        The tuple ``(s0, s1, y, vocab, gr)``.
    """
    # Only the first three of the six values returned by the loader are used.
    sents0, sents1, labels, _, _, _ = loader.load_anssel(fname)

    vocab = Vocabulary(sents0 + sents1) if vocab is None else vocab

    idx0, jdx0 = vocab.vectorize(sents0, emb)
    idx1, jdx1 = vocab.vectorize(sents1, emb)

    # XXX: Embedding the entire dataset here (emb0, emb1) gives it a *big*
    # memory footprint.  In KeraSTS, we solve this by using fit_generator
    # (also because of epoch_fract) and embed just per-batch.
    emb0 = emb.map_jset(jdx0)
    emb1 = emb.map_jset(jdx1)

    # s0pad / s1pad are module-level padding settings.
    flags0, flags1 = nlp.sentence_flags(sents0, sents1, s0pad, s1pad)
    graph = graph_input_anssel(
        idx0, idx1, jdx0, jdx1, emb0, emb1, labels, flags0, flags1)

    return (sents0, sents1, labels, vocab, graph)
示例#11
0
def load_set(fname, emb, vocab=None):
    """Read an answer-selection file and build fully embedded graph input.

    Args:
        fname: Path of the dataset file handed to ``loader.load_anssel``.
        emb: Embedding object; passed to ``vocab.vectorize`` and used via
            ``emb.map_jset`` on the second vectorize output.
        vocab: Vocabulary to reuse; a new ``Vocabulary`` over both sentence
            lists is created when this is ``None``.

    Returns:
        The tuple ``(s0, s1, y, vocab, gr)``.
    """
    # Only the first three of the six values returned by the loader are used.
    left, right, targets, _, _, _ = loader.load_anssel(fname)

    if vocab is None:
        vocab = Vocabulary(left + right)

    left_i, left_j = vocab.vectorize(left, emb)
    right_i, right_j = vocab.vectorize(right, emb)

    # XXX: Pre-generating the whole embedded set (left_e, right_e) produces a
    # *big* memory footprint for the dataset.  In KeraSTS, we solve this by
    # using fit_generator (also because of epoch_fract) and embed just
    # per-batch.
    left_e = emb.map_jset(left_j)
    right_e = emb.map_jset(right_j)

    # s0pad / s1pad are module-level padding settings.
    left_f, right_f = nlp.sentence_flags(left, right, s0pad, s1pad)

    graph = graph_input_anssel(left_i, right_i,
                               left_j, right_j,
                               left_e, right_e,
                               targets,
                               left_f, right_f)
    return (left, right, targets, vocab, graph)
示例#12
0
def load_set(fname, emb, cache_dir=None):
    """Load an answer-selection dataset and embed it, with optional caching.

    When ``cache_dir`` is truthy, the result is looked up in the pickle file
    ``<cache_dir>/<md5 of the absolute path of fname>.p``.  If that file is
    missing or cannot be unpickled, the dataset is computed and the cache
    file is (re)written.  The cache key depends on ``fname`` only, not on
    ``emb``.

    Args:
        fname: Path of the dataset file handed to ``loader.load_anssel``.
        emb: Embedding object handed to ``loader.load_embedded``.
        cache_dir: Directory holding cache files; falsy disables caching.

    Returns:
        The tuple ``(e0, e1, y)`` from ``loader.load_embedded``, or the
        object stored in the cache file on a cache hit.
    """
    save_cache = False
    if cache_dir:
        from hashlib import md5
        fname_abs = os.path.abspath(fname)
        cache_filename = "%s/%s.p" % (cache_dir, md5(fname_abs.encode("utf-8")).hexdigest())

        # NOTE(security): pickle executes arbitrary code on load; cache_dir
        # must only ever contain files written by this function.
        try:
            with open(cache_filename, "rb") as f:
                return pickle.load(f)
        except (IOError, TypeError, KeyError, EOFError,
                pickle.UnpicklingError):
            # Missing, truncated or corrupt cache file: treat it as a miss
            # and overwrite it below instead of failing on every run.
            save_cache = True

    # The fourth value returned by the loader is not used here.
    s0, s1, y, t = loader.load_anssel(fname)
    e0, e1, s0, s1, y = loader.load_embedded(emb, s0, s1, y, balance=True, ndim=1)

    if save_cache:
        with open(cache_filename, "wb") as f:
            pickle.dump((e0, e1, y), f)
    return (e0, e1, y)
示例#13
0
def load_set(glove, fname, balance=False, subsample0=3):
    """Load an answer-selection dataset and return it in embedded form.

    The file is read with ``loader.load_anssel`` (forwarding ``subsample0``),
    the number of loaded items is printed, and the sentences are embedded
    with ``loader.load_embedded`` (forwarding ``balance``).

    Returns:
        A pair ``([e0, e1], labels)`` taken from ``loader.load_embedded``.
    """
    sents0, sents1, labels = loader.load_anssel(
        fname, subsample0=subsample0)
    print('(%s) Loaded dataset: %d' % (fname, len(sents0)))

    embedded = loader.load_embedded(
        glove, sents0, sents1, labels, balance=balance)
    # load_embedded also hands back the sentence lists; only the embeddings
    # and the labels are part of the result.
    emb0, emb1, sents0, sents1, labels = embedded
    return ([emb0, emb1], labels)