Exemplo n.º 1
0
    def load_set(self, fname, cache_dir=None):
        """Load the dataset in ``fname`` and build its model input graph.

        Sentences and labels come from ``loader.load_hypev``.  They are
        vectorized with ``self.vocab`` (or with a fresh ``Vocabulary`` built
        from this dataset when ``self.vocab`` is None), turned into a graph
        by ``graph_input_anssel`` and post-processed by
        ``self.merge_questions``.

        Args:
            fname: Path of the dataset file.
            cache_dir: Optional directory of pickled results.  The cache
                file is named after the MD5 hex digest of the absolute path
                of ``fname``.  A missing or unreadable cache entry is
                recomputed and rewritten.

        Returns:
            A ``(gr, y, vocab)`` tuple, both when freshly computed and when
            served from the cache.
        """
        # TODO: Make the cache-handling generic,
        # and offer a way to actually pass cache_dir
        save_cache = False
        if cache_dir:
            import os.path
            from hashlib import md5
            fname_abs = os.path.abspath(fname)
            cache_filename = "%s/%s.p" % (
                cache_dir, md5(fname_abs.encode("utf-8")).hexdigest())
            try:
                # NOTE: unpickling can execute arbitrary code; cache_dir must
                # only hold files written by this code.
                with open(cache_filename, "rb") as f:
                    cached = pickle.load(f)
                # The cache stores (s0, s1, y, vocab, gr); reorder it below
                # into the shape returned by the non-cached path.
                _, _, y, vocab, gr = cached
            except (IOError, TypeError, KeyError, ValueError, EOFError,
                    pickle.UnpicklingError):
                # Missing, truncated or malformed cache entry: recompute.
                save_cache = True
            else:
                return (gr, y, vocab)

        s0, s1, y = loader.load_hypev(fname)

        if self.vocab is None:
            vocab = Vocabulary(s0 + s1)  # FIXME: lower?
        else:
            vocab = self.vocab

        si0 = vocab.vectorize(s0, spad=self.s0pad)
        si1 = vocab.vectorize(s1, spad=self.s1pad)
        f0, f1 = nlp.sentence_flags(s0, s1, self.s0pad, self.s1pad)
        gr = graph_input_anssel(si0, si1, y, f0, f1, s0, s1)
        gr, y = self.merge_questions(gr)
        if save_cache:
            # On-disk layout is kept as (s0, s1, y, vocab, gr) so existing
            # cache files stay readable.
            with open(cache_filename, "wb") as f:
                pickle.dump((s0, s1, y, vocab, gr), f)
                print("save")

        return (gr, y, vocab)
Exemplo n.º 2
0
def load_set(files, vocab=None, skip_unlabeled=True, spad=spad):
    """Load one dataset file, or several concatenated, and vectorize it.

    Args:
        files: A single file name (string) or an iterable of file names;
            several files are merged with ``loader.concat_datasets``.
        vocab: Vocabulary used for vectorization; when None, a new
            ``Vocabulary`` is built from the loaded sentences.
        skip_unlabeled: Forwarded to ``loader.load_sts``.
        spad: Padding setting handed to ``vectorize`` and
            ``nlp.sentence_flags`` for both sentence sides.

    Returns:
        The tuple ``(s0, s1, y, vocab, gr)``.
    """
    def read_one(path):
        # XXX: ugly logic - the loader is chosen from the file name
        if 'sick2014' not in path:
            return loader.load_sts(path, skip_unlabeled=skip_unlabeled)
        return loader.load_sick2014(path)

    # ``basestring`` only exists on Python 2; fall back to ``str``.
    try:
        string_types = basestring
    except NameError:
        string_types = str

    if not isinstance(files, string_types):
        parts = [read_one(path) for path in files]
        s0, s1, y = loader.concat_datasets(parts)
    else:
        s0, s1, y = read_one(files)

    vocab = Vocabulary(s0 + s1) if vocab is None else vocab

    si0 = vocab.vectorize(s0, spad=spad)
    si1 = vocab.vectorize(s1, spad=spad)
    flags0, flags1 = nlp.sentence_flags(s0, s1, spad, spad)
    gr = graph_input_sts(si0, si1, y, flags0, flags1, s0, s1)

    return s0, s1, y, vocab, gr
Exemplo n.º 3
0
def load_set(fname, vocab=None, s0pad=s0pad, s1pad=s1pad, cache_dir=None, skip_oneclass=True):
    """Load the dataset in ``fname`` via ``loader.load_anssel`` and vectorize it.

    Caching: when ``cache_dir`` is set, the finished dataset is first looked
    up there (the cache file name is the MD5 hash of the absolute ``fname``).
    If that fails, the dataset is computed and then written to the cache.

    Returns:
        The tuple ``(s0, s1, y, vocab, gr)``; a cache hit returns whatever
        object was unpickled.
    """
    write_cache = False
    if cache_dir:
        from hashlib import md5
        digest = md5(os.path.abspath(fname).encode("utf-8")).hexdigest()
        cache_filename = "%s/%s.p" % (cache_dir, digest)

        try:
            with open(cache_filename, "rb") as cache_file:
                return pickle.load(cache_file)
        except (IOError, TypeError, KeyError):
            # No usable cache entry: compute below, then store the result.
            write_cache = True

    # TODO: Make use of the t-annotations
    s0, s1, y, t = loader.load_anssel(fname, skip_oneclass=skip_oneclass)

    vocab = Vocabulary(s0 + s1) if vocab is None else vocab

    si0 = vocab.vectorize(s0, spad=s0pad)
    si1 = vocab.vectorize(s1, spad=s1pad)
    flags0, flags1 = nlp.sentence_flags(s0, s1, s0pad, s1pad)
    gr = graph_input_anssel(si0, si1, y, flags0, flags1, s0, s1)

    result = (s0, s1, y, vocab, gr)
    if write_cache:
        with open(cache_filename, "wb") as cache_file:
            pickle.dump(result, cache_file)
            print("save")

    return result
Exemplo n.º 4
0
def load_set(fname, vocab=None):
    """Load the dataset in ``fname`` via ``loader.load_anssel`` and vectorize it.

    Only the first three of the six values returned by the loader are used.
    When ``vocab`` is None a new ``Vocabulary`` is built from the loaded
    sentences.

    Returns:
        The tuple ``(s0, s1, y, vocab, gr)``.
    """
    s0, s1, y, _, _, _ = loader.load_anssel(fname)

    vocab = Vocabulary(s0 + s1) if vocab is None else vocab

    si0 = vocab.vectorize(s0)
    si1 = vocab.vectorize(s1)
    # s0pad / s1pad are read from the enclosing module.
    flags0, flags1 = nlp.sentence_flags(s0, s1, s0pad, s1pad)
    gr = graph_input_anssel(si0, si1, y, flags0, flags1)

    return s0, s1, y, vocab, gr
Exemplo n.º 5
0
def load_set(fname, vocab=None):
    """Load the dataset in ``fname`` via ``loader.load_hypev`` and vectorize it.

    When ``vocab`` is None a new ``Vocabulary`` is built from the loaded
    sentences.  Padding uses the module-level ``s0pad`` / ``s1pad``.

    Returns:
        The tuple ``(s0, s1, y, vocab, gr)``.
    """
    # s0=questions, s1=answers
    s0, s1, y = loader.load_hypev(fname)

    vocab = Vocabulary(s0 + s1) if vocab is None else vocab

    si0 = vocab.vectorize(s0, spad=s0pad)
    si1 = vocab.vectorize(s1, spad=s1pad)
    flags0, flags1 = nlp.sentence_flags(s0, s1, s0pad, s1pad)
    gr = graph_input_anssel(si0, si1, y, flags0, flags1, s0, s1)

    return (s0, s1, y, vocab, gr)
Exemplo n.º 6
0
def load_sent(q, a, vocab=None):
    """Build the model input graph for a single (q, a) sentence pair.

    ``q`` and ``a`` are each wrapped in a one-element list and the label is
    the constant 1.  When ``vocab`` is None a new ``Vocabulary`` is built
    from just these two sentences.

    Returns:
        The graph produced by ``graph_input_anssel``.
    """
    # s0=questions, s1=answers
    s0 = [q]
    s1 = [a]
    y = 1

    vocab = Vocabulary(s0 + s1) if vocab is None else vocab

    si0 = vocab.vectorize(s0, spad=s0pad)
    si1 = vocab.vectorize(s1, spad=s1pad)
    flags0, flags1 = nlp.sentence_flags(s0, s1, s0pad, s1pad)
    return graph_input_anssel(si0, si1, y, flags0, flags1)
Exemplo n.º 7
0
    def load_set(self, fname, cache_dir=None):
        """Load the dataset in ``fname`` and build its model input graph.

        Data comes from ``loader.load_anssel`` (``skip_oneclass`` is read
        from ``self.c``, defaulting to True).  Sentences are vectorized with
        ``self.vocab``, or with a fresh ``Vocabulary`` configured from
        ``self.c['embprune']`` / ``self.c['embicase']`` when ``self.vocab``
        is None.  The ``kw`` / ``akw`` values returned by the loader are
        forwarded to ``graph_input_anssel``.

        Args:
            fname: Path of the dataset file.
            cache_dir: Optional directory of pickled results.  The cache
                file is named after the MD5 hex digest of the absolute path
                of ``fname``.  A missing or unreadable cache entry is
                recomputed and rewritten.

        Returns:
            A ``(gr, y, vocab)`` tuple, both when freshly computed and when
            served from the cache.
        """
        # TODO: Make the cache-handling generic,
        # and offer a way to actually pass cache_dir
        save_cache = False
        if cache_dir:
            import os.path
            from hashlib import md5
            fname_abs = os.path.abspath(fname)
            cache_filename = "%s/%s.p" % (
                cache_dir, md5(fname_abs.encode("utf-8")).hexdigest())
            try:
                # NOTE: unpickling can execute arbitrary code; cache_dir must
                # only hold files written by this code.
                with open(cache_filename, "rb") as f:
                    cached = pickle.load(f)
                # The cache stores (s0, s1, y, vocab, gr); reorder it below
                # into the shape returned by the non-cached path.
                _, _, y, vocab, gr = cached
            except (IOError, TypeError, KeyError, ValueError, EOFError,
                    pickle.UnpicklingError):
                # Missing, truncated or malformed cache entry: recompute.
                save_cache = True
            else:
                return (gr, y, vocab)

        skip_oneclass = self.c.get('skip_oneclass', True)
        s0, s1, y, kw, akw, t = loader.load_anssel(fname,
                                                   skip_oneclass=skip_oneclass)
        # TODO: Make use of the t-annotations

        if self.vocab is None:
            vocab = Vocabulary(s0 + s1,
                               prune_N=self.c['embprune'],
                               icase=self.c['embicase'])
        else:
            vocab = self.vocab

        si0, sj0 = vocab.vectorize(s0, self.emb, spad=self.s0pad)
        si1, sj1 = vocab.vectorize(s1, self.emb, spad=self.s1pad)
        f0, f1 = nlp.sentence_flags(s0, s1, self.s0pad, self.s1pad)
        gr = graph_input_anssel(si0,
                                si1,
                                sj0,
                                sj1,
                                None,
                                None,
                                y,
                                f0,
                                f1,
                                s0,
                                s1,
                                kw=kw,
                                akw=akw)

        if save_cache:
            # On-disk layout is kept as (s0, s1, y, vocab, gr) so existing
            # cache files stay readable.
            with open(cache_filename, "wb") as f:
                pickle.dump((s0, s1, y, vocab, gr), f)
                print("save")

        return (gr, y, vocab)
Exemplo n.º 8
0
    def load_set(self, fname):
        """Load the dataset in ``fname`` via ``loader.load_msrpara``.

        Sentences are vectorized with ``self.vocab``; when that is None a
        new ``Vocabulary`` is built from the loaded sentences (it is
        returned, not stored on ``self``).

        Returns:
            The tuple ``(gr, y, vocab)``.
        """
        s0, s1, y = loader.load_msrpara(fname)

        vocab = Vocabulary(s0 + s1) if self.vocab is None else self.vocab

        si0 = vocab.vectorize(s0, spad=self.s0pad)
        si1 = vocab.vectorize(s1, spad=self.s1pad)
        flags0, flags1 = nlp.sentence_flags(s0, s1, self.s0pad, self.s1pad)
        gr = graph_input_anssel(si0, si1, y, flags0, flags1, s0, s1)

        return gr, y, vocab
Exemplo n.º 9
0
    def load_set(self, fname):
        """Load the dataset in ``fname`` via ``loader.load_sick2014``.

        The loader is called with ``mode='entailment'``.  Sentences are
        vectorized against ``self.emb`` with ``self.vocab``; when that is
        None a new ``Vocabulary`` is built using ``self.c['embprune']`` and
        ``self.c['embicase']`` (it is returned, not stored on ``self``).

        Returns:
            The tuple ``(gr, y, vocab)``.
        """
        s0, s1, y = loader.load_sick2014(fname, mode='entailment')

        if self.vocab is not None:
            vocab = self.vocab
        else:
            vocab = Vocabulary(s0 + s1,
                               prune_N=self.c['embprune'],
                               icase=self.c['embicase'])

        si0, sj0 = vocab.vectorize(s0, self.emb, spad=self.s0pad)
        si1, sj1 = vocab.vectorize(s1, self.emb, spad=self.s1pad)
        flags0, flags1 = nlp.sentence_flags(s0, s1, self.s0pad, self.s1pad)
        # The two None slots are passed exactly as in the original call.
        gr = graph_input_anssel(
            si0, si1, sj0, sj1, None, None, y, flags0, flags1, s0, s1)

        return gr, y, vocab
Exemplo n.º 10
0
def load_set(fname, emb, vocab=None):
    """Load the dataset in ``fname`` via ``loader.load_anssel`` with embeddings.

    Only the first three of the six values returned by the loader are used.
    The ``sj*`` outputs of ``vectorize`` are mapped through
    ``emb.map_jset`` up front and stored in the graph.

    Returns:
        The tuple ``(s0, s1, y, vocab, gr)``.
    """
    s0, s1, y, _, _, _ = loader.load_anssel(fname)

    vocab = Vocabulary(s0 + s1) if vocab is None else vocab

    si0, sj0 = vocab.vectorize(s0, emb)
    si1, sj1 = vocab.vectorize(s1, emb)

    # XXX: Pre-generating the whole (se0, se1) produces a *big* memory footprint
    # for the dataset.  In KeraSTS, we solve this by using fit_generator (also
    # because of epoch_fract) and embed just per-batch.
    se0 = emb.map_jset(sj0)
    se1 = emb.map_jset(sj1)

    flags0, flags1 = nlp.sentence_flags(s0, s1, s0pad, s1pad)
    gr = graph_input_anssel(si0, si1, sj0, sj1, se0, se1, y, flags0, flags1)

    return s0, s1, y, vocab, gr
Exemplo n.º 11
0
    def load_set(self, fname):
        """Load the dataset in ``fname`` and build its ``graph_input_sts`` graph.

        Files whose name contains ``'sick2014'`` go through
        ``loader.load_sick2014``; everything else through
        ``loader.load_sts`` with ``skip_unlabeled=True``.  When
        ``self.vocab`` is None a new ``Vocabulary`` is built from the loaded
        sentences (it is returned, not stored on ``self``).

        Returns:
            The tuple ``(gr, y, vocab)``.
        """
        # XXX: ugly logic - the loader is chosen from the file name
        if 'sick2014' not in fname:
            s0, s1, y = loader.load_sts(fname, skip_unlabeled=True)
        else:
            s0, s1, y = loader.load_sick2014(fname)

        vocab = Vocabulary(s0 + s1) if self.vocab is None else self.vocab

        si0 = vocab.vectorize(s0, spad=self.s0pad)
        si1 = vocab.vectorize(s1, spad=self.s1pad)
        flags0, flags1 = nlp.sentence_flags(s0, s1, self.s0pad, self.s1pad)
        gr = graph_input_sts(si0, si1, y, flags0, flags1, s0, s1)

        return gr, y, vocab
Exemplo n.º 12
0
    def load_set(self, fname, lists=None):
        """Build the model input graph from ``lists`` or from the file ``fname``.

        Args:
            fname: Dataset path, read with ``loader.load_quora`` when
                ``lists`` is falsy.
            lists: Optional pre-loaded ``(s0, s1, y)`` triple; when truthy
                it is used instead of reading ``fname``.

        Returns:
            The tuple ``(gr, y, vocab)``, where ``gr`` is the result of
            ``graph_nparray_anssel`` applied to the ``graph_input_anssel``
            output.
        """
        if not lists:
            # Switch to loader.load_msrpara(fname) here if the Quora dataset
            # is dropped.
            s0, s1, y = loader.load_quora(fname)
        else:
            s0, s1, y = lists

        if self.vocab is not None:
            vocab = self.vocab
        else:
            vocab = Vocabulary(s0 + s1,
                               prune_N=self.c['embprune'],
                               icase=self.c['embicase'])

        si0, sj0 = vocab.vectorize(s0, self.emb, spad=self.s0pad)
        si1, sj1 = vocab.vectorize(s1, self.emb, spad=self.s1pad)
        flags0, flags1 = nlp.sentence_flags(s0, s1, self.s0pad, self.s1pad)
        raw_graph = graph_input_anssel(
            si0, si1, sj0, sj1, None, None, y, flags0, flags1)
        gr = graph_nparray_anssel(raw_graph)

        return gr, y, vocab
Exemplo n.º 13
0
 def load_vocab(self, vocabf):
      """Load texts from ``vocabf`` and build the vocabulary from them.

      Stores the result of ``loader.load_askubuntu_texts`` on ``self.texts``
      and a ``Vocabulary`` over its values on ``self.vocab``, configured
      from ``self.c['embprune']`` and ``self.c['embicase']``.

      Returns:
          The newly built vocabulary (also kept as ``self.vocab``).
      """
      self.texts = loader.load_askubuntu_texts(vocabf)
      all_texts = self.texts.values()
      prune_n = self.c['embprune']
      ignore_case = self.c['embicase']
      self.vocab = Vocabulary(all_texts, prune_N=prune_n, icase=ignore_case)
      return self.vocab
Exemplo n.º 14
0
            labels.append(int(label))
            i += 1
            if i > MAX_SAMPLES:
                break

    return (s0i, s1i, f0, f1, labels)


if __name__ == "__main__":
    # Usage: [--revocab] DATAF DUMPF VOCABF
    #
    # With --revocab the vocabulary is rebuilt from DATAF and pickled to
    # VOCABF; otherwise it is unpickled from VOCABF.  The preprocessed
    # dataset is then pickled to DUMPF.
    args = sys.argv[1:]
    # Slice instead of indexing so an empty command line is reported by the
    # three-way unpacking below rather than by an IndexError here.
    revocab = args[:1] == ['--revocab']
    if revocab:
        args = args[1:]

    dataf, dumpf, vocabf = args

    # Every file is opened in a ``with`` block so the handle is closed (and
    # written data flushed) deterministically.
    if revocab:
        vocab = Vocabulary(sentence_gen(dataf), count_thres=2)
        print('%d words' % (len(vocab.word_idx)))
        with open(vocabf, "wb") as f:
            pickle.dump(vocab, f)
    else:
        with open(vocabf, "rb") as f:
            vocab = pickle.load(f)
        print('%d words' % (len(vocab.word_idx)))

    s0i, s1i, f0, f1, labels = load_set(dataf, vocab)
    with open(dumpf, "wb") as f:
        pickle.dump((s0i, s1i, f0, f1, labels), f)

    # glove = emb.GloVe(N=300)
Exemplo n.º 15
0
 def load_vocab(self, vocabf):
      """Load texts from ``vocabf`` and build the vocabulary from them.

      Stores the result of ``loader.load_askubuntu_texts`` on ``self.texts``
      and a ``Vocabulary`` over its values on ``self.vocab``.

      Returns:
          The newly built vocabulary (also kept as ``self.vocab``).
      """
      self.texts = loader.load_askubuntu_texts(vocabf)
      all_texts = self.texts.values()
      self.vocab = Vocabulary(all_texts)
      return self.vocab
Exemplo n.º 16
0
    def load_set(self, fname, cache_dir=None, lists=None):
        """Load a dataset (or take pre-loaded lists) and build its input graph.

        Args:
            fname: Path of the dataset file.  When ``lists`` is None, paths
                containing ``'/mc'`` are read with ``loader.load_mctest``
                and all others with ``loader.load_hypev``.  In the latter
                case a sibling ``*_aux.tsv`` file is also read if it can be
                opened.
            cache_dir: Optional directory of pickled results.  The cache
                file is named after the MD5 hex digest of the absolute path
                of ``fname``.  A missing or unreadable cache entry is
                recomputed and rewritten.
            lists: Optional pre-loaded
                ``(s0, s1, y, qids, xtra, types)`` tuple used instead of
                reading ``fname``.

        Returns:
            A ``(gr, y, vocab)`` tuple, both when freshly computed and when
            served from the cache.

        Raises:
            Exception: Whatever loading the ``*_aux.tsv`` file raised, but
                only when ``self.c['aux_r']`` or ``self.c['aux_c']`` is set.
        """
        # TODO: Make the cache-handling generic,
        # and offer a way to actually pass cache_dir
        save_cache = False
        if cache_dir:
            import os.path
            from hashlib import md5
            fname_abs = os.path.abspath(fname)
            cache_filename = "%s/%s.p" % (
                cache_dir, md5(fname_abs.encode("utf-8")).hexdigest())
            try:
                # NOTE: unpickling can execute arbitrary code; cache_dir must
                # only hold files written by this code.
                with open(cache_filename, "rb") as f:
                    cached = pickle.load(f)
                # The cache stores (s0, s1, y, vocab, gr); reorder it below
                # into the shape returned by the non-cached path.
                _, _, y, vocab, gr = cached
            except (IOError, TypeError, KeyError, ValueError, EOFError,
                    pickle.UnpicklingError):
                # Missing, truncated or malformed cache entry: recompute.
                save_cache = True
            else:
                return (gr, y, vocab)

        if lists is not None:
            s0, s1, y, qids, xtra, types = lists
        else:
            xtra = None
            if '/mc' in fname:
                s0, s1, y, qids, types = loader.load_mctest(fname)
            else:
                s0, s1, y, qids = loader.load_hypev(fname)
                try:
                    # train.tsv -> train_aux.tsv
                    dsfile = re.sub(r'\.([^.]*)$', '_aux.tsv', fname)
                    with open(dsfile) as f:
                        rows = csv.DictReader(f, delimiter='\t')
                        xtra = loader.load_hypev_xtra(rows)
                        print(dsfile + ' loaded and available')
                except Exception:
                    # The aux file is optional unless aux features are
                    # enabled in the config; only then is a failure fatal.
                    if self.c['aux_r'] or self.c['aux_c']:
                        raise
                types = None

        if self.vocab is None:
            vocab = Vocabulary(s0 + s1,
                               prune_N=self.c['embprune'],
                               icase=self.c['embicase'])
        else:
            vocab = self.vocab

        # mcqtypes pruning must happen *after* Vocabulary has been constructed!
        if types is not None:
            mcqtypes = self.c['mcqtypes']
            # One keep/drop decision per row, applied to all parallel lists.
            keep = [t in mcqtypes for t in types]
            s0 = [x for x, k in zip(s0, keep) if k]
            s1 = [x for x, k in zip(s1, keep) if k]
            y = [x for x, k in zip(y, keep) if k]
            qids = [x for x, k in zip(qids, keep) if k]
            print(
                'Retained %d questions, %d hypotheses (%s types)' %
                (len(set(qids)), len(set([' '.join(s)
                                          for s in s0])), mcqtypes))

        si0, sj0 = vocab.vectorize(s0, self.emb, spad=self.s0pad)
        si1, sj1 = vocab.vectorize(s1, self.emb, spad=self.s1pad)
        f0, f1 = nlp.sentence_flags(s0, s1, self.s0pad, self.s1pad)
        gr = graph_input_anssel(si0, si1, sj0, sj1, None, None, y, f0, f1, s0,
                                s1)
        if qids is not None:
            gr['qids'] = qids
        if xtra is not None:
            gr['#'] = xtra['#']
            gr['@'] = xtra['@']
        gr, y = self.merge_questions(gr)
        if save_cache:
            # On-disk layout is kept as (s0, s1, y, vocab, gr) so existing
            # cache files stay readable.
            with open(cache_filename, "wb") as f:
                pickle.dump((s0, s1, y, vocab, gr), f)
                print("save")

        return (gr, y, vocab)
Exemplo n.º 17
0
    f0_, f1_ = nlp.sentence_flags(s0, s1, spad, spad)
    return (si0, si1, sj0, sj1, f0_, f1_, labels)


if __name__ == "__main__":
    # Usage: [--revocab] TRAINF VALF TESTF DUMPTRAINF DUMPVALF DUMPTESTF VOCABF
    #
    # With --revocab the vocabulary is rebuilt from TRAINF and pickled to
    # VOCABF; otherwise it is unpickled from VOCABF.
    args = sys.argv[1:]
    # Slice instead of indexing so an empty command line is reported by the
    # unpacking below rather than by an IndexError here.
    revocab = args[:1] == ['--revocab']
    if revocab:
        args = args[1:]

    # NOTE(review): testf and dumptestf are unpacked but not used in the
    # visible part of this script.
    trainf, valf, testf, dumptrainf, dumpvalf, dumptestf, vocabf = args

    # Every file is opened in a ``with`` block so the handle is closed (and
    # written data flushed) deterministically.
    if revocab:
        vocab = Vocabulary(sentence_gen([trainf]), count_thres=2)
        print('%d words' % (len(vocab.word_idx)))
        with open(vocabf, "wb") as f:
            pickle.dump(vocab, f)
    else:
        with open(vocabf, "rb") as f:
            vocab = pickle.load(f)
        print('%d words' % (len(vocab.word_idx)))

    glove = emb.GloVe(N=300)  # XXX: hardcoded

    print('Preprocessing train file')
    si0, si1, sj0, sj1, f0_, f1_, labels = load_set(trainf, vocab, glove)
    with open(dumptrainf, "wb") as f:
        pickle.dump((si0, si1, sj0, sj1, f0_, f1_, labels), f)

    print('Preprocessing validation file')
    si0, si1, sj0, sj1, f0_, f1_, labels = load_set(valf, vocab, glove)
    with open(dumpvalf, "wb") as f:
        pickle.dump((si0, si1, sj0, sj1, f0_, f1_, labels), f)
Exemplo n.º 18
0
            if i > MAX_SAMPLES:
                break

    return (s0i, s1i, s0j, s1j, f0, f1, labels)


if __name__ == "__main__":
    # Usage: [--revocab] DATAF DUMPF VOCABF
    #
    # With --revocab the vocabulary is rebuilt from DATAF and pickled to
    # VOCABF; otherwise it is unpickled from VOCABF.  The preprocessed
    # dataset is then pickled to DUMPF.
    args = sys.argv[1:]
    # Slice instead of indexing so an empty command line is reported by the
    # three-way unpacking below rather than by an IndexError here.
    revocab = args[:1] == ['--revocab']
    if revocab:
        args = args[1:]

    dataf, dumpf, vocabf = args

    glove = emb.GloVe(N=300)  # XXX: hardcoded

    # Every file is opened in a ``with`` block so the handle is closed (and
    # written data flushed) deterministically.
    if revocab:
        vocab = Vocabulary(sentence_gen(dataf), count_thres=2, prune_N=100)
        print('%d words' % (len(vocab.word_idx)))
        with open(vocabf, "wb") as f:
            pickle.dump(vocab, f)
    else:
        with open(vocabf, "rb") as f:
            vocab = pickle.load(f)
        print('%d words' % (len(vocab.word_idx)))

    s0i, s1i, s0j, s1j, f0, f1, labels = load_set(dataf, vocab, glove)
    with open(dumpf, "wb") as f:
        pickle.dump((s0i, s1i, s0j, s1j, f0, f1, labels), f)

    # glove = emb.GloVe(N=300)