예제 #1
0
def load_set(files, vocab=None, skip_unlabeled=True, spad=spad):
    """ Load one STS/SICK2014 file, or several merged together, as model input.

    files is either a single filename (a string) or an iterable of
    filenames; several files are merged with loader.concat_datasets.
    When no vocab is passed, a new Vocabulary is built from the loaded
    sentences.

    Returns the tuple (s0, s1, y, vocab, gr) where gr is whatever
    graph_input_sts produces. """
    def read_one(path):
        # XXX: ugly logic - the loader is picked by a filename substring
        if 'sick2014' not in path:
            return loader.load_sts(path, skip_unlabeled=skip_unlabeled)
        return loader.load_sick2014(path)

    # basestring only exists on Python 2; fall back to str otherwise
    try:
        text_type = basestring
    except NameError:
        text_type = str

    if not isinstance(files, text_type):
        parts = [read_one(path) for path in files]
        s0, s1, y = loader.concat_datasets(parts)
    else:
        s0, s1, y = read_one(files)

    if vocab is None:
        vocab = Vocabulary(s0 + s1)

    idx0 = vocab.vectorize(s0, spad=spad)
    idx1 = vocab.vectorize(s1, spad=spad)
    flags0, flags1 = nlp.sentence_flags(s0, s1, spad, spad)
    graph = graph_input_sts(idx0, idx1, y, flags0, flags1, s0, s1)

    return (s0, s1, y, vocab, graph)
예제 #2
0
    def load_set(self, fname, cache_dir=None):
        """Load a dataset with loader.load_hypev and build the model input.

        The sentences are vectorized with the task vocabulary (a fresh
        Vocabulary over s0 + s1 is built when self.vocab is None), turned
        into a graph with graph_input_anssel and passed through
        self.merge_questions.

        Caching: if cache_dir is set, the result is first looked up in
        "<cache_dir>/<md5 of absolute fname>.p"; if that file cannot be
        read, the dataset is computed and pickled there.

        Args:
            fname: path of the dataset file.
            cache_dir: optional directory holding pickled results.

        Returns:
            The tuple (gr, y, vocab), on a cache hit as well as on a miss.
        """
        # TODO: Make the cache-handling generic,
        # and offer a way to actually pass cache_dir
        save_cache = False
        if cache_dir:
            import os.path
            from hashlib import md5
            fname_abs = os.path.abspath(fname)
            cache_filename = "%s/%s.p" % (
                cache_dir, md5(fname_abs.encode("utf-8")).hexdigest())
            try:
                # NOTE: unpickling executes arbitrary code; cache_dir must
                # only ever point at trusted files.
                with open(cache_filename, "rb") as f:
                    cached = pickle.load(f)
                if len(cached) == 5:
                    # File written with the older (s0, s1, y, vocab, gr)
                    # layout; reorder so callers always get (gr, y, vocab).
                    _, _, y, vocab, gr = cached
                    return (gr, y, vocab)
                return cached
            except (IOError, TypeError, KeyError):
                save_cache = True

        s0, s1, y = loader.load_hypev(fname)

        if self.vocab is None:
            vocab = Vocabulary(s0 + s1)  # FIXME: lower?
        else:
            vocab = self.vocab

        si0 = vocab.vectorize(s0, spad=self.s0pad)
        si1 = vocab.vectorize(s1, spad=self.s1pad)
        f0, f1 = nlp.sentence_flags(s0, s1, self.s0pad, self.s1pad)
        gr = graph_input_anssel(si0, si1, y, f0, f1, s0, s1)
        gr, y = self.merge_questions(gr)
        if save_cache:
            with open(cache_filename, "wb") as f:
                # Store exactly what is returned below, so that a later
                # cache hit has the same shape as a cache miss.
                pickle.dump((gr, y, vocab), f)
                print("save")

        return (gr, y, vocab)
예제 #3
0
def load_set(fname, vocab=None, s0pad=s0pad, s1pad=s1pad, cache_dir=None, skip_oneclass=True):
    """ Load an answer-selection dataset and prepare it as model input.

    Caching: when cache_dir is set, a previously pickled result is looked
    up there first (the cache file is named after the md5 hash of the
    absolute fname); if it cannot be read, the dataset is computed and
    then pickled to that location.

    Returns the tuple (s0, s1, y, vocab, gr). """
    must_store = False
    if cache_dir:
        abs_path = os.path.abspath(fname)
        from hashlib import md5
        digest = md5(abs_path.encode("utf-8")).hexdigest()
        cache_filename = "%s/%s.p" % (cache_dir, digest)

        try:
            with open(cache_filename, "rb") as cache_file:
                return pickle.load(cache_file)
        except (IOError, TypeError, KeyError):
            # no usable cache entry; compute below and store afterwards
            must_store = True

    s0, s1, y, t = loader.load_anssel(fname, skip_oneclass=skip_oneclass)
    # TODO: Make use of the t-annotations

    vocab = Vocabulary(s0 + s1) if vocab is None else vocab

    idx0 = vocab.vectorize(s0, spad=s0pad)
    idx1 = vocab.vectorize(s1, spad=s1pad)
    flags0, flags1 = nlp.sentence_flags(s0, s1, s0pad, s1pad)
    graph = graph_input_anssel(idx0, idx1, y, flags0, flags1, s0, s1)

    result = (s0, s1, y, vocab, graph)
    if must_store:
        with open(cache_filename, "wb") as cache_file:
            pickle.dump(result, cache_file)
            print("save")

    return result
예제 #4
0
    def load_set(self, fname, cache_dir=None):
        """Load a dataset with loader.load_anssel and build the model input.

        The sentences are vectorized with the task vocabulary (a fresh
        Vocabulary over s0 + s1 is built when self.vocab is None) and
        turned into a graph with graph_input_anssel, including the kw/akw
        values returned by the loader.

        Caching: if cache_dir is set, the result is first looked up in
        "<cache_dir>/<md5 of absolute fname>.p"; if that file cannot be
        read, the dataset is computed and pickled there.

        Args:
            fname: path of the dataset file.
            cache_dir: optional directory holding pickled results.

        Returns:
            The tuple (gr, y, vocab), on a cache hit as well as on a miss.
        """
        # TODO: Make the cache-handling generic,
        # and offer a way to actually pass cache_dir
        save_cache = False
        if cache_dir:
            import os.path
            from hashlib import md5
            fname_abs = os.path.abspath(fname)
            cache_filename = "%s/%s.p" % (cache_dir, md5(fname_abs.encode("utf-8")).hexdigest())
            try:
                # NOTE: unpickling executes arbitrary code; cache_dir must
                # only ever point at trusted files.
                with open(cache_filename, "rb") as f:
                    cached = pickle.load(f)
                if len(cached) == 5:
                    # File written with the older (s0, s1, y, vocab, gr)
                    # layout; reorder so callers always get (gr, y, vocab).
                    _, _, y, vocab, gr = cached
                    return (gr, y, vocab)
                return cached
            except (IOError, TypeError, KeyError):
                save_cache = True

        skip_oneclass = self.c.get('skip_oneclass', True)
        s0, s1, y, kw, akw, t = loader.load_anssel(fname, skip_oneclass=skip_oneclass)
        # TODO: Make use of the t-annotations

        if self.vocab is None:
            vocab = Vocabulary(s0 + s1)
        else:
            vocab = self.vocab

        si0 = vocab.vectorize(s0, spad=self.s0pad)
        si1 = vocab.vectorize(s1, spad=self.s1pad)
        f0, f1 = nlp.sentence_flags(s0, s1, self.s0pad, self.s1pad)
        gr = graph_input_anssel(si0, si1, y, f0, f1, s0, s1, kw=kw, akw=akw)

        if save_cache:
            with open(cache_filename, "wb") as f:
                # Store exactly what is returned below, so that a later
                # cache hit has the same shape as a cache miss.
                pickle.dump((gr, y, vocab), f)
                print("save")

        return (gr, y, vocab)
예제 #5
0
def load_set(fname, vocab=None):
    """ Load an answer-selection dataset and prepare it as model input.

    Only the first three of the six values returned by loader.load_anssel
    are used.  When no vocab is passed, a new Vocabulary is built from the
    loaded sentences.

    Returns the tuple (s0, s1, y, vocab, gr). """
    s0, s1, y, _unused_a, _unused_b, _unused_c = loader.load_anssel(fname)

    vocab = Vocabulary(s0 + s1) if vocab is None else vocab

    idx0 = vocab.vectorize(s0)
    idx1 = vocab.vectorize(s1)
    flags0, flags1 = nlp.sentence_flags(s0, s1, s0pad, s1pad)
    graph = graph_input_anssel(idx0, idx1, y, flags0, flags1)

    return (s0, s1, y, vocab, graph)
예제 #6
0
def load_set(fname, vocab=None):
    """ Load an answer-selection dataset and prepare it as model input.

    The fourth value returned by loader.load_anssel is not used.  When
    no vocab is passed, a new Vocabulary is built from the loaded
    sentences.

    Returns the tuple (s0, s1, y, vocab, gr). """
    s0, s1, y, _annotations = loader.load_anssel(fname)

    if vocab is None:
        sentences = s0 + s1
        vocab = Vocabulary(sentences)

    vectorized = (vocab.vectorize(s0), vocab.vectorize(s1))
    flags = nlp.sentence_flags(s0, s1, s0pad, s1pad)
    flags0, flags1 = flags
    graph = graph_input_anssel(vectorized[0], vectorized[1], y, flags0, flags1)

    return (s0, s1, y, vocab, graph)
예제 #7
0
def load_sent(q, a, vocab=None):
    """ Prepare a single (question, answer) pair as model input.

    q and a are wrapped in one-element lists and the label is fixed to 1.
    When no vocab is passed, a new Vocabulary is built from just these
    two sentences.

    Returns the result of graph_input_anssel. """
    # s0=questions, s1=answers
    questions = [q]
    answers = [a]
    label = 1

    vocab = Vocabulary(questions + answers) if vocab is None else vocab

    idx0 = vocab.vectorize(questions, spad=s0pad)
    idx1 = vocab.vectorize(answers, spad=s1pad)
    flags0, flags1 = nlp.sentence_flags(questions, answers, s0pad, s1pad)
    return graph_input_anssel(idx0, idx1, label, flags0, flags1)
예제 #8
0
def load_set(fname, vocab=None, s0pad=s0pad, s1pad=s1pad):
    """ Load an answer-selection dataset and prepare it as model input.

    s0pad and s1pad are forwarded as the spad of the respective
    vectorize call and to nlp.sentence_flags.  When no vocab is passed,
    a new Vocabulary is built from the loaded sentences.

    Returns the tuple (s0, s1, y, vocab, gr). """
    s0, s1, y, _annotations = loader.load_anssel(fname)
    # TODO: Make use of the t-annotations

    vocab = Vocabulary(s0 + s1) if vocab is None else vocab

    idx0 = vocab.vectorize(s0, spad=s0pad)
    idx1 = vocab.vectorize(s1, spad=s1pad)
    flags0, flags1 = nlp.sentence_flags(s0, s1, s0pad, s1pad)
    graph = graph_input_anssel(idx0, idx1, y, flags0, flags1, s0, s1)

    return (s0, s1, y, vocab, graph)
예제 #9
0
def load_set(fname, vocab=None):
    """ Load a dataset with loader.load_hypev and prepare it as model input.

    When no vocab is passed, a new Vocabulary is built from the loaded
    sentences.

    Returns the tuple (s0, s1, y, vocab, gr). """
    # s0=questions, s1=answers
    questions, answers, labels = loader.load_hypev(fname)

    vocab = Vocabulary(questions + answers) if vocab is None else vocab

    idx0 = vocab.vectorize(questions, spad=s0pad)
    idx1 = vocab.vectorize(answers, spad=s1pad)
    flags0, flags1 = nlp.sentence_flags(questions, answers, s0pad, s1pad)
    graph = graph_input_anssel(idx0, idx1, labels, flags0, flags1,
                               questions, answers)

    return questions, answers, labels, vocab, graph
예제 #10
0
def load_set(fname, vocab=None):
    """ Read fname via loader.load_hypev and turn it into model input.

    A Vocabulary over all loaded sentences is created unless the caller
    supplies one.

    Returns the tuple (s0, s1, y, vocab, gr). """
    loaded = loader.load_hypev(fname)
    s0, s1, y = loaded
    # s0=questions, s1=answers

    if vocab is None:
        all_sentences = s0 + s1
        vocab = Vocabulary(all_sentences)

    padded0 = vocab.vectorize(s0, spad=s0pad)
    padded1 = vocab.vectorize(s1, spad=s1pad)
    flag_pair = nlp.sentence_flags(s0, s1, s0pad, s1pad)
    fl0, fl1 = flag_pair
    graph = graph_input_anssel(padded0, padded1, y, fl0, fl1, s0, s1)

    return s0, s1, y, vocab, graph
예제 #11
0
    def load_set(self, fname, cache_dir=None):
        """Load a dataset with loader.load_anssel and build the model input.

        The sentences are vectorized with the task vocabulary and
        self.emb (a fresh Vocabulary over s0 + s1, configured from
        self.c['embprune'] and self.c['embicase'], is built when
        self.vocab is None) and turned into a graph with
        graph_input_anssel, including the kw/akw values from the loader.

        Caching: if cache_dir is set, the result is first looked up in
        "<cache_dir>/<md5 of absolute fname>.p"; if that file cannot be
        read, the dataset is computed and pickled there.

        Args:
            fname: path of the dataset file.
            cache_dir: optional directory holding pickled results.

        Returns:
            The tuple (gr, y, vocab), on a cache hit as well as on a miss.
        """
        # TODO: Make the cache-handling generic,
        # and offer a way to actually pass cache_dir
        save_cache = False
        if cache_dir:
            import os.path
            from hashlib import md5
            fname_abs = os.path.abspath(fname)
            cache_filename = "%s/%s.p" % (
                cache_dir, md5(fname_abs.encode("utf-8")).hexdigest())
            try:
                # NOTE: unpickling executes arbitrary code; cache_dir must
                # only ever point at trusted files.
                with open(cache_filename, "rb") as f:
                    cached = pickle.load(f)
                if len(cached) == 5:
                    # File written with the older (s0, s1, y, vocab, gr)
                    # layout; reorder so callers always get (gr, y, vocab).
                    _, _, y, vocab, gr = cached
                    return (gr, y, vocab)
                return cached
            except (IOError, TypeError, KeyError):
                save_cache = True

        skip_oneclass = self.c.get('skip_oneclass', True)
        s0, s1, y, kw, akw, t = loader.load_anssel(fname,
                                                   skip_oneclass=skip_oneclass)
        # TODO: Make use of the t-annotations

        if self.vocab is None:
            vocab = Vocabulary(s0 + s1,
                               prune_N=self.c['embprune'],
                               icase=self.c['embicase'])
        else:
            vocab = self.vocab

        si0, sj0 = vocab.vectorize(s0, self.emb, spad=self.s0pad)
        si1, sj1 = vocab.vectorize(s1, self.emb, spad=self.s1pad)
        f0, f1 = nlp.sentence_flags(s0, s1, self.s0pad, self.s1pad)
        # The two None arguments are the slots that other callers of
        # graph_input_anssel fill with pre-mapped embeddings (se0, se1).
        gr = graph_input_anssel(si0, si1, sj0, sj1, None, None,
                                y, f0, f1, s0, s1,
                                kw=kw, akw=akw)

        if save_cache:
            with open(cache_filename, "wb") as f:
                # Store exactly what is returned below, so that a later
                # cache hit has the same shape as a cache miss.
                pickle.dump((gr, y, vocab), f)
                print("save")

        return (gr, y, vocab)
예제 #12
0
파일: rte.py 프로젝트: brmson/dataset-sts
    def load_set(self, fname):
        """ Load a SICK2014 file in 'entailment' mode as model input.

        Uses self.vocab when set; otherwise builds a Vocabulary from the
        loaded sentences with the embprune/embicase config values.

        Returns the tuple (gr, y, vocab). """
        s0, s1, y = loader.load_sick2014(fname, mode='entailment')

        vocab = self.vocab if self.vocab is not None else Vocabulary(
            s0 + s1, prune_N=self.c['embprune'], icase=self.c['embicase'])

        idx0, emb_idx0 = vocab.vectorize(s0, self.emb, spad=self.s0pad)
        idx1, emb_idx1 = vocab.vectorize(s1, self.emb, spad=self.s1pad)
        flags0, flags1 = nlp.sentence_flags(s0, s1, self.s0pad, self.s1pad)
        graph = graph_input_anssel(idx0, idx1, emb_idx0, emb_idx1, None, None,
                                   y, flags0, flags1, s0, s1)

        return (graph, y, vocab)
예제 #13
0
    def load_set(self, fname, cache_dir=None):
        """ Load a dataset with loader.load_hypev as model input.

        cache_dir is accepted but not used by this implementation.
        Uses self.vocab when set; otherwise builds a Vocabulary from the
        loaded sentences.

        Returns the tuple (gr, y, vocab). """
        s0, s1, y = loader.load_hypev(fname)

        vocab = self.vocab if self.vocab is not None else Vocabulary(s0 + s1)

        idx0 = vocab.vectorize(s0, spad=self.s0pad)
        idx1 = vocab.vectorize(s1, spad=self.s1pad)
        flags0, flags1 = nlp.sentence_flags(s0, s1, self.s0pad, self.s1pad)
        graph = graph_input_anssel(idx0, idx1, y, flags0, flags1, s0, s1)

        return (graph, y, vocab)
예제 #14
0
    def load_set(self, fname):
        """ Load a dataset with loader.load_msrpara as model input.

        Uses self.vocab when set; otherwise builds a Vocabulary from the
        loaded sentences.

        Returns the tuple (gr, y, vocab). """
        first, second, labels = loader.load_msrpara(fname)

        vocab = self.vocab if self.vocab is not None else Vocabulary(first + second)

        idx0 = vocab.vectorize(first, spad=self.s0pad)
        idx1 = vocab.vectorize(second, spad=self.s1pad)
        flags0, flags1 = nlp.sentence_flags(first, second,
                                            self.s0pad, self.s1pad)
        graph = graph_input_anssel(idx0, idx1, labels, flags0, flags1,
                                   first, second)

        return (graph, labels, vocab)
예제 #15
0
    def load_set(self, fname):
        """ Read a SICK2014 file ('entailment' mode) and build model input.

        A Vocabulary (pruned / case-folded according to the embprune and
        embicase config entries) is created only when the task has no
        vocabulary yet.

        Returns the tuple (gr, y, vocab). """
        premises, hypotheses, y = loader.load_sick2014(fname, mode='entailment')

        if self.vocab is not None:
            vocab = self.vocab
        else:
            vocab = Vocabulary(premises + hypotheses,
                               prune_N=self.c['embprune'],
                               icase=self.c['embicase'])

        word0, embw0 = vocab.vectorize(premises, self.emb, spad=self.s0pad)
        word1, embw1 = vocab.vectorize(hypotheses, self.emb, spad=self.s1pad)
        fl0, fl1 = nlp.sentence_flags(premises, hypotheses,
                                      self.s0pad, self.s1pad)
        graph = graph_input_anssel(word0, word1, embw0, embw1, None, None,
                                   y, fl0, fl1, premises, hypotheses)

        return (graph, y, vocab)
예제 #16
0
def load_set(fname, emb, vocab=None):
    """ Load an answer-selection dataset with pre-mapped embeddings.

    Only the first three of the six values returned by loader.load_anssel
    are used.  Both sentence sets are vectorized against emb and their
    second vectorize output is mapped through emb.map_jset.

    Returns the tuple (s0, s1, y, vocab, gr). """
    s0, s1, y, _unused_a, _unused_b, _unused_c = loader.load_anssel(fname)

    vocab = Vocabulary(s0 + s1) if vocab is None else vocab

    idx0, emb_idx0 = vocab.vectorize(s0, emb)
    idx1, emb_idx1 = vocab.vectorize(s1, emb)
    # XXX: Pre-generating the whole (se0, se1) produces a *big* memory footprint
    # for the dataset.  In KeraSTS, we solve this by using fit_generator (also
    # because of epoch_fract) and embed just per-batch.
    mapped0 = emb.map_jset(emb_idx0)
    mapped1 = emb.map_jset(emb_idx1)
    flags0, flags1 = nlp.sentence_flags(s0, s1, s0pad, s1pad)
    graph = graph_input_anssel(idx0, idx1, emb_idx0, emb_idx1,
                               mapped0, mapped1, y, flags0, flags1)

    return (s0, s1, y, vocab, graph)
예제 #17
0
def load_set(files, vocab=None, skip_unlabeled=True):
    """ Load several STS/SICK2014 files merged together as model input.

    files is an iterable of filenames; each is read separately and the
    results are merged with loader.concat_datasets.  When no vocab is
    passed, a new Vocabulary is built from the loaded sentences.

    Returns the tuple (s0, s1, y, vocab, gr). """
    def read_one(path):
        # XXX: ugly logic - the loader is picked by a filename substring
        if 'sick2014' not in path:
            return loader.load_sts(path, skip_unlabeled=skip_unlabeled)
        return loader.load_sick2014(path)

    parts = [read_one(path) for path in files]
    s0, s1, y = loader.concat_datasets(parts)

    vocab = Vocabulary(s0 + s1) if vocab is None else vocab

    idx0 = vocab.vectorize(s0, spad=spad)
    idx1 = vocab.vectorize(s1, spad=spad)
    flags0, flags1 = nlp.sentence_flags(s0, s1, spad, spad)
    graph = graph_input_sts(idx0, idx1, y, flags0, flags1)

    return (s0, s1, y, vocab, graph)
예제 #18
0
def load_set(fname, emb, vocab=None):
    """ Read fname via loader.load_anssel and embed it up front.

    The last three values returned by the loader are ignored.  A
    Vocabulary over all loaded sentences is created unless the caller
    supplies one.

    Returns the tuple (s0, s1, y, vocab, gr). """
    s0, s1, y, _kw, _akw, _t = loader.load_anssel(fname)

    if vocab is None:
        all_sentences = s0 + s1
        vocab = Vocabulary(all_sentences)

    word0, embw0 = vocab.vectorize(s0, emb)
    word1, embw1 = vocab.vectorize(s1, emb)

    # XXX: Pre-generating the whole (se0, se1) produces a *big* memory footprint
    # for the dataset.  In KeraSTS, we solve this by using fit_generator (also
    # because of epoch_fract) and embed just per-batch.
    vectors0 = emb.map_jset(embw0)
    vectors1 = emb.map_jset(embw1)

    fl0, fl1 = nlp.sentence_flags(s0, s1, s0pad, s1pad)
    graph = graph_input_anssel(word0, word1, embw0, embw1,
                               vectors0, vectors1, y, fl0, fl1)

    return (s0, s1, y, vocab, graph)
예제 #19
0
    def load_set(self, fname):
        """ Load one STS or SICK2014 file as model input.

        Uses self.vocab when set; otherwise builds a Vocabulary from the
        loaded sentences.

        Returns the tuple (gr, y, vocab). """
        # XXX: ugly logic - the loader is picked by a filename substring
        if 'sick2014' in fname:
            s0, s1, y = loader.load_sick2014(fname)
        else:
            s0, s1, y = loader.load_sts(fname, skip_unlabeled=True)

        vocab = self.vocab if self.vocab is not None else Vocabulary(s0 + s1)

        idx0 = vocab.vectorize(s0, spad=self.s0pad)
        idx1 = vocab.vectorize(s1, spad=self.s1pad)
        flags0, flags1 = nlp.sentence_flags(s0, s1, self.s0pad, self.s1pad)
        graph = graph_input_sts(idx0, idx1, y, flags0, flags1, s0, s1)

        return (graph, y, vocab)
예제 #20
0
파일: sts.py 프로젝트: brmson/dataset-sts
    def load_set(self, fname):
        """ Load one STS or SICK2014 file as embedding-aware model input.

        Uses self.vocab when set; otherwise builds a Vocabulary from the
        loaded sentences with the embprune/embicase config values.

        Returns the tuple (gr, y, vocab). """
        # XXX: ugly logic - the loader is picked by a filename substring
        if 'sick2014' in fname:
            s0, s1, y = loader.load_sick2014(fname)
        else:
            s0, s1, y = loader.load_sts(fname, skip_unlabeled=True)

        vocab = self.vocab if self.vocab is not None else Vocabulary(
            s0 + s1, prune_N=self.c['embprune'], icase=self.c['embicase'])

        idx0, emb_idx0 = vocab.vectorize(s0, self.emb, spad=self.s0pad)
        idx1, emb_idx1 = vocab.vectorize(s1, self.emb, spad=self.s1pad)
        flags0, flags1 = nlp.sentence_flags(s0, s1, self.s0pad, self.s1pad)
        graph = graph_input_sts(idx0, idx1, emb_idx0, emb_idx1, y,
                                flags0, flags1, s0, s1)

        return (graph, y, vocab)
예제 #21
0
파일: para.py 프로젝트: lizihan021/Hotpot
    def load_set(self, fname, lists=None):
        """ Build model input from preloaded lists or from a quora file.

        When lists is truthy it is unpacked as (s0, s1, y) and fname is
        not read; otherwise fname is read with loader.load_quora.  Uses
        self.vocab when set; otherwise builds a Vocabulary from the
        sentences with the embprune/embicase config values.

        Returns the tuple (gr, y, vocab) where gr is the result of
        graph_nparray_anssel applied to the graph_input_anssel output. """
        if not lists:
            # switch back to loader.load_msrpara(fname) here if we decide
            # not to use the quora dataset
            s0, s1, y = loader.load_quora(fname)
        else:
            s0, s1, y = lists

        vocab = self.vocab if self.vocab is not None else Vocabulary(
            s0 + s1, prune_N=self.c['embprune'], icase=self.c['embicase'])

        idx0, emb_idx0 = vocab.vectorize(s0, self.emb, spad=self.s0pad)
        idx1, emb_idx1 = vocab.vectorize(s1, self.emb, spad=self.s1pad)
        flags0, flags1 = nlp.sentence_flags(s0, s1, self.s0pad, self.s1pad)
        raw_graph = graph_input_anssel(idx0, idx1, emb_idx0, emb_idx1,
                                       None, None, y, flags0, flags1)
        graph = graph_nparray_anssel(raw_graph)

        return (graph, y, vocab)
예제 #22
0
 def load_vocab(self, vocabf):
     """ Load the texts from vocabf and build the task vocabulary.

     Stores the loaded texts in self.texts and the Vocabulary built from
     their values in self.vocab, which is also returned. """
     texts = loader.load_askubuntu_texts(vocabf)
     self.texts = texts
     vocab = Vocabulary(texts.values())
     self.vocab = vocab
     return vocab
예제 #23
0
    def load_set(self, fname, cache_dir=None, lists=None):
        """Load a hypothesis-evaluation dataset and build the model input.

        Data comes from lists when given, otherwise from fname: paths
        containing '/mc' are read with loader.load_mctest, all others
        with loader.load_hypev plus an optional "<name>_aux.tsv" file
        next to it.  The result is vectorized, turned into a graph with
        graph_input_anssel and passed through self.merge_questions.

        Caching: if cache_dir is set, the result is first looked up in
        "<cache_dir>/<md5 of absolute fname>.p"; if that file cannot be
        read, the dataset is computed and pickled there.

        Args:
            fname: path of the dataset file.
            cache_dir: optional directory holding pickled results.
            lists: optional preloaded (s0, s1, y, qids, xtra, types).

        Returns:
            The tuple (gr, y, vocab), on a cache hit as well as on a miss.

        Raises:
            Whatever reading the aux file raised, but only when
            self.c['aux_r'] or self.c['aux_c'] is set.
        """
        # TODO: Make the cache-handling generic,
        # and offer a way to actually pass cache_dir
        save_cache = False
        if cache_dir:
            import os.path
            from hashlib import md5
            fname_abs = os.path.abspath(fname)
            cache_filename = "%s/%s.p" % (
                cache_dir, md5(fname_abs.encode("utf-8")).hexdigest())
            try:
                # NOTE: unpickling executes arbitrary code; cache_dir must
                # only ever point at trusted files.
                with open(cache_filename, "rb") as f:
                    cached = pickle.load(f)
                if len(cached) == 5:
                    # File written with the older (s0, s1, y, vocab, gr)
                    # layout; reorder so callers always get (gr, y, vocab).
                    _, _, y, vocab, gr = cached
                    return (gr, y, vocab)
                return cached
            except (IOError, TypeError, KeyError):
                save_cache = True

        if lists is not None:
            s0, s1, y, qids, xtra, types = lists
        else:
            xtra = None
            if '/mc' in fname:
                s0, s1, y, qids, types = loader.load_mctest(fname)
            else:
                s0, s1, y, qids = loader.load_hypev(fname)
                try:
                    # train.tsv -> train_aux.tsv
                    dsfile = re.sub(r'\.([^.]*)$', '_aux.tsv', fname)
                    with open(dsfile) as f:
                        rows = csv.DictReader(f, delimiter='\t')
                        xtra = loader.load_hypev_xtra(rows)
                        print(dsfile + ' loaded and available')
                except Exception:
                    # The aux file is optional unless the configuration
                    # asks for the aux inputs; then the failure is fatal.
                    if self.c['aux_r'] or self.c['aux_c']:
                        raise
                types = None

        if self.vocab is None:
            vocab = Vocabulary(s0 + s1,
                               prune_N=self.c['embprune'],
                               icase=self.c['embicase'])
        else:
            vocab = self.vocab

        # mcqtypes pruning must happen *after* Vocabulary has been constructed!
        if types is not None:
            mcqtypes = self.c['mcqtypes']
            # One keep/drop decision per row, shared by all parallel lists.
            keep = [t in mcqtypes for t in types]
            s0 = [x for x, k in zip(s0, keep) if k]
            s1 = [x for x, k in zip(s1, keep) if k]
            y = [x for x, k in zip(y, keep) if k]
            qids = [x for x, k in zip(qids, keep) if k]
            print(
                'Retained %d questions, %d hypotheses (%s types)' %
                (len(set(qids)), len(set([' '.join(s) for s in s0])),
                 mcqtypes))

        si0, sj0 = vocab.vectorize(s0, self.emb, spad=self.s0pad)
        si1, sj1 = vocab.vectorize(s1, self.emb, spad=self.s1pad)
        f0, f1 = nlp.sentence_flags(s0, s1, self.s0pad, self.s1pad)
        gr = graph_input_anssel(si0, si1, sj0, sj1, None, None, y, f0, f1, s0,
                                s1)
        if qids is not None:
            gr['qids'] = qids
        if xtra is not None:
            gr['#'] = xtra['#']
            gr['@'] = xtra['@']
        gr, y = self.merge_questions(gr)
        if save_cache:
            with open(cache_filename, "wb") as f:
                # Store exactly what is returned below, so that a later
                # cache hit has the same shape as a cache miss.
                pickle.dump((gr, y, vocab), f)
                print("save")

        return (gr, y, vocab)
예제 #24
0
    f0_, f1_ = nlp.sentence_flags(s0, s1, spad, spad)
    return (si0, si1, sj0, sj1, f0_, f1_, labels)


if __name__ == "__main__":
    # Usage: [--revocab] trainf valf testf dumptrainf dumpvalf dumptestf vocabf
    # With --revocab the vocabulary is rebuilt from trainf and pickled to
    # vocabf; otherwise it is loaded from vocabf.
    args = sys.argv[1:]
    revocab = bool(args) and args[0] == '--revocab'
    if revocab:
        args = args[1:]

    # NOTE(review): testf and dumptestf are parsed but not used in the part
    # of the script visible here - confirm whether the test split should be
    # preprocessed as well.
    trainf, valf, testf, dumptrainf, dumpvalf, dumptestf, vocabf = args

    if revocab:
        vocab = Vocabulary(sentence_gen([trainf]), count_thres=2)
        print('%d words' % (len(vocab.word_idx)))
        with open(vocabf, "wb") as f:
            pickle.dump(vocab, f)
    else:
        # NOTE: unpickling executes arbitrary code; only load trusted files.
        with open(vocabf, "rb") as f:
            vocab = pickle.load(f)
        print('%d words' % (len(vocab.word_idx)))

    glove = emb.GloVe(N=300)  # XXX: hardcoded

    print('Preprocessing train file')
    si0, si1, sj0, sj1, f0_, f1_, labels = load_set(trainf, vocab, glove)
    with open(dumptrainf, "wb") as f:
        pickle.dump((si0, si1, sj0, sj1, f0_, f1_, labels), f)

    print('Preprocessing validation file')
    si0, si1, sj0, sj1, f0_, f1_, labels = load_set(valf, vocab, glove)
    with open(dumpvalf, "wb") as f:
        pickle.dump((si0, si1, sj0, sj1, f0_, f1_, labels), f)
예제 #25
0
            if i > MAX_SAMPLES:
                break

    return (s0i, s1i, s0j, s1j, f0, f1, labels)


if __name__ == "__main__":
    # Usage: [--revocab] dataf dumpf vocabf
    # With --revocab the vocabulary is rebuilt from dataf and pickled to
    # vocabf; otherwise it is loaded from vocabf.
    args = sys.argv[1:]
    revocab = bool(args) and args[0] == '--revocab'
    if revocab:
        args = args[1:]

    dataf, dumpf, vocabf = args

    glove = emb.GloVe(N=300)  # XXX: hardcoded

    if revocab:
        vocab = Vocabulary(sentence_gen(dataf), count_thres=2, prune_N=100)
        print('%d words' % (len(vocab.word_idx)))
        with open(vocabf, "wb") as f:
            pickle.dump(vocab, f)
    else:
        # NOTE: unpickling executes arbitrary code; only load trusted files.
        with open(vocabf, "rb") as f:
            vocab = pickle.load(f)
        print('%d words' % (len(vocab.word_idx)))

    s0i, s1i, s0j, s1j, f0, f1, labels = load_set(dataf, vocab, glove)
    with open(dumpf, "wb") as f:
        pickle.dump((s0i, s1i, s0j, s1j, f0, f1, labels), f)

    # glove = emb.GloVe(N=300)
예제 #26
0
 def load_vocab(self, vocabf):
     """ Load the texts from vocabf and build the task vocabulary.

     Stores the loaded texts in self.texts and the Vocabulary built from
     their values (configured with the embprune and embicase config
     entries) in self.vocab, which is also returned. """
     texts = loader.load_askubuntu_texts(vocabf)
     self.texts = texts
     vocab = Vocabulary(texts.values(),
                        prune_N=self.c['embprune'],
                        icase=self.c['embicase'])
     self.vocab = vocab
     return vocab
예제 #27
0
class AskUTask(ParaphrasingTask):
    """Paraphrasing task over question links in the askubuntu_q format.

    Question texts are loaded once by load_vocab and kept in self.texts,
    indexed by question id.  A dataset is a list of links, each unpacked
    in link_to_s as (pid, qids, qlabels).  The training set is kept as
    links and converted to graphs batch by batch in sample_pairs; the
    validation (and optional test) set is converted up front.

    NOTE(review): self.c is read by load_vocab but not assigned anywhere
    in this class - presumably the surrounding framework sets it.
    """

    def __init__(self):
        self.name = 'asku'
        self.s0pad = 60
        self.s1pad = 60
        self.emb = None
        self.vocab = None

    def config(self, c):
        """Set this task's default training settings in the config dict c."""
        c['loss'] = ranknet
        c['nb_epoch'] = 16
        c['batch_size'] = 192
        c['epoch_fract'] = 1 / 4

    def load_vocab(self, vocabf):
        """Load the question texts from vocabf and build the vocabulary.

        Sets self.texts and self.vocab; returns self.vocab.
        """
        self.texts = loader.load_askubuntu_texts(vocabf)
        self.vocab = Vocabulary(self.texts.values(),
                                prune_N=self.c['embprune'],
                                icase=self.c['embicase'])
        return self.vocab

    def load_set(self, fname, cache_dir=None):
        """Return the links loaded from fname (cache_dir is not used)."""
        links = loader.load_askubuntu_q(fname)
        return links

    def link_to_s(self, link):
        """Expand one link into parallel lists (s0, s1, labels).

        Every candidate question of the link yields one pair made of the
        text of pid and the text of the candidate, plus its label.
        """
        # convert link in the askubuntu_q format to a set of sentence pairs
        pid, qids, qlabels = link
        s0 = []
        s1 = []
        labels = []
        for qid, ql in zip(qids, qlabels):
            s0.append(self.texts[pid])
            s1.append(self.texts[qid])
            labels.append(ql)
        return s0, s1, labels

    def links_to_graph(self, links):
        """Convert a list of links into the result of graph_input_anssel."""
        s0 = []
        s1 = []
        labels = []
        for link in links:
            s0l, s1l, labelsl = self.link_to_s(link)
            s0 += s0l
            s1 += s1l
            labels += labelsl

        si0, sj0 = self.vocab.vectorize(s0, self.emb, spad=self.s0pad)
        si1, sj1 = self.vocab.vectorize(s1, self.emb, spad=self.s1pad)
        f0, f1 = nlp.sentence_flags(s0, s1, self.s0pad, self.s1pad)

        gr = graph_input_anssel(si0, si1, sj0, sj1, None, None,
                                np.array(labels), f0, f1, s0, s1)
        return gr

    def load_data(self, trainf, valf, testf=None):
        """Load the train/val/test splits and build the val/test graphs.

        After the call: self.links and self.gr (only the 'score' list)
        describe the training set; self.linksv/self.grv the validation
        set; self.linkst/self.grt the test set, both None without testf.
        """
        self.trainf = trainf
        self.valf = valf
        self.testf = testf

        if self.vocab is None:
            # XXX: this vocab includes even val,test words!
            self.load_vocab(os.path.dirname(trainf) + '/text_tokenized.txt.gz')

        self.links = self.load_set(trainf)
        from itertools import chain
        self.gr = {
            'score': list(chain.from_iterable([l[2] for l in self.links]))
        }
        print('Training set: %d links, %d sentence pairs' %
              (len(self.links), len(self.gr['score'])))
        self.linksv = self.load_set(valf)
        self.grv = self.links_to_graph(self.linksv)
        if testf is not None:
            self.linkst = self.load_set(testf)
            # The test graph must come from the test links (it used to be
            # built from the validation links by mistake).
            self.grt = self.links_to_graph(self.linkst)
        else:
            self.linkst = None
            # eval() checks self.grt for None, so it has to exist.
            self.grt = None

    def sample_pairs(self, batch_size, once=False):
        """ A generator that produces random pairs from the dataset

        Training links are visited in shuffled order and accumulated
        until they hold at least batch_size pairs; then one graph is
        yielded.  Links left over at the end of a pass are dropped when
        the next pass starts.  With once=True only a single pass is made.
        """
        # random.shuffle needs a mutable sequence; a bare range() is not
        # one on Python 3.
        ids = list(range(len(self.links)))
        while True:
            random.shuffle(ids)
            links_to_yield = []
            n_yielded = 0
            for i in ids:
                link = self.links[i]
                links_to_yield.append(link)
                n_yielded += len(link[1])

                if n_yielded < batch_size:
                    continue

                # we have accumulated enough pairs, produce a graph
                ogr = self.links_to_graph(links_to_yield)
                links_to_yield = []
                n_yielded = 0
                yield ogr
            if once:
                break

    def fit_callbacks(self, weightsf):
        """Return the training callbacks; checkpoints go to weightsf."""
        return [
            AnsSelCB(self.grv['si0'], self.grv),
            ModelCheckpoint(weightsf, save_best_only=True),
            EarlyStopping(patience=3)
        ]

    def fit_model(self, model, **kwargs):
        """Train model on batches from sample_pairs.

        'batch_size' and 'weightsf' are taken out of kwargs; the rest is
        forwarded to model.fit_generator.
        """
        batch_size = kwargs.pop('batch_size')
        kwargs['callbacks'] = self.fit_callbacks(kwargs.pop('weightsf'))
        return model.fit_generator(self.sample_pairs(batch_size), **kwargs)

    def eval(self, model):
        """Evaluate model on the validation and test graphs.

        Returns a 3-tuple whose first element is always None, followed by
        the ev.eval_ubuntu results for validation and test (None for a
        split whose graph is None).
        """
        res = [None]
        for gr, fname in [(self.grv, self.valf), (self.grt, self.testf)]:
            if gr is None:
                res.append(None)
                continue
            ypred = model.predict(gr)['score'][:, 0]
            res.append(ev.eval_ubuntu(ypred, gr['si0'], gr['score'], fname))
        return tuple(res)

    def res_columns(self, mres, pfx=' '):
        """ Produce README-format markdown table row piece summarizing
        important statistics """
        return (
            '%s%.6f |%s%.6f |%s%.6f  |%s%.6f |%s%.6f  |%s%.6f  ' %
            (pfx, mres[self.valf]['MRR'], pfx, mres[self.valf]['R10_1'], pfx,
             mres[self.valf]['R10_5'], pfx, mres[self.testf].get(
                 'MRR', np.nan), pfx, mres[self.testf].get('R10_1', np.nan),
             pfx, mres[self.testf].get('R10_5', np.nan)))
예제 #28
0
파일: asku.py 프로젝트: GALI472/dataset-sts
 def load_vocab(self, vocabf):
     """ Read the texts stored in vocabf and derive a Vocabulary from them.

     Side effects: assigns self.texts and self.vocab.  Returns the new
     vocabulary. """
     loaded_texts = loader.load_askubuntu_texts(vocabf)
     self.texts = loaded_texts
     self.vocab = Vocabulary(loaded_texts.values())
     return self.vocab
예제 #29
0
파일: asku.py 프로젝트: GALI472/dataset-sts
class AskUTask(ParaphrasingTask):
    """Paraphrasing task over question links in the askubuntu_q format.

    Question texts are loaded once by load_vocab and kept in self.texts,
    indexed by question id.  A dataset is a list of links, each unpacked
    in link_to_s as (pid, qids, qlabels).  The training set is kept as
    links and converted to graphs batch by batch in sample_pairs; the
    validation (and optional test) set is converted up front.
    """

    def __init__(self):
        self.name = 'asku'
        self.s0pad = 60
        self.s1pad = 60
        self.emb = None
        self.vocab = None

    def config(self, c):
        """Set this task's default training settings in the config dict c."""
        c['loss'] = ranknet
        c['nb_epoch'] = 16
        c['batch_size'] = 192
        c['epoch_fract'] = 1/4

    def load_vocab(self, vocabf):
        """Load the question texts from vocabf and build the vocabulary.

        Sets self.texts and self.vocab; returns self.vocab.
        """
        self.texts = loader.load_askubuntu_texts(vocabf)
        self.vocab = Vocabulary(self.texts.values())
        return self.vocab

    def load_set(self, fname, cache_dir=None):
        """Return the links loaded from fname (cache_dir is not used)."""
        links = loader.load_askubuntu_q(fname)
        return links

    def link_to_s(self, link):
        """Expand one link into parallel lists (s0, s1, labels).

        Every candidate question of the link yields one pair made of the
        text of pid and the text of the candidate, plus its label.
        """
        # convert link in the askubuntu_q format to a set of sentence pairs
        pid, qids, qlabels = link
        s0 = []
        s1 = []
        labels = []
        for qid, ql in zip(qids, qlabels):
            s0.append(self.texts[pid])
            s1.append(self.texts[qid])
            labels.append(ql)
        return s0, s1, labels

    def links_to_graph(self, links):
        """Convert a list of links into the result of graph_input_anssel."""
        s0 = []
        s1 = []
        labels = []
        for link in links:
            s0l, s1l, labelsl = self.link_to_s(link)
            s0 += s0l
            s1 += s1l
            labels += labelsl

        si0 = self.vocab.vectorize(s0, spad=self.s0pad)
        si1 = self.vocab.vectorize(s1, spad=self.s1pad)
        f0, f1 = nlp.sentence_flags(s0, s1, self.s0pad, self.s1pad)

        gr = graph_input_anssel(si0, si1, np.array(labels), f0, f1)
        return gr

    def load_data(self, trainf, valf, testf=None):
        """Load the train/val/test splits and build the val/test graphs.

        After the call: self.links and self.gr (only the 'score' list)
        describe the training set; self.linksv/self.grv the validation
        set; self.linkst/self.grt the test set, both None without testf.
        """
        self.trainf = trainf
        self.valf = valf
        self.testf = testf

        if self.vocab is None:
            # XXX: this vocab includes even val,test words!
            self.load_vocab(os.path.dirname(trainf) + '/text_tokenized.txt.gz')

        self.links = self.load_set(trainf)
        from itertools import chain
        self.gr = {'score': list(chain.from_iterable([l[2] for l in self.links]))}
        print('Training set: %d links, %d sentence pairs' % (len(self.links), len(self.gr['score'])))
        self.linksv = self.load_set(valf)
        self.grv = self.links_to_graph(self.linksv)
        if testf is not None:
            self.linkst = self.load_set(testf)
            # The test graph must come from the test links (it used to be
            # built from the validation links by mistake).
            self.grt = self.links_to_graph(self.linkst)
        else:
            self.linkst = None
            # eval() checks self.grt for None, so it has to exist.
            self.grt = None

    def sample_pairs(self, batch_size, once=False):
        """ A generator that produces random pairs from the dataset

        Training links are visited in shuffled order and accumulated
        until they hold at least batch_size pairs; then one graph is
        yielded.  Links left over at the end of a pass are dropped when
        the next pass starts.  With once=True only a single pass is made.
        """
        # random.shuffle needs a mutable sequence; a bare range() is not
        # one on Python 3.
        ids = list(range(len(self.links)))
        while True:
            random.shuffle(ids)
            links_to_yield = []
            n_yielded = 0
            for i in ids:
                link = self.links[i]
                links_to_yield.append(link)
                n_yielded += len(link[1])

                if n_yielded < batch_size:
                    continue

                # we have accumulated enough pairs, produce a graph
                ogr = self.links_to_graph(links_to_yield)
                links_to_yield = []
                n_yielded = 0
                yield ogr
            if once:
                break

    def fit_callbacks(self, weightsf):
        """Return the training callbacks; checkpoints go to weightsf."""
        return [AnsSelCB(self.grv['si0'], self.grv),
                ModelCheckpoint(weightsf, save_best_only=True),
                EarlyStopping(patience=3)]

    def fit_model(self, model, **kwargs):
        """Train model on batches from sample_pairs.

        'batch_size' and 'weightsf' are taken out of kwargs; the rest is
        forwarded to model.fit_generator.
        """
        batch_size = kwargs.pop('batch_size')
        kwargs['callbacks'] = self.fit_callbacks(kwargs.pop('weightsf'))
        return model.fit_generator(self.sample_pairs(batch_size), **kwargs)

    def eval(self, model):
        """Evaluate model on the validation and test graphs.

        Returns a 3-tuple whose first element is always None, followed by
        the ev.eval_ubuntu results for validation and test (None for a
        split whose graph is None).
        """
        res = [None]
        for gr, fname in [(self.grv, self.valf), (self.grt, self.testf)]:
            if gr is None:
                res.append(None)
                continue
            ypred = model.predict(gr)['score'][:,0]
            res.append(ev.eval_ubuntu(ypred, gr['si0'], gr['score'], fname))
        return tuple(res)

    def res_columns(self, mres, pfx=' '):
        """ Produce README-format markdown table row piece summarizing
        important statistics """
        return('%s%.6f |%s%.6f |%s%.6f  |%s%.6f |%s%.6f  |%s%.6f  '
               % (pfx, mres[self.valf]['MRR'],
                  pfx, mres[self.valf]['R10_1'],
                  pfx, mres[self.valf]['R10_5'],
                  pfx, mres[self.testf].get('MRR', np.nan),
                  pfx, mres[self.testf].get('R10_1', np.nan),
                  pfx, mres[self.testf].get('R10_5', np.nan)))
예제 #30
0
            labels.append(int(label))
            i += 1
            if i > MAX_SAMPLES:
                break

    return (s0i, s1i, f0, f1, labels)


if __name__ == "__main__":
    # Usage: [--revocab] dataf dumpf vocabf
    # With --revocab the vocabulary is rebuilt from dataf and pickled to
    # vocabf; otherwise it is loaded from vocabf.
    args = sys.argv[1:]
    revocab = bool(args) and args[0] == '--revocab'
    if revocab:
        args = args[1:]

    dataf, dumpf, vocabf = args

    if revocab:
        vocab = Vocabulary(sentence_gen(dataf), count_thres=2)
        print('%d words' % (len(vocab.word_idx)))
        with open(vocabf, "wb") as f:
            pickle.dump(vocab, f)
    else:
        # NOTE: unpickling executes arbitrary code; only load trusted files.
        with open(vocabf, "rb") as f:
            vocab = pickle.load(f)
        print('%d words' % (len(vocab.word_idx)))

    s0i, s1i, f0, f1, labels = load_set(dataf, vocab)
    with open(dumpf, "wb") as f:
        pickle.dump((s0i, s1i, f0, f1, labels), f)

    # glove = emb.GloVe(N=300)
예제 #31
0
파일: hypev.py 프로젝트: brmson/dataset-sts
    def load_set(self, fname, cache_dir=None, lists=None):
        """Load a hypothesis-evaluation dataset and build the model input.

        Data comes from lists when given, otherwise from fname: paths
        containing '/mc' are read with loader.load_mctest, all others
        with loader.load_hypev plus an optional "<name>_aux.tsv" file
        next to it.  The result is vectorized, turned into a graph with
        graph_input_anssel and passed through self.merge_questions.

        Caching: if cache_dir is set, the result is first looked up in
        "<cache_dir>/<md5 of absolute fname>.p"; if that file cannot be
        read, the dataset is computed and pickled there.

        Args:
            fname: path of the dataset file.
            cache_dir: optional directory holding pickled results.
            lists: optional preloaded (s0, s1, y, qids, xtra, types).

        Returns:
            The tuple (gr, y, vocab), on a cache hit as well as on a miss.

        Raises:
            Whatever reading the aux file raised, but only when
            self.c['aux_r'] or self.c['aux_c'] is set.
        """
        # TODO: Make the cache-handling generic,
        # and offer a way to actually pass cache_dir
        save_cache = False
        if cache_dir:
            import os.path
            from hashlib import md5
            fname_abs = os.path.abspath(fname)
            cache_filename = "%s/%s.p" % (cache_dir, md5(fname_abs.encode("utf-8")).hexdigest())
            try:
                # NOTE: unpickling executes arbitrary code; cache_dir must
                # only ever point at trusted files.
                with open(cache_filename, "rb") as f:
                    cached = pickle.load(f)
                if len(cached) == 5:
                    # File written with the older (s0, s1, y, vocab, gr)
                    # layout; reorder so callers always get (gr, y, vocab).
                    _, _, y, vocab, gr = cached
                    return (gr, y, vocab)
                return cached
            except (IOError, TypeError, KeyError):
                save_cache = True

        if lists is not None:
            s0, s1, y, qids, xtra, types = lists
        else:
            xtra = None
            if '/mc' in fname:
                s0, s1, y, qids, types = loader.load_mctest(fname)
            else:
                s0, s1, y, qids = loader.load_hypev(fname)
                try:
                    dsfile = re.sub(r'\.([^.]*)$', '_aux.tsv', fname)  # train.tsv -> train_aux.tsv
                    with open(dsfile) as f:
                        rows = csv.DictReader(f, delimiter='\t')
                        xtra = loader.load_hypev_xtra(rows)
                        print(dsfile + ' loaded and available')
                except Exception:
                    # The aux file is optional unless the configuration
                    # asks for the aux inputs; then the failure is fatal.
                    if self.c['aux_r'] or self.c['aux_c']:
                        raise
                types = None

        if self.vocab is None:
            vocab = Vocabulary(s0 + s1, prune_N=self.c['embprune'], icase=self.c['embicase'])
        else:
            vocab = self.vocab

        # mcqtypes pruning must happen *after* Vocabulary has been constructed!
        if types is not None:
            mcqtypes = self.c['mcqtypes']
            # One keep/drop decision per row, shared by all parallel lists.
            keep = [t in mcqtypes for t in types]
            s0 = [x for x, k in zip(s0, keep) if k]
            s1 = [x for x, k in zip(s1, keep) if k]
            y = [x for x, k in zip(y, keep) if k]
            qids = [x for x, k in zip(qids, keep) if k]
            print('Retained %d questions, %d hypotheses (%s types)' % (len(set(qids)), len(set([' '.join(s) for s in s0])), mcqtypes))

        si0, sj0 = vocab.vectorize(s0, self.emb, spad=self.s0pad)
        si1, sj1 = vocab.vectorize(s1, self.emb, spad=self.s1pad)
        f0, f1 = nlp.sentence_flags(s0, s1, self.s0pad, self.s1pad)
        gr = graph_input_anssel(si0, si1, sj0, sj1, None, None, y, f0, f1, s0, s1)
        if qids is not None:
            gr['qids'] = qids
        if xtra is not None:
            gr['#'] = xtra['#']
            gr['@'] = xtra['@']
        gr, y = self.merge_questions(gr)
        if save_cache:
            with open(cache_filename, "wb") as f:
                # Store exactly what is returned below, so that a later
                # cache hit has the same shape as a cache miss.
                pickle.dump((gr, y, vocab), f)
                print("save")

        return (gr, y, vocab)
예제 #32
0
파일: asku.py 프로젝트: brmson/dataset-sts
 def load_vocab(self, vocabf):
     """ Read the texts stored in vocabf and derive a Vocabulary from them.

     The vocabulary is configured with the embprune and embicase config
     entries.  Side effects: assigns self.texts and self.vocab.  Returns
     the new vocabulary. """
     loaded_texts = loader.load_askubuntu_texts(vocabf)
     self.texts = loaded_texts
     vocab_options = dict(prune_N=self.c['embprune'], icase=self.c['embicase'])
     self.vocab = Vocabulary(loaded_texts.values(), **vocab_options)
     return self.vocab