Example #1
def get_other_text_dataset(name,
                           vocab=None,
                           shrink=1,
                           char_based=False,
                           seed=777):
    assert (name in [
        'TREC', 'stsa.binary', 'stsa.fine', 'custrev', 'mpqa', 'rt-polarity',
        'subj'
    ])
    datasets = download_other_dataset(name)
    train = read_other_dataset(datasets[0],
                               shrink=shrink,
                               char_based=char_based)
    if len(datasets) == 2:
        test = read_other_dataset(datasets[1],
                                  shrink=shrink,
                                  char_based=char_based)
    else:
        numpy.random.seed(seed)
        alldata = numpy.random.permutation(train)
        train = alldata[:-len(alldata) // 10]
        test = alldata[-len(alldata) // 10:]

    if vocab is None:
        print('construct vocabulary based on frequency')
        vocab = make_vocab(train)

    train = transform_to_array(train, vocab)
    test = transform_to_array(test, vocab)

    return train, test, vocab
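A minimal usage sketch for the loader above (hypothetical call; it assumes the function and its helpers are importable from the same module and only reuses the documented defaults):

# Hypothetical call: if the chosen dataset ships without a separate test file,
# the else branch above carves a 10% test split out of the shuffled training data.
train, test, vocab = get_other_text_dataset('TREC', seed=777)
print(len(train), len(test), len(vocab))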
Example #2
def get_imdb(vocab=None, shrink=1, fine_grained=False, char_based=False):
    tmp_path = download_imdb()

    print('read imdb')
    train = read_imdb(tmp_path,
                      'train',
                      shrink=shrink,
                      fine_grained=fine_grained,
                      char_based=char_based)
    test = read_imdb(tmp_path,
                     'test',
                     shrink=shrink,
                     fine_grained=fine_grained,
                     char_based=char_based)

    shutil.rmtree(tmp_path)

    if vocab is None:
        print('construct vocabulary based on frequency')
        vocab = make_vocab(train)

    train = transform_to_array(train, vocab)
    test = transform_to_array(test, vocab)

    return train, test, vocab
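A short, hypothetical usage sketch for get_imdb (shrink=10 is only an illustration of the subsampling knob; the temporary download directory is removed by the function itself):

# Hypothetical call: binary IMDB sentiment data with word-level tokens;
# the returned vocab can be passed back in to keep indices consistent elsewhere.
train, test, vocab = get_imdb(shrink=10)
print(len(train), len(test), len(vocab))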
Example #3
def get_dbpedia(vocab=None, shrink=1, char_based=False):
    tf = download_dbpedia()

    print('read dbpedia')
    train = read_dbpedia(tf, 'train', shrink=shrink, char_based=char_based)
    test = read_dbpedia(tf, 'test', shrink=shrink, char_based=char_based)

    if vocab is None:
        print('construct vocabulary based on frequency')
        vocab = make_vocab(train)

    train = transform_to_array(train, vocab)
    test = transform_to_array(test, vocab)

    return train, test, vocab
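A hypothetical call to the DBpedia loader above, shown only to illustrate the character-level option:

# Hypothetical call: character-based tokens instead of words; vocab is built
# from the training split because no vocabulary was passed in.
train, test, vocab = get_dbpedia(char_based=True)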
Example #4
def get_quizbowl(data_dir='data/nn_guesser',
                 split_sentences=True,
                 num_answers=-1,
                 min_answer_freq=-1):
    if not os.path.exists(data_dir):
        os.mkdir(data_dir)
    train_dir = os.path.join(data_dir, 'train.json')
    dev_dir = os.path.join(data_dir, 'dev.json')
    answers_dir = os.path.join(data_dir, 'answers.json')
    existance = [os.path.isfile(x) for x in [train_dir, dev_dir, answers_dir]]

    if all(existance):
        with open(train_dir, 'r') as f:
            train = json.loads(f.read())
        with open(dev_dir, 'r') as f:
            dev = json.loads(f.read())
        with open(answers_dir, 'r') as f:
            answers = json.loads(f.read())
    else:
        train, dev, answers = load_quizbowl(split_sentences, num_answers,
                                            min_answer_freq)
        with open(train_dir, 'w') as f:
            f.write(json.dumps(train))
        with open(dev_dir, 'w') as f:
            f.write(json.dumps(dev))
        with open(answers_dir, 'w') as f:
            f.write(json.dumps(answers))

    print('# train data: {}'.format(len(train)))
    print('# dev data: {}'.format(len(dev)))
    print('# class: {}'.format(len(answers)))

    vocab_dir = os.path.join(data_dir, 'vocab.json')
    if os.path.isfile(vocab_dir):
        with open(vocab_dir, 'r') as f:
            vocab = json.loads(f.read())
    else:
        vocab = make_vocab(train)
        with open(vocab_dir, 'w') as f:
            f.write(json.dumps(vocab))

    print('# vocab: {}'.format(len(vocab)))

    train = transform_to_array(train, vocab)
    dev = transform_to_array(dev, vocab)

    return train, dev, vocab, answers
Example #5
def get_sst(vocab=None, shrink=1, char_based=False):
    sst_dir = os.path.join(DATA_DIR, 'trees')
    if not os.path.exists(sst_dir):
        download_sst()

    print('read sst')
    train = read_sst(sst_dir, 'train', shrink=shrink, char_based=char_based)
    test = read_sst(sst_dir, 'dev', shrink=shrink, char_based=char_based)

    if vocab is None:
        print('construct vocabulary based on frequency')
        vocab = make_vocab(train)

    train = transform_to_array(train, vocab)
    test = transform_to_array(test, vocab)

    return train, test, vocab
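A minimal sketch of calling the SST loader above (hypothetical; note that the function returns the 'dev' split in the test position):

# Hypothetical call: trees are read from DATA_DIR/trees, downloading them first
# if missing; a second call can reuse the vocabulary built by the first.
train, test, vocab = get_sst()
train2, test2, _ = get_sst(vocab=vocab)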
Example #6
File: dataset.py  Project: Pinafore/qb
def get_quizbowl(data_dir='data/nn_guesser', split_sentences=True,
        num_answers=-1, min_answer_freq=-1):
    if not os.path.exists(data_dir):
        os.mkdir(data_dir)
    train_dir = os.path.join(data_dir, 'train.json')
    dev_dir = os.path.join(data_dir, 'dev.json')
    answers_dir = os.path.join(data_dir, 'answers.json')
    existance = [os.path.isfile(x) for x in [train_dir, dev_dir, answers_dir]]

    if all(existance):
        with open(train_dir, 'r') as f:
            train = json.loads(f.read())
        with open(dev_dir, 'r') as f:
            dev = json.loads(f.read())
        with open(answers_dir, 'r') as f:
            answers = json.loads(f.read())
    else:
        train, dev, answers = load_quizbowl(
                split_sentences, num_answers, min_answer_freq)
        with open(train_dir, 'w') as f:
            f.write(json.dumps(train))
        with open(dev_dir, 'w') as f:
            f.write(json.dumps(dev))
        with open(answers_dir, 'w') as f:
            f.write(json.dumps(answers))

    print('# train data: {}'.format(len(train)))
    print('# dev data: {}'.format(len(dev)))
    print('# class: {}'.format(len(answers)))

    vocab_dir = os.path.join(data_dir, 'vocab.json')
    if os.path.isfile(vocab_dir):
        with open(vocab_dir, 'r') as f:
            vocab = json.loads(f.read())
    else:
        vocab = make_vocab(train)
        with open(vocab_dir, 'w') as f:
            f.write(json.dumps(vocab))

    print('# vocab: {}'.format(len(vocab)))

    train = transform_to_array(train, vocab)
    dev = transform_to_array(dev, vocab)

    return train, dev, vocab, answers
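A hypothetical call to the Quiz Bowl loader above; the cache layout is exactly the one the function itself writes under data_dir:

# Hypothetical call: the first run writes train/dev/answers/vocab JSON caches
# into data/nn_guesser, and later runs load those files instead of re-building.
train, dev, vocab, answers = get_quizbowl(split_sentences=True)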
Example #7
def get_snli(vocab=None, shrink=1, char_based=False, combine=False):
    # combine is forwarded to transform_snli_to_array for both splits below
    path = download_snli()

    print('read snli')
    path = 'snli_1.0'
    train = read_snli(path, 'train', shrink=shrink, char_based=char_based)
    test = read_snli(path, 'dev', shrink=shrink, char_based=char_based)

    if vocab is None:
        print('construct vocabulary based on frequency')
        train_premise = [(x, z) for x, y, z in train]
        train_hypothesis = [(y, z) for x, y, z in train]
        train_all = train_premise + train_hypothesis
        vocab = make_vocab(train_all)

    train = transform_snli_to_array(train, vocab, combine=combine)
    test = transform_snli_to_array(test, vocab, combine=combine)

    return train, test, vocab
Example #8
def get_snli(vocab=None, shrink=1, char_based=False):
    snli_dir = os.path.join(DATA_DIR, 'snli_1.0')
    if not os.path.exists(snli_dir):
        download_snli()

    print('read snli')
    train = read_snli(snli_dir, 'train', shrink=shrink, char_based=char_based)
    test = read_snli(snli_dir, 'dev', shrink=shrink, char_based=char_based)

    if vocab is None:
        print('construct vocabulary based on frequency')
        train_premise = [(x, z) for x, y, z in train]
        train_hypothesis = [(y, z) for x, y, z in train]
        vocab = make_vocab(train_premise + train_hypothesis)

    train = transform_snli_to_array(train, vocab)
    test = transform_snli_to_array(test, vocab)

    return train, test, vocab
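A short usage sketch for the SNLI loaders above (hypothetical call; the vocabulary is built jointly over premises and hypotheses, as in the function bodies):

# Hypothetical call: premise/hypothesis pairs with word-level tokens.
train, test, vocab = get_snli(shrink=100)
print(len(train), len(test), len(vocab))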
Example #9
def get_imdb(vocab=None, shrink=1, fine_grained=False,
             char_based=False):
    tmp_path = download_imdb()

    print('read imdb')
    train = read_imdb(tmp_path, 'train',
                      shrink=shrink, fine_grained=fine_grained,
                      char_based=char_based)
    test = read_imdb(tmp_path, 'test',
                     shrink=shrink, fine_grained=fine_grained,
                     char_based=char_based)

    shutil.rmtree(tmp_path)

    if vocab is None:
        print('construct vocabulary based on frequency')
        vocab = make_vocab(train)

    train = transform_to_array(train, vocab)
    test = transform_to_array(test, vocab)

    return train, test, vocab
Example #10
def get_imdb(vocab=None, shrink=1, fine_grained=False,
             char_based=False):
    imdb_path = os.path.join(DATA_DIR, 'aclImdb')
    if not os.path.exists(imdb_path):
        download_imdb()

    print('read imdb')
    train = read_imdb(DATA_DIR, 'train',
                      shrink=shrink, fine_grained=fine_grained,
                      char_based=char_based)
    test = read_imdb(DATA_DIR, 'test',
                     shrink=shrink, fine_grained=fine_grained,
                     char_based=char_based)

    if vocab is None:
        print('construct vocabulary based on frequency')
        vocab = make_vocab(train)

    train = transform_to_array(train, vocab)
    test = transform_to_array(test, vocab)

    return train, test, vocab
Example #11
def get_other_text_dataset(name, vocab=None, shrink=1,
                           char_based=False, seed=777):
    assert(name in ['TREC', 'stsa.binary', 'stsa.fine',
                    'custrev', 'mpqa', 'rt-polarity', 'subj'])
    datasets = download_other_dataset(name)
    train = read_other_dataset(
        datasets[0], shrink=shrink, char_based=char_based)
    if len(datasets) == 2:
        test = read_other_dataset(
            datasets[1], shrink=shrink, char_based=char_based)
    else:
        numpy.random.seed(seed)
        alldata = numpy.random.permutation(train)
        train = alldata[:-len(alldata) // 10]
        test = alldata[-len(alldata) // 10:]

    if vocab is None:
        print('construct vocabulary based on frequency')
        vocab = make_vocab(train)

    train = transform_to_array(train, vocab)
    test = transform_to_array(test, vocab)

    return train, test, vocab