import json
import os

import numpy


def get_other_text_dataset(name, vocab=None, shrink=1,
                           char_based=False, seed=777):
    assert(name in ['TREC', 'stsa.binary', 'stsa.fine',
                    'custrev', 'mpqa', 'rt-polarity', 'subj'])
    datasets = download_other_dataset(name)
    train = read_other_dataset(
        datasets[0], shrink=shrink, char_based=char_based)
    if len(datasets) == 2:
        test = read_other_dataset(
            datasets[1], shrink=shrink, char_based=char_based)
    else:
        # No separate test split is shipped: hold out a random 10% as test.
        numpy.random.seed(seed)
        alldata = numpy.random.permutation(train)
        train = alldata[:-len(alldata) // 10]
        test = alldata[-len(alldata) // 10:]

    if vocab is None:
        print('construct vocabulary based on frequency')
        vocab = make_vocab(train)

    train = transform_to_array(train, vocab)
    test = transform_to_array(test, vocab)

    return train, test, vocab
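# Usage sketch (illustrative only): any dataset name accepted by the assert
# above can be passed in, e.g.
#
#     train, test, vocab = get_other_text_dataset('TREC')
#
# where train/test are the id-transformed examples and vocab maps tokens to
# ids, assuming make_vocab and transform_to_array behave as their names imply.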
def get_dbpedia(vocab=None, shrink=1, char_based=False):
    tf = download_dbpedia()

    print('read dbpedia')
    train = read_dbpedia(tf, 'train', shrink=shrink, char_based=char_based)
    test = read_dbpedia(tf, 'test', shrink=shrink, char_based=char_based)

    if vocab is None:
        print('construct vocabulary based on frequency')
        vocab = make_vocab(train)

    train = transform_to_array(train, vocab)
    test = transform_to_array(test, vocab)

    return train, test, vocab
def get_quizbowl(data_dir='data/nn_guesser', split_sentences=True,
                 num_answers=-1, min_answer_freq=-1):
    if not os.path.exists(data_dir):
        os.mkdir(data_dir)

    train_dir = os.path.join(data_dir, 'train.json')
    dev_dir = os.path.join(data_dir, 'dev.json')
    answers_dir = os.path.join(data_dir, 'answers.json')
    existence = [os.path.isfile(x) for x in [train_dir, dev_dir, answers_dir]]
    if all(existence):
        # Reuse splits that were already cached on disk.
        with open(train_dir, 'r') as f:
            train = json.loads(f.read())
        with open(dev_dir, 'r') as f:
            dev = json.loads(f.read())
        with open(answers_dir, 'r') as f:
            answers = json.loads(f.read())
    else:
        # Build the splits once and cache them for subsequent runs.
        train, dev, answers = load_quizbowl(
            split_sentences, num_answers, min_answer_freq)
        with open(train_dir, 'w') as f:
            f.write(json.dumps(train))
        with open(dev_dir, 'w') as f:
            f.write(json.dumps(dev))
        with open(answers_dir, 'w') as f:
            f.write(json.dumps(answers))

    print('# train data: {}'.format(len(train)))
    print('# dev data: {}'.format(len(dev)))
    print('# class: {}'.format(len(answers)))

    vocab_dir = os.path.join(data_dir, 'vocab.json')
    if os.path.isfile(vocab_dir):
        with open(vocab_dir, 'r') as f:
            vocab = json.loads(f.read())
    else:
        vocab = make_vocab(train)
        with open(vocab_dir, 'w') as f:
            f.write(json.dumps(vocab))
    print('# vocab: {}'.format(len(vocab)))

    train = transform_to_array(train, vocab)
    dev = transform_to_array(dev, vocab)

    return train, dev, vocab, answers
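# Usage sketch (illustrative only): unlike the other loaders, get_quizbowl
# returns the answer set as a fourth value, e.g.
#
#     train, dev, vocab, answers = get_quizbowl(data_dir='data/nn_guesser')
#     print(len(train), len(dev), len(vocab), len(answers))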
def get_sst(vocab=None, shrink=1, char_based=False):
    sst_dir = os.path.join(DATA_DIR, 'trees')
    if not os.path.exists(sst_dir):
        download_sst()

    print('read sst')
    train = read_sst(sst_dir, 'train', shrink=shrink, char_based=char_based)
    test = read_sst(sst_dir, 'dev', shrink=shrink, char_based=char_based)

    if vocab is None:
        print('construct vocabulary based on frequency')
        vocab = make_vocab(train)

    train = transform_to_array(train, vocab)
    test = transform_to_array(test, vocab)

    return train, test, vocab
def get_snli(vocab=None, shrink=1, char_based=False):
    snli_dir = os.path.join(DATA_DIR, 'snli_1.0')
    if not os.path.exists(snli_dir):
        download_snli()

    print('read snli')
    train = read_snli(snli_dir, 'train', shrink=shrink, char_based=char_based)
    test = read_snli(snli_dir, 'dev', shrink=shrink, char_based=char_based)

    if vocab is None:
        print('construct vocabulary based on frequency')
        # Build the vocabulary from both premises and hypotheses.
        train_premise = [(x, z) for x, y, z in train]
        train_hypothesis = [(y, z) for x, y, z in train]
        vocab = make_vocab(train_premise + train_hypothesis)

    train = transform_snli_to_array(train, vocab)
    test = transform_snli_to_array(test, vocab)

    return train, test, vocab
def get_imdb(vocab=None, shrink=1, fine_grained=False, char_based=False):
    imdb_path = os.path.join(DATA_DIR, 'aclImdb')
    if not os.path.exists(imdb_path):
        download_imdb()

    print('read imdb')
    train = read_imdb(DATA_DIR, 'train',
                      shrink=shrink, fine_grained=fine_grained,
                      char_based=char_based)
    test = read_imdb(DATA_DIR, 'test',
                     shrink=shrink, fine_grained=fine_grained,
                     char_based=char_based)

    if vocab is None:
        print('construct vocabulary based on frequency')
        vocab = make_vocab(train)

    train = transform_to_array(train, vocab)
    test = transform_to_array(test, vocab)

    return train, test, vocab
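# Minimal smoke test (a sketch under the assumption that the dataset can be
# downloaded in the current environment and that the transformed splits
# support len(); the choice of get_dbpedia is illustrative only).
if __name__ == '__main__':
    train, test, vocab = get_dbpedia()
    print('# train: {}, # test: {}, # vocab: {}'.format(
        len(train), len(test), len(vocab)))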