def get_imdb(vocab=None, shrink=1, fine_grained=False, char_based=False):
    tmp_path = download_imdb()

    print('read imdb')
    train = read_imdb(tmp_path, 'train',
                      shrink=shrink, fine_grained=fine_grained,
                      char_based=char_based)
    test = read_imdb(tmp_path, 'test',
                     shrink=shrink, fine_grained=fine_grained,
                     char_based=char_based)

    # The reviews are now in memory, so the extracted archive can be removed.
    shutil.rmtree(tmp_path)

    if vocab is None:
        print('construct vocabulary based on frequency')
        vocab = make_vocab(train)

    train = transform_to_array(train, vocab)
    test = transform_to_array(test, vocab)

    return train, test, vocab
def get_other_text_dataset(name, vocab=None, shrink=1,
                           char_based=False, seed=777):
    assert(name in ['TREC', 'stsa.binary', 'stsa.fine',
                    'custrev', 'mpqa', 'rt-polarity', 'subj'])
    datasets = download_other_dataset(name)
    train = read_other_dataset(
        datasets[0], shrink=shrink, char_based=char_based)
    if len(datasets) == 2:
        test = read_other_dataset(
            datasets[1], shrink=shrink, char_based=char_based)
    else:
        # No official test split: shuffle and hold out 10% as the test set.
        numpy.random.seed(seed)
        alldata = numpy.random.permutation(train)
        train = alldata[:-len(alldata) // 10]
        test = alldata[-len(alldata) // 10:]

    if vocab is None:
        print('construct vocabulary based on frequency')
        vocab = make_vocab(train)

    train = transform_to_array(train, vocab)
    test = transform_to_array(test, vocab)

    return train, test, vocab
def get_dbpedia(vocab=None, shrink=1, char_based=False):
    tf = download_dbpedia()

    print('read dbpedia')
    train = read_dbpedia(tf, 'train', shrink=shrink, char_based=char_based)
    test = read_dbpedia(tf, 'test', shrink=shrink, char_based=char_based)

    if vocab is None:
        print('construct vocabulary based on frequency')
        vocab = make_vocab(train)

    train = transform_to_array(train, vocab)
    test = transform_to_array(test, vocab)

    return train, test, vocab
def get_quizbowl(data_dir='data/nn_guesser', split_sentences=True,
                 num_answers=-1, min_answer_freq=-1):
    if not os.path.exists(data_dir):
        os.mkdir(data_dir)

    train_dir = os.path.join(data_dir, 'train.json')
    dev_dir = os.path.join(data_dir, 'dev.json')
    answers_dir = os.path.join(data_dir, 'answers.json')
    existence = [os.path.isfile(x) for x in [train_dir, dev_dir, answers_dir]]
    if all(existence):
        # Reuse the cached JSON dumps when all of them are present.
        with open(train_dir, 'r') as f:
            train = json.loads(f.read())
        with open(dev_dir, 'r') as f:
            dev = json.loads(f.read())
        with open(answers_dir, 'r') as f:
            answers = json.loads(f.read())
    else:
        # Otherwise build the splits from scratch and cache them for reuse.
        train, dev, answers = load_quizbowl(
            split_sentences, num_answers, min_answer_freq)
        with open(train_dir, 'w') as f:
            f.write(json.dumps(train))
        with open(dev_dir, 'w') as f:
            f.write(json.dumps(dev))
        with open(answers_dir, 'w') as f:
            f.write(json.dumps(answers))

    print('# train data: {}'.format(len(train)))
    print('# dev data: {}'.format(len(dev)))
    print('# class: {}'.format(len(answers)))

    vocab_dir = os.path.join(data_dir, 'vocab.json')
    if os.path.isfile(vocab_dir):
        with open(vocab_dir, 'r') as f:
            vocab = json.loads(f.read())
    else:
        vocab = make_vocab(train)
        with open(vocab_dir, 'w') as f:
            f.write(json.dumps(vocab))

    print('# vocab: {}'.format(len(vocab)))

    train = transform_to_array(train, vocab)
    dev = transform_to_array(dev, vocab)

    return train, dev, vocab, answers
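# Usage sketch (added for illustration, not part of the original source):
# assumes `get_quizbowl` and its helpers (`load_quizbowl`, `make_vocab`,
# `transform_to_array`) live in the same module, so calling it directly is
# enough to build or reuse the cached JSON files under `data_dir`.
def example_load_quizbowl():
    train, dev, vocab, answers = get_quizbowl(data_dir='data/nn_guesser')
    # Each example is a (word-id array, label) pair after transform_to_array.
    print('first train example:', train[0])
    print('vocabulary size:', len(vocab))
    print('number of answer classes:', len(answers))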
def convert_into_features_using_vocab(sentences, vocab):
    contents = []
    for doc_id, sent, tokens in sentences:
        features = [token['lemma'] for token in tokens]
        contents.append(features)
    features = transform_to_array(contents, vocab, with_label=False)
    return features
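# Illustrative input for `convert_into_features_using_vocab` (inferred from
# how the function unpacks its arguments, not taken from the original code):
# each element of `sentences` is a (doc_id, sentence_text, tokens) triple and
# each token is a dict carrying at least a 'lemma' key.
def example_convert_into_features(vocab):
    sentences = [
        (0, 'The cats are sleeping.',
         [{'lemma': 'the'}, {'lemma': 'cat'}, {'lemma': 'be'},
          {'lemma': 'sleep'}, {'lemma': '.'}]),
    ]
    # Returns one array of word ids per input sentence, without labels.
    return convert_into_features_using_vocab(sentences, vocab)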
def get_sst(vocab=None, shrink=1, char_based=False):
    sst_dir = os.path.join(DATA_DIR, 'trees')
    if not os.path.exists(sst_dir):
        download_sst()

    print('read sst')
    train = read_sst(sst_dir, 'train', shrink=shrink, char_based=char_based)
    test = read_sst(sst_dir, 'dev', shrink=shrink, char_based=char_based)

    if vocab is None:
        print('construct vocabulary based on frequency')
        vocab = make_vocab(train)

    train = transform_to_array(train, vocab)
    test = transform_to_array(test, vocab)

    return train, test, vocab
def predict_batch(words_batch):
    xs = nlp_utils.transform_to_array(words_batch, vocab, with_label=False)
    xs = nlp_utils.convert_seq(xs, device=device, with_label=False)
    with chainer.using_config('train', False), chainer.no_backprop_mode():
        probs = model.predict(xs, softmax=True)
    answers = model.xp.argmax(probs, axis=1)
    scores = probs[model.xp.arange(answers.size), answers].tolist()
    for words, answer, score in zip(words_batch, answers, scores):
        print('{}\t{:.4f}\t{}'.format(answer, score, ' '.join(words)))
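# Usage sketch (illustrative only): `predict_batch` relies on the
# module-level `model`, `vocab` and `device` objects set up elsewhere in the
# script. It expects a batch of already-tokenized sentences and prints one
# "label<TAB>score<TAB>text" line per input.
def example_predict_batch():
    words_batch = [
        ['this', 'movie', 'was', 'great'],
        ['a', 'boring', 'and', 'predictable', 'plot'],
    ]
    predict_batch(words_batch)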
def get_imdb(vocab=None, shrink=1, fine_grained=False, char_based=False):
    imdb_path = os.path.join(DATA_DIR, 'aclImdb')
    if not os.path.exists(imdb_path):
        download_imdb()

    print('read imdb')
    train = read_imdb(DATA_DIR, 'train',
                      shrink=shrink, fine_grained=fine_grained,
                      char_based=char_based)
    test = read_imdb(DATA_DIR, 'test',
                     shrink=shrink, fine_grained=fine_grained,
                     char_based=char_based)

    if vocab is None:
        print('construct vocabulary based on frequency')
        vocab = make_vocab(train)

    train = transform_to_array(train, vocab)
    test = transform_to_array(test, vocab)

    return train, test, vocab
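# Usage sketch (illustrative, not from the original source): both `get_imdb`
# variants above return the same (train, test, vocab) triple, so a training
# script typically just loads it once and feeds the arrays to an iterator.
def example_load_imdb():
    # shrink=100 keeps only every 100th example, which is handy for quick
    # smoke tests; drop it to load the full dataset.
    train, test, vocab = get_imdb(shrink=100)
    print('# train: {}, # test: {}, vocab: {}'.format(
        len(train), len(test), len(vocab)))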
def predict_fn(input_data, model):
    """
    This function receives a NumPy array and makes a prediction on it using
    the model returned by `model_fn`.

    The default predictor used by `Chainer` serializes input data to the
    'npy' format:
    https://docs.scipy.org/doc/numpy-1.14.0/neps/npy-format.html

    The Chainer container provides an overridable pre-processing function
    `input_fn` that accepts the serialized input data and deserializes it
    into a NumPy array. `input_fn` is invoked before `predict_fn` and passes
    its return value to this function (as `input_data`).

    The Chainer container provides an overridable post-processing function
    `output_fn` that accepts this function's return value and serializes it
    back into `npy` format, which the Chainer predictor can deserialize back
    into a NumPy array on the client.

    Args:
        input_data: a NumPy array containing the data serialized by the
            Chainer predictor
        model: the return value of `model_fn`

    Returns:
        a NumPy array containing predictions which will be returned to the
        client

    For more on `input_fn`, `predict_fn` and `output_fn`, please visit the
    sagemaker-python-sdk repository:
    https://github.com/aws/sagemaker-python-sdk

    For more on the Chainer container, please visit the
    sagemaker-chainer-containers repository:
    https://github.com/aws/sagemaker-chainer-containers
    """
    trained_model, vocab = model
    words_batch = []
    for sentence in input_data.tolist():
        text = normalize_text(sentence)
        words = split_text(text)
        words_batch.append(words)

    xs = transform_to_array(words_batch, vocab, with_label=False)
    xs = convert_seq(xs, with_label=False)

    with chainer.using_config('train', False), chainer.no_backprop_mode():
        probs = trained_model.predict(xs, softmax=True)
    answers = trained_model.xp.argmax(probs, axis=1)
    scores = probs[trained_model.xp.arange(answers.size), answers].tolist()

    output = []
    for words, answer, score in zip(words_batch, answers, scores):
        output.append([' '.join(words), answer, score])

    return np.array(output)
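# Local smoke test for `predict_fn` (an illustration only; in deployment the
# SageMaker Chainer container calls `model_fn` and `input_fn` first). The
# docstring above says `model` is the return value of `model_fn`, and the
# unpacking in `predict_fn` implies that value is a (trained_model, vocab)
# pair; `model_dir` is a hypothetical path to the saved artifacts.
def example_predict_fn(model_dir):
    model = model_fn(model_dir)
    input_data = np.array(['This movie was great!',
                           'A boring and predictable plot.'])
    predictions = predict_fn(input_data, model)
    for text, answer, score in predictions:
        print('{}\t{}\t{}'.format(answer, score, text))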
def run_online(device):
    # predict labels online
    for l in sys.stdin:
        l = l.strip()
        if not l:
            print('# blank line')
            continue
        text = nlp_utils.normalize_text(l)
        words = nlp_utils.split_text(text, char_based=setup['char_based'])
        xs = nlp_utils.transform_to_array([words], vocab, with_label=False)
        xs = nlp_utils.convert_seq(xs, device=device, with_label=False)
        with chainer.using_config('train', False), chainer.no_backprop_mode():
            prob = model.predict(xs, softmax=True)[0]
        answer = int(model.xp.argmax(prob))
        score = float(prob[answer])
        print('{}\t{:.4f}\t{}'.format(answer, score, ' '.join(words)))
def run_online(gpu):
    # predict labels online
    for l in sys.stdin:
        l = l.strip()
        if not l:
            print('# blank line')
            continue
        text = nlp_utils.normalize_text(l)
        words = nlp_utils.split_text(text, char_based=setup['char_based'])
        xs = nlp_utils.transform_to_array([words], vocab, with_label=False)
        xs = nlp_utils.convert_seq(xs, device=gpu, with_label=False)
        with chainer.using_config('train', False), chainer.no_backprop_mode():
            prob = model.predict(xs, softmax=True)[0]
        answer = int(model.xp.argmax(prob))
        score = float(prob[answer])
        print('{}\t{:.4f}\t{}'.format(answer, score, ' '.join(words)))
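# Usage sketch (illustrative only): both `run_online` variants consume
# standard input one sentence per line until EOF, so in practice they are
# driven by piping text into the prediction script. A quick local test can
# temporarily substitute `sys.stdin`, as below; -1 selects the CPU in
# Chainer's device convention.
def example_run_online():
    import io
    original_stdin = sys.stdin
    sys.stdin = io.StringIO('this movie was great\n\na boring plot\n')
    try:
        run_online(-1)  # prints "label<TAB>score<TAB>tokens" per input line
    finally:
        sys.stdin = original_stdin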