def run_online(device):
    # predict labels online

    for l in sys.stdin:
        l = l.strip()
        l_lst = l.split('\t')
        if not l or len(l_lst) < 2:
            print('# blank line')
            continue

        text1 = nlp_utils.normalize_text(l_lst[0])
        text2 = nlp_utils.normalize_text(l_lst[1])

        words1 = nlp_utils.split_text(text1, char_based=setup['char_based'])
        words2 = nlp_utils.split_text(text2, char_based=setup['char_based'])

        xs = nlp_utils.transform_to_array2([[words1, words2]],
                                           vocab,
                                           with_label=False)
        xs = nlp_utils.convert_seq2(xs, device=device, with_label=False)  # pairwise converter; yields {'xs1': ..., 'xs2': ...}

        with chainer.using_config('train', False), chainer.no_backprop_mode():
            prob = model.predict(xs['xs1'], xs['xs2'], softmax=True)[0]
        answer = int(model.xp.argmax(prob))
        score = float(prob[answer])
        print('{}\t{:.4f}\t{}\t{}'.format(answer, score, ' '.join(words1),
                                          ' '.join(words2)))
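
A minimal, self-contained illustration of the stdin format this pairwise run_online expects: two tab-separated sentences per line (the sentences here are made up). Lines that are blank or carry fewer than two fields are reported as '# blank line' and skipped.

line = 'A man is eating.\tA person eats food.'
parts = line.strip().split('\t')
assert len(parts) == 2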
Example #2
def run_batch(device, batchsize=64):
    # predict labels by batch

    def predict_batch(words_batch):
        xs = nlp_utils.transform_to_array(words_batch, vocab, with_label=False)
        xs = nlp_utils.convert_seq(xs, device=device, with_label=False)
        with chainer.using_config('train', False), chainer.no_backprop_mode():
            probs = model.predict(xs, softmax=True)
        answers = model.xp.argmax(probs, axis=1)
        scores = probs[model.xp.arange(answers.size), answers].tolist()
        for words, answer, score in zip(words_batch, answers, scores):
            print('{}\t{:.4f}\t{}'.format(answer, score, ' '.join(words)))

    batch = []
    for l in sys.stdin:
        l = l.strip()
        if not l:
            if batch:
                predict_batch(batch)
                batch = []
            print('# blank line')
            continue
        text = nlp_utils.normalize_text(l)
        words = nlp_utils.split_text(text, char_based=setup['char_based'])
        batch.append(words)
        if len(batch) >= batchsize:
            predict_batch(batch)
            batch = []
    if batch:
        predict_batch(batch)
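
A design note on the loop above: the buffer is flushed in three places, so no input is lost. A blank line forces an early flush, letting the caller group related inputs and see their results before EOF; a full buffer flushes once batchsize lines have accumulated; and the trailing if batch: handles a final partial batch.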
Example #3
def run_batch(device, batchsize=64):
    # predict labels by batch

    def predict_batch(words_batch):
        xs = nlp_utils.transform_to_array(words_batch, vocab, with_label=False)
        xs = nlp_utils.convert_seq(xs, device=device, with_label=False)
        with chainer.using_config('train', False), chainer.no_backprop_mode():
            probs = model.predict(xs, softmax=True)
        answers = model.xp.argmax(probs, axis=1)
        scores = probs[model.xp.arange(answers.size), answers].tolist()
        for words, answer, score in zip(words_batch, answers, scores):
            print('{}\t{:.4f}\t{}'.format(answer, score, ' '.join(words)))

    batch = []
    print('Enter inputs for Batch Predictions')
    for l in sys.stdin:
        l = l.strip()
        if not l:
            if batch:
                predict_batch(batch)
                batch = []
            print('# blank line')
            continue
        text = nlp_utils.normalize_text(l)
        words = nlp_utils.split_text(text, char_based=setup['char_based'])
        batch.append(words)
        if len(batch) >= batchsize:
            predict_batch(batch)
            batch = []
    if batch:
        predict_batch(batch)
Example #4
def read_snli(path, split, shrink=1, char_based=False):
    path = os.path.join(path, 'snli_1.0_{}.jsonl'.format(split))
    dataset = []
    labels = {'entailment': 0, 'neutral': 1, 'contradiction': 2}
    with open(path) as f:
        for i, x in enumerate(f):
            if i % shrink != 0:
                continue
            x = json.loads(x)
            if x['gold_label'] in labels:
                label = labels[x['gold_label']]
            else:
                label = labels[most_common(x['annotator_labels'])]
            premise = split_text(normalize_text(x['sentence1']), char_based)
            hypothesis = split_text(normalize_text(x['sentence2']), char_based)
            dataset.append((premise, hypothesis, label))
    return dataset
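
A self-contained illustration of the JSONL records read_snli parses. The record below is hand-written in SNLI's field layout, not taken from the corpus; when gold_label carries no consensus value, the function falls back to the most common entry in annotator_labels.

import json

sample = ('{"gold_label": "entailment",'
          ' "annotator_labels": ["entailment", "entailment", "neutral"],'
          ' "sentence1": "A man is eating.",'
          ' "sentence2": "A person eats food."}')
record = json.loads(sample)
labels = {'entailment': 0, 'neutral': 1, 'contradiction': 2}
assert labels[record['gold_label']] == 0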
Example #5
def read_dbpedia(tf, split, shrink=1, char_based=False):
    dataset = []
    f = tf.extractfile('dbpedia_csv/{}.csv'.format(split))
    # extractfile returns a binary stream on Python 3; wrap it for csv.reader
    f = io.TextIOWrapper(f, encoding='utf-8')
    for i, (label, title, text) in enumerate(csv.reader(f)):
        if i % shrink != 0:
            continue
        label = int(label) - 1  # Index begins from 1
        tokens = split_text(normalize_text(text), char_based)
        dataset.append((tokens, label))
    return dataset
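
A usage sketch for the tar-based reader, assuming a DBpedia archive that contains dbpedia_csv/train.csv and dbpedia_csv/test.csv (the path the function hard-codes); the archive file name below is illustrative.

import tarfile

with tarfile.open('dbpedia_csv.tar.gz', 'r:gz') as tf:
    train = read_dbpedia(tf, 'train', shrink=100)  # keep every 100th row
    test = read_dbpedia(tf, 'test')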
Example #7
def read_other_dataset(path, shrink=1, char_based=False):
    dataset = []
    with io.open(path, encoding='utf-8', errors='ignore') as f:
        for i, l in enumerate(f):
            if i % shrink != 0 or len(l.strip()) < 3:
                continue
            label, text = l.strip().split(None, 1)
            label = int(label)
            tokens = split_text(normalize_text(text), char_based)
            dataset.append((tokens, label))
    return dataset
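
A self-contained sketch of the plain-text format read_other_dataset expects: one example per line, an integer label, whitespace, then the text (lines shorter than three characters are skipped). The sample lines are made up, and split_text/normalize_text are assumed importable as in the snippet above.

import os
import tempfile

sample = '0\tthis movie was a waste of time\n1\ta quietly moving portrait of grief\n'
with tempfile.NamedTemporaryFile('w', suffix='.tsv', delete=False) as tmp:
    tmp.write(sample)
dataset = read_other_dataset(tmp.name)  # -> [(tokens, 0), (tokens, 1)]
os.remove(tmp.name)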
def read_dbpedia(dbpedia_dir, split, shrink=1, char_based=False):
    dataset = []
    f = open(os.path.join(dbpedia_dir, '{}.csv'.format(split)))
    for i, (label, title, text) in enumerate(csv.reader(f)):
        if i % shrink != 0:
            continue
        label = int(label) - 1  # Index begins from 1
        tokens = split_text(normalize_text(text), char_based)
        dataset.append((tokens, label))
    f.close()
    return dataset
def read_sst(sst_dir, split, shrink=1, char_based=False):
    dataset = []
    f = open(os.path.join(sst_dir, '{}.txt'.format(split)))
    for i, line in enumerate(f.readlines()):
        if i % shrink != 0:
            continue
        tree = Tree.fromstring(line)
        tokens = ' '.join(tree.leaves())
        tokens = split_text(normalize_text(tokens), char_based)
        label = int(tree.label())
        dataset.append((tokens, label))
    f.close()
    return dataset
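
read_sst assumes PTB-style trees as distributed with the Stanford Sentiment Treebank, parsed with NLTK's Tree (i.e. from nltk.tree import Tree at module level). A minimal illustration of the line format, with a made-up sentence:

from nltk.tree import Tree

line = '(3 (2 It) (4 (2 is) (4 good)))'
tree = Tree.fromstring(line)
print(tree.label())   # '3' -- the sentence-level sentiment label
print(tree.leaves())  # ['It', 'is', 'good']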
Example #10
def predict_fn(input_data, model):
    """
    This function receives a NumPy array and makes a prediction on it using the model returned
    by `model_fn`.
    
    The default predictor used by `Chainer` serializes input data to the 'npy' format:
    https://docs.scipy.org/doc/numpy-1.14.0/neps/npy-format.html

    The Chainer container provides an overridable pre-processing function `input_fn`
    that accepts the serialized input data and deserializes it into a NumPy array.
    `input_fn` is invoked before `predict_fn` and passes its return value to this function
    (as `input_data`).
    
    The Chainer container provides an overridable post-processing function `output_fn`
    that accepts this function's return value and serializes it back into `npy` format, which
    the Chainer predictor can deserialize back into a NumPy array on the client.

    Args:
        input_data: a numpy array containing the data serialized by the Chainer predictor
        model: the return value of `model_fn`
    Returns:
        a NumPy array containing predictions which will be returned to the client


    For more on `input_fn`, `predict_fn` and `output_fn`, please visit the sagemaker-python-sdk repository:
    https://github.com/aws/sagemaker-python-sdk

    For more on the Chainer container, please visit the sagemaker-chainer-containers repository:
    https://github.com/aws/sagemaker-chainer-containers
    """
    trained_model, vocab = model

    words_batch = []
    for sentence in input_data.tolist():
        text = normalize_text(sentence)
        words = split_text(text)
        words_batch.append(words)

    xs = transform_to_array(words_batch, vocab, with_label=False)
    xs = convert_seq(xs, with_label=False)

    with chainer.using_config('train', False), chainer.no_backprop_mode():
        probs = trained_model.predict(xs, softmax=True)
    answers = trained_model.xp.argmax(probs, axis=1)
    scores = probs[trained_model.xp.arange(answers.size), answers].tolist()

    output = []
    for words, answer, score in zip(words_batch, answers, scores):
        output.append([' '.join(words), answer, score])

    return np.array(output)
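A local test sketch for predict_fn. model_fn and the artifact directory follow the SageMaker Chainer convention referenced in the docstring; the path and sentences below are illustrative.

import numpy as np

model = model_fn('/opt/ml/model')            # assumed artifact directory
input_data = np.array(['This film was great.', 'Utterly disappointing.'])
predictions = predict_fn(input_data, model)  # rows of [tokens, label, score]
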
def run_batch(device, batchsize=64):
    # predict labels by batch

    def predict_batch(words_batch):
        xs = nlp_utils.transform_to_array2(words_batch,
                                           vocab,
                                           with_label=False)
        xs = nlp_utils.convert_seq2(xs, device=device, with_label=False)
        with chainer.using_config('train', False), chainer.no_backprop_mode():
            probs = model.predict(xs['xs1'], xs['xs2'], softmax=True)
        answers = model.xp.argmax(probs, axis=1)
        scores = probs[model.xp.arange(answers.size), answers].tolist()
        for words, answer, score in zip(words_batch, answers, scores):
            print('{}\t{:.4f}\t{}'.format(answer, score, ' '.join(words)))

    batch = []
    for l in sys.stdin:
        l = l.strip()
        l_lst = l.split('\t')
        if not l or len(l_lst) < 2:
            if batch:
                predict_batch(batch)
                batch = []
            print('# blank line')
            continue
        text1 = nlp_utils.normalize_text(l_lst[0])
        text2 = nlp_utils.normalize_text(l_lst[1])

        words1 = nlp_utils.split_text(text1, char_based=setup['char_based'])
        words2 = nlp_utils.split_text(text2, char_based=setup['char_based'])
        batch.append((words1, words2))
        if len(batch) >= batchsize:
            predict_batch(batch)
            batch = []
    if batch:
        predict_batch(batch)
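
This pairwise run_batch mirrors the pairwise run_online at the top of the page: it buffers (words1, words2) tuples and, as in the single-sentence variant, flushes on a blank or malformed line, when batchsize is reached, and at EOF.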
Example #13
def run_online(device):
    # predict labels online
    for l in sys.stdin:
        l = l.strip()
        if not l:
            print('# blank line')
            continue
        text = nlp_utils.normalize_text(l)
        words = nlp_utils.split_text(text, char_based=setup['char_based'])
        xs = nlp_utils.transform_to_array([words], vocab, with_label=False)
        xs = nlp_utils.convert_seq(xs, device=device, with_label=False)
        with chainer.using_config('train', False), chainer.no_backprop_mode():
            prob = model.predict(xs, softmax=True)[0]
        answer = int(model.xp.argmax(prob))
        score = float(prob[answer])
        print('{}\t{:.4f}\t{}'.format(answer, score, ' '.join(words)))
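
A note on the single-sentence variant above: transform_to_array is given a one-element batch ([words]), so predict(...)[0] yields the probability vector for that single input; argmax and the score lookup then operate on a plain vector rather than a batch.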
Example #14
def read_and_label(posneg, label):
    dataset = []
    target = os.path.join(path, 'aclImdb', split, posneg, '*')
    for i, f_path in enumerate(glob.glob(target)):
        if i % shrink != 0:
            continue
        with io.open(f_path, encoding='utf-8', errors='ignore') as f:
            text = f.read().strip()
        tokens = split_text(normalize_text(text), char_based)
        if fine_grained:
            # extract the score from f_path, e.g. .../pos/200_8.txt -> 8
            label = fg_label_dict[f_path.split('_')[-1][:-4]]
        dataset.append((tokens, label))
    return dataset
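
read_and_label is a closure: path, split, shrink, char_based, fine_grained, and fg_label_dict all come from an enclosing reader that this excerpt omits. A sketch of what that enclosing function might look like; the name read_imdb and the fine-grained label mapping are illustrative, not taken from the excerpt.

def read_imdb(path, split, shrink=1, fine_grained=False, char_based=False):
    # hypothetical mapping from the rating in the file name to a class index
    fg_label_dict = {'1': 0, '2': 0, '3': 1, '4': 1,
                     '7': 2, '8': 2, '9': 3, '10': 3}

    def read_and_label(posneg, label):
        ...  # body exactly as shown above

    pos_dataset = read_and_label('pos', 0)
    neg_dataset = read_and_label('neg', 1)
    return pos_dataset + neg_dataset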
Example #15
def run_online(gpu):
    # predict labels online
    for l in sys.stdin:
        l = l.strip()
        if not l:
            print('# blank line')
            continue
        text = nlp_utils.normalize_text(l)
        words = nlp_utils.split_text(text, char_based=setup['char_based'])
        xs = nlp_utils.transform_to_array([words], vocab, with_label=False)
        xs = nlp_utils.convert_seq(xs, device=gpu, with_label=False)
        with chainer.using_config('train', False), chainer.no_backprop_mode():
            prob = model.predict(xs, softmax=True)[0]
        answer = int(model.xp.argmax(prob))
        score = float(prob[answer])
        print('{}\t{:.4f}\t{}'.format(answer, score, ' '.join(words)))