Example #1
def load_benchmark(data, vocab, extend_with=0):

    # Decode data: handle UTF-8 byte strings as well as already-decoded text
    texts = [x.decode('utf-8') if isinstance(x, bytes) else x
             for x in data['text']]

    # Extract labels
    labels = [x for x in data['sent']]

    batch_size, maxlen = calculate_batchsize_maxlen(texts)

    st = SentenceTokenizer(vocab, maxlen)

    # Split up dataset. Extend the existing vocabulary with up to extend_with
    # tokens from the training dataset.
    texts, labels, added = st.split_train_val_test(texts,
                                                   labels,
                                                   extend_with=extend_with)
    return {
        'texts': texts,
        'labels': labels,
        'added': added,
        'batch_size': batch_size,
        'maxlen': maxlen
    }
Example #2
def load_benchmark(path, vocab, extend_with=0):
    """ Loads the given benchmark dataset.

        Tokenizes the texts using the provided vocabulary, extending it with
        words from the training dataset if extend_with > 0. Splits them into
        three lists: training, validation and testing (in that order).

        Also calculates the maximum length of the texts and the
        suggested batch_size.

    # Arguments:
        path: Path to the dataset to be loaded.
        vocab: Vocabulary to be used for tokenizing texts.
        extend_with: If > 0, the vocabulary will be extended with up to
            extend_with tokens from the training set before tokenizing.

    # Returns:
        A dictionary with the following fields:
            texts: List of three lists, containing tokenized inputs for
                training, validation and testing (in that order).
            labels: List of three lists, containing labels for training,
                validation and testing (in that order).
            added: Number of tokens added to the vocabulary.
            batch_size: Batch size.
            maxlen: Maximum length of an input.
    """
    # Pre-processing dataset
    with open(path, 'rb') as dataset:
        data = pickle.load(dataset)

    # Decode data
    try:
        texts = [unicode(x) for x in data['texts']]
    except UnicodeDecodeError:
        texts = [x.decode('utf-8') for x in data['texts']]

    # Extract labels
    labels = [x['label'] for x in data['info']]

    batch_size, maxlen = calculate_batchsize_maxlen(texts)

    st = SentenceTokenizer(vocab, maxlen)

    # Split up dataset. Extend the existing vocabulary with up to extend_with
    # tokens from the training dataset.
    texts, labels, added = st.split_train_val_test(
        texts,
        labels, [data['train_ind'], data['val_ind'], data['test_ind']],
        extend_with=extend_with)
    return {
        'texts': texts,
        'labels': labels,
        'added': added,
        'batch_size': batch_size,
        'maxlen': maxlen
    }
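A minimal usage sketch for the load_benchmark variant above; the file paths and the extend_with value are placeholders, not taken from the examples:

import json

with open('vocabulary.json', 'r') as f:
    vocab = json.load(f)

data = load_benchmark('benchmark_dataset.pickle', vocab, extend_with=10000)

# 'texts' and 'labels' each hold three lists: train, validation, test
train_X, val_X, test_X = data['texts']
train_y, val_y, test_y = data['labels']
print(data['added'], data['batch_size'], data['maxlen'])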
Example #3
def print_examples_sentences(filename: str = "example.txt",
                             output_filename: str = "output.txt"):
    examples = read_examples(filename)

    with open(output_filename, 'w') as f:
        for example in examples:
            sentences = SentenceTokenizer().tokenize(example)

            for sentence in sentences:
                f.write(sentence + "\n")

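read_examples is not defined in this example; one possible minimal implementation, assuming the input file holds one example per non-empty line (this helper is hypothetical and shown only for illustration):

def read_examples(filename: str):
    # Hypothetical helper: treats each non-empty line of the file as one example.
    with open(filename, 'r') as f:
        return [line.strip() for line in f if line.strip()]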
Example #4
def load_model_1():
    # load the pre-trained Keras model (here we are using a model
    # pre-trained on ImageNet and provided by Keras, but you can
    # substitute in your own networks just as easily)
    global model
    #model = ResNet50(weights="imagenet")
    """
	.h5 created externally and sent here. The attentionlayer from .py 
	"""
    model = load_model(
        'emoji.h5',
        custom_objects={'AttentionWeightedAverage': AttentionWeightedAverage},
        compile=True)

    global st
    with open("vocabulary.json", 'r') as f:
        vocabulary = json.load(f)
    st = SentenceTokenizer(vocabulary, 30)
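A short prediction sketch using the globals initialized above; it follows the tokenize-then-predict pattern from Example #5, and the helper name is illustrative only:

def predict_emoji_probs(sentence):
    # Tokenize with the global SentenceTokenizer, then run the Keras model;
    # returns the probability vector over the emoji classes for this sentence.
    tokenized, _, _ = st.tokenize_sentences([sentence])
    return model.predict(tokenized)[0]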
Example #5
from sentence_tokenizer import SentenceTokenizer
# deepmoji_emojis is used below but was not imported in the original snippet;
# it is assumed to come from DeepMoji's model_def module (or a local copy).
from deepmoji.model_def import deepmoji_emojis
import json
import numpy as np


def top_elements(array, k):
    ind = np.argpartition(array, -k)[-k:]
    return ind[np.argsort(array[ind])][::-1]
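# Example: top_elements(np.array([0.1, 0.5, 0.2, 0.9]), 2) -> array([3, 1]),
# i.e. the indices of the k largest scores, highest first (toy values).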


maxlen = 50
batch_size = 32

vocabulary = json.load(open('servers/deepmoji/data/vocabulary.json', 'r'))

st = SentenceTokenizer(vocabulary, maxlen)
model = deepmoji_emojis(maxlen, 'servers/deepmoji/data/deepmoji_weights.hdf5')

#print('Ready')
while True:
    sentence = input()
    tokenized, _, _ = st.tokenize_sentences([sentence])
    prob = model.predict(tokenized)[0]
    scores = []
    """
    t_token = tokenized[0]
    t_score = [sentence]
    t_prob = prob[0]
    ind_top = top_elements(t_prob, 5)
    t_score.append(sum(t_prob[ind_top]))
    t_score.extend(ind_top)
Example #6
    # load data
    data_pair = load_data(data_path)

    # split 5 fold
    data_5fold = prepare_5fold(data_pair)

    # load vocabulary and label2index dict
    with open(vocab_path, "r") as f_vocab:
        vocabulary = json.load(f_vocab)
    with open(label2index_path, "r") as f_label:
        label2index = json.load(f_label)
    index2label = {i: l for (l, i) in label2index.items()}

    # sentence tokenizer (MAX_LEN is the maximum length of the input text)
    st = SentenceTokenizer(vocabulary, MAX_LEN)
    fold = 0

    # 5 fold
    for item in data_5fold:
        # prepare training, validation, testing set
        train_text = [p[0] for p in item[0]]
        train_label = [p[1] for p in item[0]]
        test_text = [p[0] for p in item[1]]
        test_label = [p[1] for p in item[1]]

        train_X, _, _ = st.tokenize_sentences(train_text)
        test_X, _, _ = st.tokenize_sentences(test_text)
        train_y = np.array([label2index[l] for l in train_label])
        test_y = np.array([label2index[l] for l in test_label])
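        # The predicted class indices map back to label strings via index2label,
        # e.g. (clf is a hypothetical classifier fit on train_X / train_y):
        #   pred_labels = [index2label[i] for i in clf.predict(test_X)]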
Example #7
def load_benchmark(path, vocab, extend_with=0):
    """ Loads the given benchmark dataset.

        Tokenizes the texts using the provided vocabulary, extending it with
        words from the training dataset if extend_with > 0. Splits them into
        three lists: training, validation and testing (in that order).

        Also calculates the maximum length of the texts and the
        suggested batch_size.

    # Arguments:
        path: Path to the dataset to be loaded.
        vocab: Vocabulary to be used for tokenizing texts.
        extend_with: If > 0, the vocabulary will be extended with up to
            extend_with tokens from the training set before tokenizing.

    # Returns:
        A dictionary with the following fields:
            texts: List of three lists, containing tokenized inputs for
                training, validation and testing (in that order).
            labels: List of three lists, containing labels for training,
                validation and testing (in that order).
            added: Number of tokens added to the vocabulary.
            batch_size: Batch size.
            maxlen: Maximum length of an input.
    """
    # Pre-processing dataset: one tab-separated "<text>\t<label>" pair per line
    texts = []
    labels = []
    with open(path, 'r') as f:
        for line in f:
            fields = line.strip().split('\t')
            if len(fields) < 2:
                continue
            texts.append(fields[0])
            labels.append(int(fields[1]))

    # Sanity check on the parsed data
    print(texts[0:10])
    print(labels[0:10])
    batch_size, maxlen = calculate_batchsize_maxlen(texts)

    st = SentenceTokenizer(vocab, maxlen)

    # Split up dataset. Extend the existing vocabulary with up to extend_with
    # tokens from the training dataset.
    texts, labels, added = st.split_train_val_test(texts,
                                                   labels, [0.88, 0.1, 0.02],
                                                   extend_with=extend_with)
    return {
        'texts': texts,
        'labels': labels,
        'added': added,
        'batch_size': batch_size,
        'maxlen': maxlen
    }
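The returned dictionary plugs directly into DeepMoji's transfer-learning helpers; a sketch modelled on the published DeepMoji finetuning example, where nb_classes, the weight path and the dataset path are assumptions about the surrounding project:

import json
from deepmoji.model_def import deepmoji_transfer
from deepmoji.finetuning import finetune

PRETRAINED_PATH = 'deepmoji_weights.hdf5'   # placeholder path

with open('vocabulary.json', 'r') as f:
    vocab = json.load(f)

nb_classes = 2   # assumption: binary labels in the tab-separated file
data = load_benchmark('dataset.tsv', vocab, extend_with=10000)
model = deepmoji_transfer(nb_classes, data['maxlen'], PRETRAINED_PATH)
model, acc = finetune(model, data['texts'], data['labels'],
                      nb_classes, data['batch_size'], method='last')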
Example #8
 raw_dpath = os.path.join(os.getenv("OPIE_DIR"), "data/raw/domains")
 domain_path = os.path.join(os.getenv("OPIE_DIR"), "data/domains", domain)
 pickle_head = "/pickles/without_parse_sentences/without_parse_sentences_"
 fname = domain + ".json.gz"
 mkdir(domain_path)
 spath = os.path.join(domain_path, "sentences")
 if os.path.exists(spath):
     shutil.rmtree(spath)
 mkdir(os.path.join(domain_path, "sentences"))
 mkdir(os.path.join(domain_path, "pickles"))
 mkdir(os.path.join(domain_path, "pickles/without_parse_sentences"))
 sentences, i, k, review_index = [], 0, 1, 1
 kk = 1
 f = open(domain_path + "/sentences/sentences_1.txt", "a", encoding="utf8")
 flag = False
 myTokenizer = SentenceTokenizer()
 for e in parse(os.path.join(raw_dpath, fname)):
     text, score = e['reviewText'], float(e['overall'])
     # Strip all control characters to keep parse() from failing
     text = re.sub(r'[\x00-\x1f]', '', text)
     sents = myTokenizer.segment_text(text)
     for sent in sents:
         t = Sentence()
         t.set_text_score_review(sent, score, review_index)
         if len(sent.split(' ')) > 50:
             continue
         sentences.append(t)
         print(sent, file=f)
         # Serialize (pickle) the accumulated sentences every 60000 sentences
         if len(sentences) == 60000:
             save_pickle_file(domain_path + pickle_head +