Example #1
def main():
    verify_cwd()
    if not os.path.exists("./data/"):
        print("creating folder...")
        os.mkdir("./data/")
    if not os.path.exists("./data/trees/"):
        print("preparing sentiment treebank...")
        try:
            pytreebank.load_sst("./data/")
        except Exception:
            pass  # pytreebank's downloader is not robust on Windows; we only need the data and the parser, so failures are ignored.

    if not os.path.exists("./data/text8.zip"):
        print("retrieving text8...")
        urllib.request.urlretrieve("http://mattmahoney.net/dc/text8.zip",
                                   "./data/text8.zip")
    if not os.path.exists("./data/text8"):
        print("extracting text8...")
        with zipfile.ZipFile("./data/text8.zip", "r") as zip_ref:
            zip_ref.extractall("./data/")
    if not os.path.exists("./data/word2vec.model"):
        print("training word2vec...")
        train_word2vec()
    # GloVe training is slow, so it is not run here; call it manually via create_pretrain_model.py.
    #if not os.path.exists("./data/glove.model"):
    #    print("training glove...")
    #    train_glove()
    print("=== ALL CLEAR! ===")
Example #2
def main():
    verify_cwd()
    try:
        pytreebank.load_sst("./data/")
    except Exception:
        pass  # pytreebank's downloader is not robust on Windows; we only need the data and the parser, so failures are ignored.
    train_data = pytreebank.import_tree_corpus("./data/trees/train.txt")
    assert str(train_data[0]) == TARGET_STRING, "test failed for pytreebank."
    print("Correctness verified.")
Example #3
    def build_vocab(
        self, cut_off
    ):  # largely redundant when pretrained GloVe vectors are used, since GloVe already covers the most common words
        word_count = {}
        data = pytreebank.load_sst(self.path)

        for phrase in data['train']:
            phrase.lowercase()
            _, sentence = phrase.to_labeled_lines()[0]

            for word in sentence.split():
                # count occurrences of each word (no stop-word filtering is applied)
                if word in word_count:
                    word_count[word] += 1
                else:
                    word_count[word] = 1

        # keep only words seen fewer than cut_off times; frequent words are
        # assumed to be covered by the pretrained GloVe vocabulary
        rare_words = [
            word for word, count in word_count.items() if count < cut_off
        ]
        word2id = {word: i + 1 for i, word in enumerate(rare_words)}

        pickle.dump(word2id, open(self.word2id_path, 'wb'))

        return word2id
Example #4
def preprocess_bert(partition='train'):
    sst_data = pytreebank.load_sst()
    # Load the dataset and vectorize it
    train_set = sst_data[partition]
    x_list, y_list = tree_to_bert(train_set)
    print("All " + partition + " samples, w/o filtering " + str(len(x_list)))

    # Filter for min sentence length
    # Setting the min length to 5, this is the 3rd percentile
    # Also, it makes sense for the sentences to be at least 5 words
    x_filtered, y_filtered = filter_minlength(x_list, y_list, min_length=5)
    print("All " + partition + " samples, w/ filtering " +
          str(len(x_filtered)))

    # Pad sequences to same length
    # Max length for the training dataset is 23, so padding to 25 to make sure
    x_padded = pad_sequences(x_filtered,
                             maxlen=25,
                             dtype='float32',
                             padding='post')
    print(x_padded.shape)

    # One-hot encode labels. This is necessary to use loss=categorical_crossentropy for training
    y_onehot = to_categorical(y_filtered, num_classes=5)

    # Save it all to .npy files
    np.save('data/x_' + partition + '_bert', x_padded)
    np.save('data/y_' + partition + '_bert', y_onehot)
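
A minimal driver for the function above; it assumes preprocess_bert lives in a module together with the tree_to_bert and filter_minlength helpers referenced in the snippet (neither is shown here):

if __name__ == "__main__":
    # preprocess every SST split; each call writes x_<split>_bert.npy and y_<split>_bert.npy
    for split in ("train", "dev", "test"):
        preprocess_bert(partition=split)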
Example #5
def load_examples_sst(input_directory, curr_set="dev", granularity="binary"):
    """
    Loads the Stanford Sentiment Treebank with binary / fine-grained labels
    :param input_directory: the dataset directory
    :param curr_set: train, test, or dev
    :param granularity: binary / fine
    :return: a list of examples
    """
    # Load dataset
    dataset = pytreebank.load_sst(input_directory)

    examples = []

    for item in dataset[curr_set]:
        root = item.to_labeled_lines()[0]
        fine_grained_label, sentence = root

        # Detokenize the sentence, i.e. remove trailing spaces etc.
        premise = detokenizer(sentence.strip())

        if not premise.endswith("."):
            premise = premise + "."

        # Fine grained
        if granularity == "fine":
            opts = [
                " very negative.", " somewhat negative.", " neutral.",
                " somewhat positive.", " very positive."
            ]
            label = fine_grained_label
        # Binary
        else:
            # Omit neutral examples from the binary version
            if fine_grained_label == 2:
                continue

            opts = [" negative.", " positive."]
            label = 0 if fine_grained_label < 2 else 1

        premise = f"\"{premise}\" has a tone that is"
        options = []
        for h in opts:
            o = {}

            h = h + '<|endoftext|>'
            o['premise'] = premise
            o['hypothesis'] = h
            o['uncond_premise'] = ' The quote has a tone that is'
            o['uncond_hypothesis'] = h
            options.append(o)
        examples.append({'options': options, 'label': label})

    return examples
Example #6
def load_from_file(path):
    """
    :param path: path to trees folder
    :return:
    """
    trees = pytreebank.load_sst(path)
    raw_train = trees["train"]
    raw_dev = trees["dev"]
    raw_test = trees["test"]
    return raw_train, raw_dev, raw_test
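
A short usage sketch for the function above; the path is a placeholder for a local copy of the SST trees folder:

raw_train, raw_dev, raw_test = load_from_file("./trees")
label, sentence = raw_train[0].to_labeled_lines()[0]  # root label (0-4) and the full sentence
print(label, sentence)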
Example #7
    def process(self, type, word2id):
        data = pytreebank.load_sst(self.path)
        train, labels, max_sentence_size = self._phrase2id(data[type], word2id)
        t_data, t_labels = self._create_torch_training(train, labels,
                                                       max_sentence_size)

        torch.save(t_data, self.data_path)
        torch.save(t_labels, self.labels_path)

        return t_data, t_labels
Example #8
    def load_dataset(self):
        """
        Load the dataset into memory.

        Returns:
        --------
            dict: loaded dataset dictionary
        """
        with mock.patch.object(utils, 'urlretrieve') as mock_urlretrieve:
            mock_urlretrieve.side_effect = add_fake_zip_file
            dataset = load_sst(self.temp_dir_save_path)
        return dataset
Example #9
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--batch_size", type=int, default=128)
    parser.add_argument("--epochs", type=int, default=10)
    parser.add_argument("--hidden_size", type=int, default=256)
    args = parser.parse_args()
    hps = HParams(
        nhidden=args.hidden_size,
        nembd=64,
        nbatch=args.batch_size,
        nstates=2,
        nvocab=256,
        out_wn=False,
        rnn_wn=True,
        rnn_type='mlstm',
        embd_wn=True,
    )
    # params = [np.load('model/%d.npy' % i) for i in range(15)]
    # params[2] = np.concatenate(params[2:6], axis=1)
    # params[3:6] = []

    X = tf.placeholder(tf.int32, [None, None])
    Y = tf.placeholder(tf.int32, [None, None])
    mask = tf.placeholder(tf.float32, [None, None])
    cells, states, logits = model(hps, X, reuse=False)
    loss = tf.losses.sparse_softmax_cross_entropy(logits=logits, labels=Y)
    loss = loss * mask
    mean_loss = tf.reduce_sum(loss) / tf.reduce_sum(mask)
    train_op = tf.train.GradientDescentOptimizer(0.01).minimize(mean_loss)
    loss = tf.reduce_sum(loss)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.InteractiveSession()
    tf.global_variables_initializer().run(session=sess)

    # load some data
    loaded_dataset = pytreebank.load_sst(join(DATA_DIR, "sst"))
    # labels = np.array([label for label, _ in text_data])
    text = np.array([list(preprocess(ex.to_lines()[0])) for ex in loaded_dataset['train']])
    batches_per_epoch = int(np.ceil(len(text) / args.batch_size))

    for epoch in range(args.epochs):
        t0 = time.time()
        epoch_loss = 0.0
        for i in tqdm.tqdm(range(batches_per_epoch)):
            batch_indices = np.random.choice(len(text), size=args.batch_size)
            x, y, batch_mask = batchify(text[batch_indices])
            _, batch_cost = sess.run((train_op, loss), {X: x, Y: y, mask: batch_mask})
            epoch_loss += batch_cost
        t1 = time.time()
        print("%.3f\t%.3f" % (t1 - t0, epoch_loss))
Example #10
    def _load_data(self) -> typing.Dict[str, typing.Any]:
        if self.data is None:
            self.data: dict = pytreebank.load_sst()
            pdframes: dict = {}
            for k_from, k_to in dict(train='train',
                                     dev='validation',
                                     test='test').items():
                labels, sentences = [], []
                for labeled_tree_obj in self.data[k_from]:
                    lab, sent = labeled_tree_obj.to_labeled_lines()[0]
                    labels += [lab]
                    sentences += [sent]
                pdframes[k_to] = pd.DataFrame(
                    dict(sentence=sentences, label=labels))
                pdframes[k_to] = self._preprocess(pdframes[k_to])
            self.data = pdframes
        return self.data
Example #11
def main(args):
    logger.info('loading SST data')
    dataset = pytreebank.load_sst(os.path.join(DATASET, 'sst'))

    split = ['train', 'dev', 'test']
    data = defaultdict(set)
    total_samples = 0
    for tag in split:
        for example in dataset[tag]:
            for label, sentence in example.to_labeled_lines():
                sentence = clean_sst(sentence, uncased=args.uncased)
                if args.n_class == 5:
                    if (sentence, label) not in data[tag] and sentence:
                        data[tag].add((sentence, label))
                else:
                    if label > 2:
                        if (sentence, 1) not in data[tag] and sentence:
                            data[tag].add((sentence, 1))
                    elif label < 2:
                        if (sentence, 0) not in data[tag] and sentence:
                            data[tag].add((sentence, 0))
                if args.phrase and tag == 'train':
                    continue
                else:
                    break
        logger.info('-' * 100)
        logger.info(
            f'{tag}: {len(dataset[tag])} sentences generate {len(data[tag])} examples.'
        )
        total_samples += len(data[tag])
        logger.info('calculate sentence statistics')
        data[tag] = list(data[tag])
        for i, (sent, label) in enumerate(data[tag]):
            data[tag][i] = (sent.split(), label)

        stat([sent for sent, label in data[tag]])
        sent_file = os.path.join(DATASET, 'sst',
                                 f'sst{args.n_class}_{tag}.json')
        logger.info(f'write samples to {sent_file}')

        with open(sent_file, 'w') as f:
            for sent, label in data[tag]:
                line = json.dumps({'label': label, 'sent': sent})
                f.write(line + '\n')
    logger.info('-' * 100)
Example #12
def main(outdir="SST",
         fine=False,
         mergedev=False,
         encoding='utf-8',
         minlen=2,
         neutrals=False,
         tsv=False):
    path = Path(outdir)
    csvext = '.csv'
    if tsv: csvext = '.tsv'

    def csv4(x):
        r = '%s/%s%s' % (outdir, x, csvext)
        log(r)
        return r

    if not path.exists():
        path.mkdir()
    sst = pytreebank.load_sst()
    log("Stanford Sentiment Treebank loaded; %s train, %s dev, %s test sentences"
        % (len(sst['train']), len(sst['dev']), len(sst['test'])))
    otrain = open(csv4('train'), 'w', encoding=encoding)
    n1 = csvout(otrain, lts(sst, 'train', fine, neutrals, minlen=minlen), tsv)
    if mergedev:
        log('mergedev')
        n2 = csvout(otrain,
                    lts(sst, 'dev', fine, neutrals, nexti=n1, minlen=minlen),
                    tsv)
        n3 = csvout(otrain,
                    lts(sst, 'test', fine, neutrals, nexti=n2, minlen=minlen),
                    tsv)
        sf = open('%s/%s' % (outdir, 'train-dev-test-ids.txt'), 'w')
        splits = '[0...%s) train ...%s) dev (%s) ...%s) test (%s)' % (
            n1, n2, n2 - n1, n3, n3 - n2)
        log(splits)
        print(splits, file=sf)
        keys = []
    else:
        log('dev, test')
        keys = ['dev', 'test']
    for key in keys:
        csvout(csv4(key), lts(sst, key, fine, neutrals, minlen=minlen), tsv)
Example #13
File: sst.py  Project: ahmedyes2000/gsitk
    def normalize_data(self):
        raw_datapath = os.path.join(self.data_path,
                                    self.info['properties']['data_file'])
        trees_path = os.path.join(self.data_path, 'trainDevTestTrees_PTB')
        if not os.path.isdir(trees_path):
            os.mkdir(trees_path)
        shutil.move(raw_datapath, trees_path)
        stanford_treebank = pytreebank.load_sst(self.data_path)
        train = self.convert_treebank(stanford_treebank['train'], 'train')
        dev = self.convert_treebank(stanford_treebank['dev'], 'dev')
        test = self.convert_treebank(stanford_treebank['test'], 'test')
        data = pd.concat([train, dev, test], ignore_index=True)

        # Remove directory to avoid pytreebank library error
        #shutil.rmtree(raw_datapath)

        # Tokenize and clean the text
        text_data = normalize.normalize_text(data)
        logger.info(data)
        data['text'] = text_data

        return data
Example #14
def read_files():
    # read the dataset and collect the root-level sentence and label for each split
    train_text = []
    train_labels = []
    dev_text = []
    dev_labels = []
    test_text = []
    test_labels = []
    dataset = pytreebank.load_sst()

    for item in dataset["train"]:
        lines = item.to_labeled_lines()
        train_text.append(lines[0][1])
        train_labels.append(lines[0][0])
    for item in dataset["dev"]:
        lines = item.to_labeled_lines()
        dev_text.append(lines[0][1])
        dev_labels.append(lines[0][0])
    for item in dataset["test"]:
        lines = item.to_labeled_lines()
        test_text.append(lines[0][1])
        test_labels.append(lines[0][0])

    return train_text, dev_text, test_text, train_labels, dev_labels, test_labels
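
A quick sanity check on the function above (a sketch; the standard SST splits contain 8544 / 1101 / 2210 sentences):

train_text, dev_text, test_text, train_labels, dev_labels, test_labels = read_files()
print(len(train_text), len(dev_text), len(test_text))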
Example #15
def load_datasets_treebank():
    '''
    Reference: https://github.com/JonathanRaiman/pytreebank
    '''
    import pytreebank
    treebank_path = "/data/xs/datasets/SentimentTreebank/trainDevTestTrees_PTB/trees"
    dataset = pytreebank.load_sst(treebank_path)
    #train_data = pytreebank.import_tree_corpus("/path/to/sentiment/train.txt")
    example = dataset["train"][0]

    # extract spans from the tree.
    X_train = []
    X_test = []
    y_train = []
    y_test = []
    for example in dataset["train"]:
        y_train.append(example.to_labeled_lines()[0][0])
        X_train.append(example.to_labeled_lines()[0][1])

    for example in dataset["test"]:
        y_test.append(example.to_labeled_lines()[0][0])
        X_test.append(example.to_labeled_lines()[0][1])

    return [X_train, X_test, y_train, y_test]
Example #16
Pencheng Yin <*****@*****.**>
Sahil Chopra <*****@*****.**>
Vera Lin <*****@*****.**>
"""

import math
from typing import List

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import pytreebank
from model_embeddings import ModelEmbeddings

data = pytreebank.load_sst()


def load_train_data(embed_size=50, perct=1., binary=False):
    '''
    labeledTree.to_labeled_lines()[0] gives you a single sentence and its labeling

    we split it into X = list of words, Y = sentence's labeling

    By default, Y falls into [0, 1, 2, 3, 4]

    @returns: train, dev
        train: List[(List[words], sentiment)] for each sentence in dataset
        dev: ~
    '''
    M = ModelEmbeddings(embed_size=embed_size)
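
The body of load_train_data is cut off above. Following the conventions described in its docstring, and leaving aside the ModelEmbeddings lookup, a sketch of the sentence/label extraction it describes could look like this (the helper name _split_examples is illustrative, not from the original):

def _split_examples(trees, binary=False):
    # a sketch: map each labeled tree to (list of words, sentiment label);
    # with binary=True, drop neutral sentences and collapse the 0-4 labels to 0/1
    out = []
    for tree in trees:
        label, sentence = tree.to_labeled_lines()[0]
        if binary:
            if label == 2:
                continue
            label = 0 if label < 2 else 1
        out.append((sentence.split(), label))
    return out

train_examples = _split_examples(data['train'])
dev_examples = _split_examples(data['dev'])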
Example #17
import argparse
import os

import pytreebank

parser = argparse.ArgumentParser()

parser.add_argument(
        "--raw_dataset_dir",
        default=None,
        type=str,
        required=True,
        help="The input data dir. Should contain the files of stanfordSentimentTreebank",
    )

parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help="The output dir. all *.tsv files will be here",
    )

args = parser.parse_args()


dataset = pytreebank.load_sst(args.raw_dataset_dir)
out_path = os.path.join(args.output_dir, '{}.txt')

# Store train, dev and test in separate files
for category in ['train', 'test', 'dev']:
    with open(out_path.format(category), 'w') as outfile:
        for item in dataset[category]:
            outfile.write("{}\t{}\n".format(item.to_labeled_lines()[0][0] + 1, item.to_labeled_lines()[0][1]))
# Print the length of the training set
print(len(dataset['train']))
Example #18
# Load data
import pytreebank
import sys
import os
import pandas as pd
import matplotlib.pyplot as plt

if __name__ == "__main__":

    out_path = os.path.join(sys.path[0], 'sst_{}.txt')
    dataset = pytreebank.load_sst()

    # Run the following once to write the train, dev and test sets to separate files

    # Store train, dev and test in separate files
    for category in ['train', 'test', 'dev']:
        with open(out_path.format(category), 'w') as outfile:
            for item in dataset[category]:
                outfile.write("__label__{}\t{}\n".format(
                    item.to_labeled_lines()[0][0] + 1,
                    item.to_labeled_lines()[0][1]
                ))
    # Print the length of the training set
    print(len(dataset['train']))

    # Read train data
    df = pd.read_csv('sst_train.txt', sep='\t', header=None, names=['truth', 'text'], encoding = "ISO-8859-1")
    df['truth'] = df['truth'].str.replace('__label__', '')
    df['truth'] = df['truth'].astype(int).astype('category')
    print(df.head())
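
    # (not part of the original script: a quick look at the label distribution,
    #  using the matplotlib import above that is otherwise unused)
    df['truth'].value_counts().sort_index().plot(kind='bar')
    plt.xlabel('sentiment label')
    plt.ylabel('number of training sentences')
    plt.tight_layout()
    plt.show()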
Example #19
"""
Stanford Sentiment Treebank

From https://github.com/munikarmanish/bert-sentiment/blob/master/bert_sentiment/data.py
"""

import pytreebank
import torch
from transformers import BertTokenizer
from torch.utils.data import Dataset

tokenizer = BertTokenizer.from_pretrained("bert-large-uncased")

sst = pytreebank.load_sst()


def rpad(array, n=70):
    """Right padding."""
    current_len = len(array)
    if current_len > n:
        return array[:n]
    extra = n - current_len
    retval = array + ([0] * extra)

    if len(retval) != n:
        print("ERROR HERE", array)

    return retval


def get_binary_label(label):
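    # (the body is cut off in the source; a plausible completion, assuming the
    #  usual SST convention of 0-1 -> negative, 3-4 -> positive, with neutral
    #  sentences dropped before this function is called)
    if label < 2:
        return 0
    if label > 2:
        return 1
    raise ValueError("neutral sentences are not used in the binary task")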
Example #20
import pytreebank
import vocabulary
import csv
import re

dataset = pytreebank.load_sst('trees/')

# I am training the DCNN for binary classification only,
# and not fine-grained classification. In the Stanford
# Treebank dataset, ratings below 2 denote negative reviews
# and ratings above 2 denote positive reviews, so I will
# pick only polar reviews and add the corresponding labels.
polarTrainingReviews = []
for example in dataset['train'][:]:
    for newSentence in example.to_labeled_lines():
        label, sentence = newSentence
        if label != 2:
            polarTrainingReviews.append(newSentence)

polarValidationReviews = []
for example in dataset['dev'][:]:
    newSentence = example.to_labeled_lines()[0]
    label, sentence = newSentence
    if label != 2:
        polarValidationReviews.append(newSentence)

polarTestReviews = []
for example in dataset['test'][:]:
    newSentence = example.to_labeled_lines()[0]
    label, sentence = newSentence
    if label != 2:
        polarTestReviews.append(newSentence)
Example #21
import sys
import os,io
import pytreebank

out_path = os.path.join('../inputs/', 'sst_{}.txt')
dataset = pytreebank.load_sst('../inputs')

# Store train, dev and test in separate files
for category in ['train', 'test', 'dev']:
    with open(out_path.format(category), 'w') as outfile:
        for item in dataset[category]:
            outfile.write("{}\t{}\n".format(
                item.to_labeled_lines()[0][0],
                item.to_labeled_lines()[0][1]
            ))
Example #22
def preprocess_full(vocabulary_size):
    trees = pytreebank.load_sst('trees')
    trees_train = trees["train"]
    trees_dev = trees["dev"]
    trees_test = trees["test"]
Example #23
# See the License for the specific language governing permissions and
# limitations under the License.
# =============================================================================
"""Transform original SSTb file into json format using pytreebank parser"""

# -*- coding: utf-8 -*-

import gzip
import json
# the dataset path is passed on the command line (see sys.argv[1] below)
import sys

import pytreebank

path = sys.argv[1]
dataset = pytreebank.load_sst(path + "trees/")

train_data = dataset['train']
dev_data = dataset['dev']
test_data = dataset['test']

train_list = []
dev_list = []
test_list = []

index = 0

for data in train_data:
    dic = dict()
    dic['label'], dic['text'] = data.to_labeled_lines()[0]
    dic['index'] = index
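    # (the snippet is cut off here; a plausible continuation collects the record
    #  and bumps the running index)
    train_list.append(dic)
    index += 1

# the same loop presumably repeats for dev_data and test_data; the gzip/json
# imports above suggest the lists are then written out roughly like this
# (the output filename is an assumption):
with gzip.open(path + "train.json.gz", "wt") as fout:
    json.dump(train_list, fout)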
Example #24
        if line != '':
            columns = line.split('$')

            if columns[0] == '' or columns[1] == '':
                raise RuntimeError("malformed line (empty column): %s" % line)

            out_fp.write(columns[0]+"||"+columns[1]+"\n")

    in_fp.close()
    out_fp.close()


if __name__ == '__main__':
    # test_elmoformanylangs()

    # transfer_meddra_to_multi_seive_dict_format('/Users/feili/resource/meddra/meddra_20_1_english/MedAscii/pt.asc',
    #                                            '/Users/feili/PycharmProjects/norm/meddra_dict.txt')

    import pytreebank

    dataset = pytreebank.load_sst('/Users/feili/dataset/sst/trees')
    example = dataset["train"][0]

    # extract spans from the tree.
    for label, sentence in example.to_labeled_lines():
        print("%s has sentiment label %s" % (
            sentence,
            ["very negative", "negative", "neutral", "positive", "very positive"][label]
        ))

Example #25
# Load data
import pytreebank
import sys
import os

out_path = os.path.join(sys.path[0], 'sst_{}.txt')
dataset = pytreebank.load_sst('./raw_data')

# Store train, dev and test in separate files
for category in ['train', 'test', 'dev']:
    with open(out_path.format(category), 'w') as outfile:
        for item in dataset[category]:
            outfile.write("__label__{}\t{}\n".format(
                item.to_labeled_lines()[0][0] + 1,
                item.to_labeled_lines()[0][1]))
# Print the length of the training set
print(len(dataset['train']))
Example #26
import pytreebank
import sys
import os

out_path = os.path.join(sys.path[0], 'sst_{}.txt')
dataset = pytreebank.load_sst('./trees')

# Store train, dev and test in separate files
for category in ['train', 'test', 'dev']:
    with open(out_path.format(category), 'w') as outfile:
        for item in dataset[category]:
            outfile.write("__label__{}\t{}\n".format(
                item.to_labeled_lines()[0][0] + 1,
                item.to_labeled_lines()[0][1]))
# Print the length of the training set
print(len(dataset['train']))