def main():
    verify_cwd()
    if not os.path.exists("./data/"):
        print("creating folder...")
        os.mkdir("./data/")
    if not os.path.exists("./data/trees/"):
        print("preparing sentiment treebank...")
        try:
            pytreebank.load_sst("./data/")
        except Exception:
            # The pytreebank downloader is not robust on Windows; we only
            # need the data and the parser, so failures here are ignored.
            pass
    if not os.path.exists("./data/text8.zip"):
        print("retrieving text8...")
        urllib.request.urlretrieve("http://mattmahoney.net/dc/text8.zip", "./data/text8.zip")
    if not os.path.exists("./data/text8"):
        print("extracting text8...")
        with zipfile.ZipFile("./data/text8.zip", "r") as zip_ref:
            zip_ref.extractall("./data/")
    if not os.path.exists("./data/word2vec.model"):
        print("training word2vec...")
        train_word2vec()
    # GloVe training is slow; run it manually via create_pretrain_model.py.
    # if not os.path.exists("./data/glove.model"):
    #     print("training glove...")
    #     train_glove()
    print("=== ALL CLEAR! ===")
def main():
    verify_cwd()
    try:
        pytreebank.load_sst("./data/")
    except Exception:
        # The pytreebank downloader is not robust on Windows; we only need
        # the data and the parser, so failures here are ignored.
        pass
    train_data = pytreebank.import_tree_corpus("./data/trees/train.txt")
    assert str(train_data[0]) == TARGET_STRING, "test fail for pytreebank."
    print("Correctness verified.")
def build_vocab(self, cut_off):
    # This is of limited use when pretrained GloVe vectors are available,
    # since GloVe already covers the most common words.
    word_count = {}
    data = pytreebank.load_sst(self.path)
    for phrase in data['train']:
        phrase.lowercase()
        _, sentence = phrase.to_labeled_lines()[0]
        for word in sentence.split():
            # TODO: check if stop word and ignore
            word_count[word] = word_count.get(word, 0) + 1
    # Keep only words that appear at least cut_off times.
    filtered_words = [word for word, count in word_count.items() if count >= cut_off]
    word2id = {word: i + 1 for i, word in enumerate(filtered_words)}  # id 0 reserved
    pickle.dump(word2id, open(self.word2id_path, 'wb'))
    return word2id
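
# A minimal follow-on sketch (the pickle path is a hypothetical value of
# self.word2id_path): load the saved vocabulary and map a sentence to ids,
# with 0 as the out-of-vocabulary/padding id.
import pickle

with open("./data/word2id.pkl", "rb") as f:
    word2id = pickle.load(f)
ids = [word2id.get(w, 0) for w in "a great movie".lower().split()]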
def preprocess_bert(partition='train'):
    # Load the dataset and vectorize it.
    sst_data = pytreebank.load_sst()
    train_set = sst_data[partition]
    x_list, y_list = tree_to_bert(train_set)
    print("All " + partition + " samples, w/o filtering " + str(len(x_list)))
    # Filter for minimum sentence length. Five words is roughly the 3rd
    # percentile, and it is reasonable to require at least five words.
    x_filtered, y_filtered = filter_minlength(x_list, y_list, min_length=5)
    print("All " + partition + " samples, w/ filtering " + str(len(x_filtered)))
    # Pad sequences to the same length. The maximum length in the training
    # set is 23, so padding to 25 leaves headroom.
    x_padded = pad_sequences(x_filtered, maxlen=25, dtype='float32', padding='post')
    print(x_padded.shape)
    # One-hot encode labels, as required by loss='categorical_crossentropy'.
    y_onehot = to_categorical(y_filtered, num_classes=5)
    # Save everything to .npy files.
    np.save('data/x_' + partition + '_bert', x_padded)
    np.save('data/y_' + partition + '_bert', y_onehot)
def load_examples_sst(input_directory, curr_set="dev", granularity="binary"):
    """
    Loads the Stanford Sentiment Treebank with binary / fine-grained labels.

    :param input_directory: the dataset directory
    :param curr_set: train, test, or dev
    :param granularity: binary / fine
    :return: a list of examples
    """
    dataset = pytreebank.load_sst(input_directory)
    examples = []
    for item in dataset[curr_set]:
        fine_grained_label, sentence = item.to_labeled_lines()[0]
        # Detokenize the sentence (rejoin tokens, fix spacing around punctuation).
        premise = detokenizer(sentence.strip())
        if not premise.endswith("."):
            premise = premise + "."
        # Fine-grained
        if granularity == "fine":
            opts = [
                " very negative.",
                " somewhat negative.",
                " neutral.",
                " somewhat positive.",
                " very positive.",
            ]
            label = fine_grained_label
        # Binary
        else:
            # Omit neutral examples from the binary version.
            if fine_grained_label == 2:
                continue
            opts = [" negative.", " positive."]
            label = 0 if fine_grained_label < 2 else 1
        premise = f"\"{premise}\" has a tone that is"
        options = []
        for h in opts:
            h = h + '<|endoftext|>'
            options.append({
                'premise': premise,
                'hypothesis': h,
                'uncond_premise': ' The quote has a tone that is',
                'uncond_hypothesis': h,
            })
        examples.append({'options': options, 'label': label})
    return examples
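
# A minimal usage sketch for load_examples_sst; "./data" and the
# `detokenizer` callable (e.g. nltk's TreebankWordDetokenizer().detokenize)
# are assumptions, not part of the original module.
examples = load_examples_sst("./data", curr_set="dev", granularity="binary")
print(len(examples))
print(examples[0]['options'][0]['premise'])
print(examples[0]['label'])  # 0 = negative, 1 = positive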
def load_from_file(path):
    """
    :param path: path to the trees folder
    :return: the raw train, dev, and test splits
    """
    trees = pytreebank.load_sst(path)
    raw_train = trees["train"]
    raw_dev = trees["dev"]
    raw_test = trees["test"]
    return raw_train, raw_dev, raw_test
def process(self, type, word2id):
    data = pytreebank.load_sst(self.path)
    train, labels, max_sentence_size = self._phrase2id(data[type], word2id)
    t_data, t_labels = self._create_torch_training(train, labels, max_sentence_size)
    torch.save(t_data, self.data_path)
    torch.save(t_labels, self.labels_path)
    return t_data, t_labels
def load_dataset(self):
    """
    Load the dataset into memory.

    Returns:
    --------
    dict: loaded dataset dictionary
    """
    with mock.patch.object(utils, 'urlretrieve') as mock_urlretrieve:
        mock_urlretrieve.side_effect = add_fake_zip_file
        dataset = load_sst(self.temp_dir_save_path)
    return dataset
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--batch_size", type=int, default=128)
    parser.add_argument("--epochs", type=int, default=10)
    parser.add_argument("--hidden_size", type=int, default=256)
    args = parser.parse_args()
    hps = HParams(
        nhidden=args.hidden_size,
        nembd=64,
        nbatch=args.batch_size,
        nstates=2,
        nvocab=256,
        out_wn=False,
        rnn_wn=True,
        rnn_type='mlstm',
        embd_wn=True,
    )
    # params = [np.load('model/%d.npy' % i) for i in range(15)]
    # params[2] = np.concatenate(params[2:6], axis=1)
    # params[3:6] = []
    X = tf.placeholder(tf.int32, [None, None])
    Y = tf.placeholder(tf.int32, [None, None])
    mask = tf.placeholder(tf.float32, [None, None])
    cells, states, logits = model(hps, X, reuse=False)
    loss = tf.losses.sparse_softmax_cross_entropy(logits=logits, labels=Y)
    loss = loss * mask
    mean_loss = tf.reduce_sum(loss) / tf.reduce_sum(mask)
    train_op = tf.train.GradientDescentOptimizer(0.01).minimize(mean_loss)
    loss = tf.reduce_sum(loss)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.InteractiveSession(config=config)
    tf.global_variables_initializer().run(session=sess)
    # Load the SST training sentences.
    loaded_dataset = pytreebank.load_sst(join(DATA_DIR, "sst"))
    # labels = np.array([label for label, _ in text_data])
    text = np.array([list(preprocess(ex.to_lines()[0])) for ex in loaded_dataset['train']])
    batches_per_epoch = int(np.ceil(len(text) / args.batch_size))
    for epoch in range(args.epochs):
        t0 = time.time()
        epoch_loss = 0.0
        for i in tqdm.tqdm(range(batches_per_epoch)):
            batch_indices = np.random.choice(len(text), size=args.batch_size)
            x, y, batch_mask = batchify(text[batch_indices])
            _, batch_cost = sess.run((train_op, loss), {X: x, Y: y, mask: batch_mask})
            epoch_loss += batch_cost
        t1 = time.time()
        print("%.3f\t%.3f" % (t1 - t0, epoch_loss))
def _load_data(self) -> typing.Dict[str, typing.Any]:
    if self.data is None:
        self.data: dict = pytreebank.load_sst()
        pdframes: dict = {}
        for k_from, k_to in dict(train='train', dev='validation', test='test').items():
            labels, sentences = [], []
            for labeled_tree_obj in self.data[k_from]:
                lab, sent = labeled_tree_obj.to_labeled_lines()[0]
                labels.append(lab)
                sentences.append(sent)
            pdframes[k_to] = pd.DataFrame(dict(sentence=sentences, label=labels))
            pdframes[k_to] = self._preprocess(pdframes[k_to])
        self.data = pdframes
    return self.data
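
# A hedged usage sketch (`loader` is a hypothetical instance of the class
# above): each split comes back as a pandas DataFrame with `sentence` and
# `label` columns, keyed 'train', 'validation', and 'test'.
splits = loader._load_data()
print(splits['train'].head())
print(splits['validation']['label'].value_counts())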
def main(args):
    logger.info('loading SST data')
    dataset = pytreebank.load_sst(os.path.join(DATASET, 'sst'))
    split = ['train', 'dev', 'test']
    data = defaultdict(set)
    total_samples = 0
    for tag in split:
        for example in dataset[tag]:
            for label, sentence in example.to_labeled_lines():
                sentence = clean_sst(sentence, uncased=args.uncased)
                if args.n_class == 5:
                    if (sentence, label) not in data[tag] and sentence:
                        data[tag].add((sentence, label))
                else:
                    if label > 2:
                        if (sentence, 1) not in data[tag] and sentence:
                            data[tag].add((sentence, 1))
                    elif label < 2:
                        if (sentence, 0) not in data[tag] and sentence:
                            data[tag].add((sentence, 0))
                # Keep all phrase-level spans only for the training split
                # (when --phrase is set); otherwise keep just the root.
                if args.phrase and tag == 'train':
                    continue
                else:
                    break
        logger.info('-' * 100)
        logger.info(f'{tag}: {len(dataset[tag])} sentences generate {len(data[tag])} examples.')
        total_samples += len(data[tag])
        logger.info('calculate sentence statistics')
        data[tag] = list(data[tag])
        for i, (sent, label) in enumerate(data[tag]):
            data[tag][i] = (sent.split(), label)
        stat([sent for sent, label in data[tag]])
        sent_file = os.path.join(DATASET, 'sst', f'sst{args.n_class}_{tag}.json')
        logger.info(f'write samples to {sent_file}')
        with open(sent_file, 'w') as f:
            for sent, label in data[tag]:
                line = json.dumps({'label': label, 'sent': sent})
                f.write(line + '\n')
    logger.info('-' * 100)
def main(outdir="SST", fine=False, mergedev=False, encoding='utf-8', minlen=2, neutrals=False, tsv=False): path = Path(outdir) csvext = '.csv' if tsv: csvext = '.tsv' def csv4(x): r = '%s/%s%s' % (outdir, x, csvext) log(r) return r if not path.exists(): path.mkdir() sst = pytreebank.load_sst() log("Stanford Sentiment Treebank loaded; %s train, %s dev, %s test sentences" % (len(sst['train']), len(sst['dev']), len(sst['test']))) otrain = open(csv4('train'), 'w', encoding=encoding) n1 = csvout(otrain, lts(sst, 'train', fine, neutrals, minlen=minlen), tsv) if mergedev: log('mergedev') n2 = csvout(otrain, lts(sst, 'dev', fine, neutrals, nexti=n1, minlen=minlen), tsv) n3 = csvout(otrain, lts(sst, 'test', fine, neutrals, nexti=n2, minlen=minlen), tsv) sf = open('%s/%s' % (outdir, 'train-dev-test-ids.txt'), 'w') splits = '[0...%s) train ...%s) dev (%s) ...%s) test (%s)' % ( n1, n2, n2 - n1, n3, n3 - n2) log(splits) print(splits, file=sf) keys = [] else: log('dev, test') keys = ['dev', 'test'] for key in keys: csvout(csv4(key), lts(sst, key, fine, neutrals, minlen=minlen), tsv)
def normalize_data(self):
    raw_datapath = os.path.join(self.data_path, self.info['properties']['data_file'])
    trees_path = os.path.join(self.data_path, 'trainDevTestTrees_PTB')
    if not os.path.isdir(trees_path):
        os.mkdir(trees_path)
        shutil.move(raw_datapath, trees_path)
    stanford_treebank = pytreebank.load_sst(self.data_path)
    train = self.convert_treebank(stanford_treebank['train'], 'train')
    dev = self.convert_treebank(stanford_treebank['dev'], 'dev')
    test = self.convert_treebank(stanford_treebank['test'], 'test')
    data = pd.concat([train, dev, test], ignore_index=True)
    # Remove the directory to avoid a pytreebank library error:
    # shutil.rmtree(raw_datapath)
    # Tokenize and clean the text.
    text_data = normalize.normalize_text(data)
    logger.info(data)
    data['text'] = text_data
    return data
def read_files():
    # Read the dataset and collect the root sentences and their labels.
    train_text, train_labels = [], []
    dev_text, dev_labels = [], []
    test_text, test_labels = [], []
    dataset = pytreebank.load_sst()
    for item in dataset["train"]:
        lines = item.to_labeled_lines()
        train_text.append(lines[0][1])
        train_labels.append(lines[0][0])
    for item in dataset["dev"]:
        lines = item.to_labeled_lines()
        dev_text.append(lines[0][1])
        dev_labels.append(lines[0][0])
    for item in dataset["test"]:
        lines = item.to_labeled_lines()
        test_text.append(lines[0][1])
        test_labels.append(lines[0][0])
    return train_text, dev_text, test_text, train_labels, dev_labels, test_labels
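
# Minimal usage sketch for read_files(); the labels are the fine-grained
# 0-4 sentiment scores of each root sentence.
train_text, dev_text, test_text, train_labels, dev_labels, test_labels = read_files()
print(len(train_text), len(dev_text), len(test_text))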
def load_datasets_treebank():
    '''
    Reference: https://github.com/JonathanRaiman/pytreebank
    '''
    import pytreebank
    treebank_path = "/data/xs/datasets/SentimentTreebank/trainDevTestTrees_PTB/trees"
    dataset = pytreebank.load_sst(treebank_path)
    # train_data = pytreebank.import_tree_corpus("/path/to/sentiment/train.txt")
    # Extract the root label and sentence of each tree.
    X_train, X_test = [], []
    y_train, y_test = [], []
    for example in dataset["train"]:
        y_train.append(example.to_labeled_lines()[0][0])
        X_train.append(example.to_labeled_lines()[0][1])
    for example in dataset["test"]:
        y_test.append(example.to_labeled_lines()[0][0])
        X_test.append(example.to_labeled_lines()[0][1])
    return [X_train, X_test, y_train, y_test]
Pencheng Yin <*****@*****.**>
Sahil Chopra <*****@*****.**>
Vera Lin <*****@*****.**>
"""
import math
from typing import List

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import pytreebank

from model_embeddings import ModelEmbeddings

data = pytreebank.load_sst()


def load_train_data(embed_size=50, perct=1., binary=False):
    '''
    labeledTree.to_labeled_lines()[0] gives a single sentence and its label;
    we split it into X = list of words, Y = the sentence's label.
    By default, Y falls into [0, 1, 2, 3, 4].

    @returns: train, dev
        train: List[(List[words], sentiment)] for each sentence in the dataset
        dev: same structure
    '''
    M = ModelEmbeddings(embed_size=embed_size)
parser = argparse.ArgumentParser()
parser.add_argument(
    "--raw_dataset_dir",
    default=None,
    type=str,
    required=True,
    help="The input data dir. Should contain the files of stanfordSentimentTreebank",
)
parser.add_argument(
    "--output_dir",
    default=None,
    type=str,
    required=True,
    help="The output dir. All output files will be written here",
)
args = parser.parse_args()

dataset = pytreebank.load_sst(args.raw_dataset_dir)
out_path = os.path.join(args.output_dir, '{}.txt')

# Store train, dev and test in separate files.
for category in ['train', 'test', 'dev']:
    with open(out_path.format(category), 'w') as outfile:
        for item in dataset[category]:
            outfile.write("{}\t{}\n".format(item.to_labeled_lines()[0][0] + 1,
                                            item.to_labeled_lines()[0][1]))

# Print the length of the training set.
print(len(dataset['train']))
# Load data
import pytreebank
import sys
import os
import pandas as pd
import matplotlib.pyplot as plt

if __name__ == "__main__":
    out_path = os.path.join(sys.path[0], 'sst_{}.txt')
    dataset = pytreebank.load_sst()
    # The following block was run once to produce the train, dev and test
    # files, storing each split in a separate file.
    for category in ['train', 'test', 'dev']:
        with open(out_path.format(category), 'w') as outfile:
            for item in dataset[category]:
                outfile.write("__label__{}\t{}\n".format(
                    item.to_labeled_lines()[0][0] + 1,
                    item.to_labeled_lines()[0][1]
                ))
    # Print the length of the training set.
    print(len(dataset['train']))
    # Read the train data back in.
    df = pd.read_csv('sst_train.txt', sep='\t', header=None,
                     names=['truth', 'text'], encoding="ISO-8859-1")
    df['truth'] = df['truth'].str.replace('__label__', '')
    df['truth'] = df['truth'].astype(int).astype('category')
    print(df.head())
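
    # A hedged follow-on sketch: collapse the five classes (shifted to 1-5
    # above) into binary labels, dropping the neutral class 3, mirroring
    # the usual SST-2 setup.
    df_bin = df[df['truth'].astype(int) != 3].copy()
    df_bin['label'] = (df_bin['truth'].astype(int) > 3).astype(int)
    print(df_bin['label'].value_counts())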
""" Stanford Sentiment Treebank From https://github.com/munikarmanish/bert-sentiment/blob/master/bert_sentiment/data.py """ import pytreebank import torch from transformers import BertTokenizer from torch.utils.data import Dataset tokenizer = BertTokenizer.from_pretrained("bert-large-uncased") sst = pytreebank.load_sst() def rpad(array, n=70): """Right padding.""" current_len = len(array) if current_len > n: return array[:n] extra = n - current_len retval = array + ([0] * extra) if len(retval) != n: print("ERROR HERE", array) return retval def get_binary_label(label):
import pytreebank
import vocabulary
import csv
import re

dataset = pytreebank.load_sst('trees/')

# I am training the DCNN for binary classification only, not fine-grained
# classification. In the Stanford Treebank dataset, ratings below and above
# 2 denote negative and positive reviews respectively, so I pick only the
# polar reviews and add the corresponding labels.
polarTrainingReviews = []
for example in dataset['train'][:]:
    for newSentence in example.to_labeled_lines():
        label, sentence = newSentence
        if label != 2:
            polarTrainingReviews.append(newSentence)

polarValidationReviews = []
for example in dataset['dev'][:]:
    newSentence = example.to_labeled_lines()[0]
    label, sentence = newSentence
    if label != 2:
        polarValidationReviews.append(newSentence)

polarTestReviews = []
for example in dataset['test'][:]:
    newSentence = example.to_labeled_lines()[0]
    label, sentence = newSentence
    if label != 2:
        polarTestReviews.append(newSentence)
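
# A hedged follow-on sketch: map the remaining 0-4 ratings to binary
# targets (0 = negative for ratings < 2, 1 = positive for ratings > 2).
binaryTrainingReviews = [(1 if label > 2 else 0, sentence)
                         for label, sentence in polarTrainingReviews]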
import sys
import os, io
import pytreebank

out_path = os.path.join('../inputs/', 'sst_{}.txt')
dataset = pytreebank.load_sst('../inputs')

# Store train, dev and test in separate files.
for category in ['train', 'test', 'dev']:
    with open(out_path.format(category), 'w') as outfile:
        for item in dataset[category]:
            outfile.write("{}\t{}\n".format(
                item.to_labeled_lines()[0][0],
                item.to_labeled_lines()[0][1]
            ))
def preprocess_full(vocabulary_size):
    trees = pytreebank.load_sst('trees')
    trees_train = trees["train"]
    trees_dev = trees["dev"]
    trees_test = trees["test"]
# See the License for the specific language governing permissions and
# limitations under the License.
# =============================================================================
"""Transform the original SSTb files into JSON format using the pytreebank parser."""
# -*- coding: utf-8 -*-
import gzip
import json
import sys

import pytreebank

# Path that stores the downloaded treebank data.
path = sys.argv[1]
dataset = pytreebank.load_sst(path + "trees/")
train_data = dataset['train']
dev_data = dataset['dev']
test_data = dataset['test']

train_list = []
dev_list = []
test_list = []

index = 0
for data in train_data:
    dic = dict()
    dic['label'], dic['text'] = data.to_labeled_lines()[0]
    dic['index'] = index
        if line != '':
            columns = line.split('$')
            if columns[0] == '' or columns[1] == '':
                raise RuntimeError("1111111111")
            out_fp.write(columns[0] + "||" + columns[1] + "\n")
    in_fp.close()
    out_fp.close()


if __name__ == '__main__':
    # test_elmoformanylangs()
    # transfer_meddra_to_multi_seive_dict_format('/Users/feili/resource/meddra/meddra_20_1_english/MedAscii/pt.asc',
    #                                            '/Users/feili/PycharmProjects/norm/meddra_dict.txt')
    import pytreebank
    dataset = pytreebank.load_sst('/Users/feili/dataset/sst/trees')
    example = dataset["train"][0]
    # Extract labeled spans from the tree.
    for label, sentence in example.to_labeled_lines():
        print("%s has sentiment label %s" % (
            sentence,
            ["very negative", "negative", "neutral", "positive", "very positive"][label]
        ))
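
    # A small follow-on sketch: the root (full-sentence) span is the first
    # labeled line, so the sentence-level label can be read directly.
    root_label, root_sentence = example.to_labeled_lines()[0]
    print(root_label, root_sentence)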
# Load data
import pytreebank
import sys
import os

out_path = os.path.join(sys.path[0], 'sst_{}.txt')
dataset = pytreebank.load_sst('./raw_data')

# Store train, dev and test in separate files.
for category in ['train', 'test', 'dev']:
    with open(out_path.format(category), 'w') as outfile:
        for item in dataset[category]:
            outfile.write("__label__{}\t{}\n".format(
                item.to_labeled_lines()[0][0] + 1,
                item.to_labeled_lines()[0][1]))

# Print the length of the training set.
print(len(dataset['train']))
import pytreebank
import sys
import os

out_path = os.path.join(sys.path[0], 'sst_{}.txt')
dataset = pytreebank.load_sst('./trees')

# Store train, dev and test in separate files.
for category in ['train', 'test', 'dev']:
    with open(out_path.format(category), 'w') as outfile:
        for item in dataset[category]:
            outfile.write("__label__{}\t{}\n".format(
                item.to_labeled_lines()[0][0] + 1,
                item.to_labeled_lines()[0][1]))

# Print the length of the training set.
print(len(dataset['train']))