Example No. 1
import torch

from torchnlp.datasets import snli_dataset
from torchnlp.text_encoders import WhitespaceEncoder, IdentityEncoder
from torchnlp.utils import datasets_iterator
from torchnlp import word_to_vector

from model import SNLIClassifier
from util import get_args, makedirs, collate_fn

args = get_args()

if args.gpu >= 0:
    torch.cuda.set_device(args.gpu)

# load dataset
train, dev, test = snli_dataset(train=True, dev=True, test=True)

# Preprocess
for row in datasets_iterator(train, dev, test):
    row['premise'] = row['premise'].lower()
    row['hypothesis'] = row['hypothesis'].lower()

# Make Encoders
sentence_corpus = [row['premise'] for row in datasets_iterator(train, dev, test)]
sentence_corpus += [row['hypothesis'] for row in datasets_iterator(train, dev, test)]
sentence_encoder = WhitespaceEncoder(sentence_corpus)

label_corpus = [row['label'] for row in datasets_iterator(train, dev, test)]
label_encoder = IdentityEncoder(label_corpus)

# Encode
for row in datasets_iterator(train, dev, test):
    row['premise'] = sentence_encoder.encode(row['premise'])
    row['hypothesis'] = sentence_encoder.encode(row['hypothesis'])
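The snippet stops after encoding the premises and hypotheses; the labels would be encoded the same way with label_encoder. Purely as an illustration (the project's own collate_fn imported from util is not shown here, so this is a hypothetical stand-in rather than the original), a minimal padding collate function for the encoded rows could look like this sketch, using only standard PyTorch utilities:

import torch
from torch.nn.utils.rnn import pad_sequence

def pad_collate(batch):
    # Each row carries variable-length 1-D tensors of token ids; pad them to
    # the longest sequence in the batch. Assumes 'label' has already been
    # encoded with label_encoder (not shown in the snippet above).
    premises = pad_sequence([row['premise'] for row in batch], batch_first=True)
    hypotheses = pad_sequence([row['hypothesis'] for row in batch], batch_first=True)
    labels = torch.cat([row['label'] for row in batch])  # one encoded label per row
    return premises, hypotheses, labels

# Hypothetical usage:
# train_loader = torch.utils.data.DataLoader(train, batch_size=32, shuffle=True, collate_fn=pad_collate)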
Example No. 2
    def __init__(self,
                 root: str,
                 normal_class=0,
                 tokenizer='spacy',
                 use_tfidf_weights=False,
                 append_sos=False,
                 append_eos=False,
                 clean_txt=False,
                 max_seq_len_prior=None):
        super().__init__(root)

        self.n_classes = 2  # 0: normal, 1: outlier
        classes = [
            'earn', 'acq', 'crude', 'trade', 'money-fx', 'interest', 'ship'
        ]

        # classes_full_list = [
        #     'acq', 'alum', 'barley', 'bop', 'carcass', 'castor-oil', 'cocoa', 'coconut', 'coconut-oil', 'coffee',
        #     'copper', 'copra-cake', 'corn', 'cotton', 'cotton-oil', 'cpi', 'cpu', 'crude', 'dfl', 'dlr', 'dmk',
        #     'earn', 'fuel', 'gas', 'gnp', 'gold', 'grain', 'groundnut', 'groundnut-oil', 'heat', 'hog', 'housing',
        #     'income', 'instal-debt', 'interest', 'ipi', 'iron-steel', 'jet', 'jobs', 'l-cattle', 'lead', 'lei',
        #     'lin-oil', 'livestock', 'lumber', 'meal-feed', 'money-fx', 'money-supply', 'naphtha', 'nat-gas', 'nickel',
        #     'nkr', 'nzdlr', 'oat', 'oilseed', 'orange', 'palladium', 'palm-oil', 'palmkernel', 'pet-chem', 'platinum',
        #     'potato', 'propane', 'rand', 'rape-oil', 'rapeseed', 'reserves', 'retail', 'rice', 'rubber', 'rye',
        #     'ship', 'silver', 'sorghum', 'soy-meal', 'soy-oil', 'soybean', 'strategic-metal', 'sugar', 'sun-meal',
        #     'sun-oil', 'sunseed', 'tea', 'tin', 'trade', 'veg-oil', 'wheat', 'wpi', 'yen', 'zinc'
        # ]

        self.normal_classes = [classes[normal_class]]
        del classes[normal_class]
        self.outlier_classes = classes

        # Load the reuters dataset
        self.train_set, self.test_set = reuters_dataset(directory=root,
                                                        train=True,
                                                        test=True,
                                                        clean_txt=clean_txt)

        # Pre-process
        self.train_set.columns.add('index')
        self.test_set.columns.add('index')
        self.train_set.columns.add('weight')
        self.test_set.columns.add('weight')

        train_idx_normal = []  # for subsetting train_set to normal class
        for i, row in enumerate(self.train_set):
            if any(label in self.normal_classes
                   for label in row['label']) and (len(row['label']) == 1):
                train_idx_normal.append(i)
                row['label'] = torch.tensor(0)
            else:
                row['label'] = torch.tensor(1)
            row['text'] = row['text'].lower()

        test_idx = []  # for subsetting test_set to selected normal and anomalous classes
        test_n_idx = []  # subsetting test_set to selected normal classes
        test_a_idx = []  # subsetting test_set to selected anomalous classes
        for i, row in enumerate(self.test_set):
            if any(label in self.normal_classes
                   for label in row['label']) and (len(row['label']) == 1):
                test_idx.append(i)
                test_n_idx.append(i)
                row['label'] = torch.tensor(0)
            elif any(label in self.outlier_classes
                     for label in row['label']) and (len(row['label']) == 1):
                test_idx.append(i)
                test_a_idx.append(i)
                row['label'] = torch.tensor(1)
            else:
                row['label'] = torch.tensor(1)
            row['text'] = row['text'].lower()

        # Subset train_set to normal class
        self.train_set = Subset(self.train_set, train_idx_normal)
        # Subset test_set to selected normal and anomalous classes
        self.test_set = Subset(self.test_set, test_idx)
        # Subset test_set to selected normal classes
        self.test_n_set = Subset(self.test_set, test_n_idx)
        # Subset test_set to selected anomalous classes
        self.test_a_set = Subset(self.test_set, test_a_idx)

        # Make corpus and set encoder
        text_corpus = [
            row['text']
            for row in datasets_iterator(self.train_set, self.test_set)
        ]
        if tokenizer == 'spacy':
            self.encoder = SpacyEncoder(text_corpus,
                                        min_occurrences=3,
                                        append_eos=append_eos)
        if tokenizer == 'bert':
            self.encoder = MyBertTokenizer.from_pretrained('bert-base-uncased',
                                                           cache_dir=root)

        # Encode
        self.max_seq_len = 0
        for row in datasets_iterator(self.train_set, self.test_set):
            if append_sos:
                sos_id = self.encoder.stoi[DEFAULT_SOS_TOKEN]
                row['text'] = torch.cat((torch.tensor(sos_id).unsqueeze(0),
                                         self.encoder.encode(row['text'])))
            else:
                row['text'] = self.encoder.encode(row['text'])
            if len(row['text']) > self.max_seq_len:
                self.max_seq_len = len(row['text'])

        # Compute tf-idf weights
        if use_tfidf_weights:
            compute_tfidf_weights(self.train_set,
                                  self.test_set,
                                  vocab_size=self.encoder.vocab_size)
        else:
            for row in datasets_iterator(self.train_set, self.test_set):
                row['weight'] = torch.empty(0)

        # Get indices after pre-processing
        for i, row in enumerate(self.train_set):
            row['index'] = i
        for i, row in enumerate(self.test_set):
            row['index'] = i

        # length prior
        sent_lengths = [len(row['text']) for row in self.train_set]
        sent_lengths_freq = np.bincount(np.array(sent_lengths))
        sent_lengths_freq = np.concatenate(
            (sent_lengths_freq,
             np.array((max_seq_len_prior - max(sent_lengths)) * [0])),
            axis=0)
        sent_lengths_freq = sent_lengths_freq + 1
        self.length_prior = np.log(sent_lengths_freq) - np.log(
            sent_lengths_freq.sum())
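The length-prior block at the end of this example (it reappears unchanged in later dataset classes below) is add-one smoothing over a histogram of training sentence lengths, zero-padded out to max_seq_len_prior. A condensed, self-contained sketch of the same computation; the standalone function name is my own illustration and not part of the original class:

import numpy as np

def length_log_prior(sent_lengths, max_seq_len_prior):
    # Histogram over lengths 0..max_seq_len_prior; minlength pads the tail
    # with zeros, matching the np.concatenate call above (this assumes, as the
    # original does, that max(sent_lengths) <= max_seq_len_prior).
    freq = np.bincount(np.array(sent_lengths), minlength=max_seq_len_prior + 1)
    freq = freq + 1  # add-one (Laplace) smoothing so unseen lengths keep nonzero mass
    return np.log(freq) - np.log(freq.sum())  # log-probabilities over lengths

For example, length_log_prior([3, 5, 5, 8], max_seq_len_prior=10) returns 11 log-probabilities, one for each possible length from 0 to 10.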
Example No. 3
def load_data(data_type,
              preprocessing=False,
              fine_grained=False,
              verbose=False,
              text_length=5000,
              encode=True,
              load_SLE=False):
    if data_type == 'imdb':
        train_data, test_data = imdb_dataset(preprocessing=preprocessing,
                                             verbose=verbose,
                                             text_length=text_length)
    elif data_type == 'newsgroups':
        train_data, test_data = newsgroups_dataset(preprocessing=preprocessing,
                                                   verbose=verbose,
                                                   text_length=text_length)
    elif data_type == 'reuters':
        train_data, test_data = reuters_dataset(preprocessing=preprocessing,
                                                fine_grained=fine_grained,
                                                verbose=verbose,
                                                text_length=text_length)
    elif data_type == 'webkb':
        train_data, test_data = webkb_dataset(preprocessing=preprocessing,
                                              verbose=verbose,
                                              text_length=text_length)
    elif data_type == 'cade':
        train_data, test_data = cade_dataset(preprocessing=preprocessing,
                                             verbose=verbose,
                                             text_length=text_length)
    elif data_type == 'dbpedia':
        train_data, test_data = dbpedia_dataset(preprocessing=preprocessing,
                                                verbose=verbose,
                                                text_length=text_length)
    elif data_type == 'agnews':
        train_data, test_data = agnews_dataset(preprocessing=preprocessing,
                                               verbose=verbose,
                                               text_length=text_length)
    elif data_type == 'yahoo':
        train_data, test_data = yahoo_dataset(preprocessing=preprocessing,
                                              verbose=verbose,
                                              text_length=text_length)
    elif data_type == 'sogou':
        train_data, test_data = sogou_dataset(preprocessing=preprocessing,
                                              verbose=verbose,
                                              text_length=text_length)
    elif data_type == 'yelp':
        train_data, test_data = yelp_dataset(preprocessing=preprocessing,
                                             fine_grained=fine_grained,
                                             verbose=verbose,
                                             text_length=text_length)
    elif data_type == 'amazon':
        train_data, test_data = amazon_dataset(preprocessing=preprocessing,
                                               fine_grained=fine_grained,
                                               verbose=verbose,
                                               text_length=text_length)
    elif data_type == 'custom':
        test_data = custom_dataset(preprocessing=preprocessing,
                                   fine_grained=fine_grained,
                                   verbose=verbose,
                                   text_length=text_length)
        with open('epochs/sentence_encoder', 'rb') as f:
            sentence_encoder = pickle.load(f)
        with open('epochs/label_encoder', 'rb') as f:
            label_encoder = pickle.load(f)
        for row in datasets_iterator(test_data):
            row['text'] = sentence_encoder.encode(' '.join(row['text']))
            row['label'] = label_encoder.encode(row['label'])
        return sentence_encoder.vocab_size, label_encoder.vocab_size, test_data
    else:
        raise ValueError('{} data type not supported.'.format(data_type))

    if encode:
        if load_SLE:
            with open('epochs/sentence_encoder', 'rb') as f:
                sentence_encoder = pickle.load(f)
            with open('epochs/label_encoder', 'rb') as f:
                label_encoder = pickle.load(f)
        else:
            sentence_corpus = [row['text'] for row in datasets_iterator(train_data)]
            label_corpus = [row['label'] for row in datasets_iterator(train_data)]
            sentence_encoder = WhitespaceEncoder(
                sentence_corpus,
                reserved_tokens=[PADDING_TOKEN, UNKNOWN_TOKEN])
            label_encoder = IdentityEncoder(label_corpus, reserved_tokens=[])
            with open('epochs/sentence_encoder', 'wb') as f:
                pickle.dump(sentence_encoder, f)
            with open('epochs/label_encoder', 'wb') as f:
                pickle.dump(label_encoder, f)

        # Encode
        for row in datasets_iterator(train_data, test_data):
            row['text'] = sentence_encoder.encode(row['text'])
            row['label'] = label_encoder.encode(row['label'])
        return sentence_encoder.vocab_size, label_encoder.vocab_size, train_data, test_data
    else:
        return train_data, test_data
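A hypothetical call to the function above (the dataset name and keyword values are chosen only for illustration): with encode=True it returns the two vocabulary sizes together with the encoded splits.

# Sketch: load and encode the AG News splits with load_data as defined above.
vocab_size, num_labels, train_data, test_data = load_data('agnews',
                                                          preprocessing=True,
                                                          verbose=True)
print('vocab size:', vocab_size, 'number of labels:', num_labels)
print('train rows:', len(train_data), 'test rows:', len(test_data))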
Example No. 4
    def __init__(self,
                 root: str,
                 normal_class=0,
                 tokenizer='spacy',
                 use_tfidf_weights=False,
                 append_sos=False,
                 append_eos=False,
                 clean_txt=False,
                 max_seq_len_prior=None):
        super().__init__(root)

        self.n_classes = 2  # 0: normal, 1: outlier
        classes = ['pos', 'neg']

        if normal_class == -1:
            self.normal_classes = classes
            self.outlier_classes = []
        else:
            self.normal_classes = [classes[normal_class]]
            del classes[normal_class]
            self.outlier_classes = classes

        if root not in nltk.data.path:
            nltk.data.path.append(root)
        # Load the imdb dataset
        self.train_set, self.test_set = imdb_dataset(directory=root,
                                                     train=True,
                                                     test=True)

        # Pre-process
        self.train_set.columns.add('index')
        self.test_set.columns.add('index')
        self.train_set.columns.remove('sentiment')
        self.test_set.columns.remove('sentiment')
        self.train_set.columns.add('label')
        self.test_set.columns.add('label')
        self.train_set.columns.add('weight')
        self.test_set.columns.add('weight')

        train_idx_normal = []  # for subsetting train_set to normal class
        for i, row in enumerate(self.train_set):
            row['label'] = row.pop('sentiment')
            if row['label'] in self.normal_classes:
                train_idx_normal.append(i)
                row['label'] = torch.tensor(0)
            else:
                row['label'] = torch.tensor(1)
            if clean_txt:
                row['text'] = clean_text(row['text'].lower())
            else:
                row['text'] = row['text'].lower()

        test_n_idx = []  # subsetting test_set to selected normal classes
        test_a_idx = []  # subsetting test_set to selected anomalous classes
        for i, row in enumerate(self.test_set):
            row['label'] = row.pop('sentiment')
            if row['label'] in self.normal_classes:
                test_n_idx.append(i)
            else:
                test_a_idx.append(i)
            row['label'] = torch.tensor(0) if row['label'] in self.normal_classes else torch.tensor(1)
            if clean_txt:
                row['text'] = clean_text(row['text'].lower())
            else:
                row['text'] = row['text'].lower()

        # Subset train_set to normal class
        self.train_set = Subset(self.train_set, train_idx_normal)
        # Subset test_set to selected normal classes
        self.test_n_set = Subset(self.test_set, test_n_idx)
        # Subset test_set to selected anomalous classes
        self.test_a_set = Subset(self.test_set, test_a_idx)

        # Make corpus and set encoder
        text_corpus = [
            row['text']
            for row in datasets_iterator(self.train_set, self.test_set)
        ]
        if tokenizer == 'spacy':
            self.encoder = SpacyEncoder(text_corpus,
                                        min_occurrences=3,
                                        append_eos=append_eos)
        if tokenizer == 'bert':
            self.encoder = MyBertTokenizer.from_pretrained('bert-base-uncased',
                                                           cache_dir=root)

        # Encode
        self.max_seq_len = 0
        for row in datasets_iterator(self.train_set, self.test_set):
            if append_sos:
                sos_id = self.encoder.stoi[DEFAULT_SOS_TOKEN]
                row['text'] = torch.cat((torch.tensor(sos_id).unsqueeze(0),
                                         self.encoder.encode(row['text'])))
            else:
                row['text'] = self.encoder.encode(row['text'])
            if len(row['text']) > self.max_seq_len:
                self.max_seq_len = len(row['text'])

        # Compute tf-idf weights
        if use_tfidf_weights:
            compute_tfidf_weights(self.train_set,
                                  self.test_set,
                                  vocab_size=self.encoder.vocab_size)
        else:
            for row in datasets_iterator(self.train_set, self.test_set):
                row['weight'] = torch.empty(0)

        # Get indices after pre-processing
        for i, row in enumerate(self.train_set):
            row['index'] = i
        for i, row in enumerate(self.test_set):
            row['index'] = i

        # length prior
        sent_lengths = [len(row['text']) for row in self.train_set]
        sent_lengths_freq = np.bincount(np.array(sent_lengths))
        sent_lengths_freq = np.concatenate(
            (sent_lengths_freq,
             np.array((max_seq_len_prior - max(sent_lengths)) * [0])),
            axis=0)
        sent_lengths_freq = sent_lengths_freq + 1
        self.length_prior = np.log(sent_lengths_freq) - np.log(
            sent_lengths_freq.sum())
Example No. 5
    def __init__(self, root: str, normal_class=0, tokenizer='spacy', use_tfidf_weights=False, append_sos=False,
                 append_eos=False, clean_txt=False):
        super().__init__(root)

        self.n_classes = 2  # 0: normal, 1: outlier
        classes = ['pos', 'neg']

        if normal_class == -1:
            self.normal_classes = classes
            self.outlier_classes = []
        else:
            self.normal_classes = [classes[normal_class]]
            del classes[normal_class]
            self.outlier_classes = classes

        # Load the imdb dataset
        self.train_set, self.test_set = imdb_dataset(directory=root, train=True, test=True)

        # Pre-process
        self.train_set.columns.add('index')
        self.test_set.columns.add('index')
        self.train_set.columns.remove('sentiment')
        self.test_set.columns.remove('sentiment')
        self.train_set.columns.add('label')
        self.test_set.columns.add('label')
        self.train_set.columns.add('weight')
        self.test_set.columns.add('weight')

        train_idx_normal = []  # for subsetting train_set to normal class
        for i, row in enumerate(self.train_set):
            row['label'] = row.pop('sentiment')
            if row['label'] in self.normal_classes:
                train_idx_normal.append(i)
                row['label'] = torch.tensor(0)
            else:
                row['label'] = torch.tensor(1)
            if clean_txt:
                row['text'] = clean_text(row['text'].lower())
            else:
                row['text'] = row['text'].lower()

        for i, row in enumerate(self.test_set):
            row['label'] = row.pop('sentiment')
            row['label'] = torch.tensor(0) if row['label'] in self.normal_classes else torch.tensor(1)
            if clean_txt:
                row['text'] = clean_text(row['text'].lower())
            else:
                row['text'] = row['text'].lower()

        # Subset train_set to normal class
        self.train_set = Subset(self.train_set, train_idx_normal)

        # Make corpus and set encoder
        text_corpus = [row['text'] for row in datasets_iterator(self.train_set, self.test_set)]
        if tokenizer == 'spacy':
            self.encoder = SpacyEncoder(text_corpus, min_occurrences=3, append_eos=append_eos)
        if tokenizer == 'bert':
            self.encoder = MyBertTokenizer.from_pretrained('bert-base-uncased', cache_dir=root)

        # Encode
        for row in datasets_iterator(self.train_set, self.test_set):
            if append_sos:
                sos_id = self.encoder.stoi[DEFAULT_SOS_TOKEN]
                row['text'] = torch.cat((torch.tensor(sos_id).unsqueeze(0), self.encoder.encode(row['text'])))
            else:
                row['text'] = self.encoder.encode(row['text'])

        # Compute tf-idf weights
        if use_tfidf_weights:
            compute_tfidf_weights(self.train_set, self.test_set, vocab_size=self.encoder.vocab_size)
        else:
            for row in datasets_iterator(self.train_set, self.test_set):
                row['weight'] = torch.empty(0)

        # Get indices after pre-processing
        for i, row in enumerate(self.train_set):
            row['index'] = i
        for i, row in enumerate(self.test_set):
            row['index'] = i
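Assuming this __init__ belongs to an IMDB dataset class (IMDB_DataLoader below is only a placeholder name, not the original class), a quick post-construction sanity check could look like this sketch:

# Sketch only: 'IMDB_DataLoader' stands in for the class whose __init__ is shown above.
dataset = IMDB_DataLoader(root='./data', normal_class=0, tokenizer='spacy')

# Every row kept in train_set should be the "normal" class (label 0) and carry
# an encoded token-id sequence plus an (empty) weight tensor.
for row in datasets_iterator(dataset.train_set):
    assert int(row['label']) == 0
    assert row['text'].dim() == 1
    assert row['weight'].numel() == 0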
Example No. 6
import torch

from torchnlp.datasets import snli_dataset
from torchnlp.text_encoders import WhitespaceEncoder, IdentityEncoder
from torchnlp.utils import datasets_iterator
from torchnlp import word_to_vector

from model import SNLIClassifier
from util import get_args, makedirs, collate_fn

args = get_args()

if args.gpu >= 0:
    torch.cuda.set_device(args.gpu)

# load dataset
train, dev, test = snli_dataset(train=True, dev=True, test=True)

# Preprocess
for row in datasets_iterator(train, dev, test):
    row['premise'] = row['premise'].lower()
    row['hypothesis'] = row['hypothesis'].lower()

# Make Encoders
sentence_corpus = [
    row['premise'] for row in datasets_iterator(train, dev, test)
]
sentence_corpus += [
    row['hypothesis'] for row in datasets_iterator(train, dev, test)
]
sentence_encoder = WhitespaceEncoder(sentence_corpus)

label_corpus = [row['label'] for row in datasets_iterator(train, dev, test)]
label_encoder = IdentityEncoder(label_corpus)
Example No. 7
    def __init__(self,
                 root: str,
                 normal_class=0,
                 tokenizer='spacy',
                 use_tfidf_weights=False,
                 append_sos=False,
                 append_eos=False,
                 clean_txt=False,
                 max_seq_len_prior=None):
        super().__init__(root)

        self.n_classes = 2  # 0: normal, 1: outlier
        classes = list(range(6))

        groups = [
            ['comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware',
             'comp.sys.mac.hardware', 'comp.windows.x'],
            ['rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey'],
            ['sci.crypt', 'sci.electronics', 'sci.med', 'sci.space'],
            ['misc.forsale'],
            ['talk.politics.misc', 'talk.politics.guns', 'talk.politics.mideast'],
            ['talk.religion.misc', 'alt.atheism', 'soc.religion.christian'],
        ]
        short_group_names = ['comp', 'rec', 'sci', 'misc', 'pol', 'rel']
        self.subset = short_group_names[normal_class]

        self.normal_classes = groups[normal_class]
        self.outlier_classes = []
        del classes[normal_class]
        for i in classes:
            self.outlier_classes += groups[i]

        # Load the 20 Newsgroups dataset
        self.train_set, self.test_set = newsgroups20_dataset(
            directory=root,
            train=True,
            test=True,
            clean_txt=clean_txt,
            groups=groups,
            short_group_names=short_group_names)

        # Pre-process
        self.train_set.columns.add('index')
        self.test_set.columns.add('index')
        self.train_set.columns.add('weight')
        self.test_set.columns.add('weight')

        train_idx_normal = []  # for subsetting train_set to normal class
        for i, row in enumerate(self.train_set):
            if row['label'] in self.normal_classes:
                train_idx_normal.append(i)
                row['label'] = torch.tensor(0)
            else:
                row['label'] = torch.tensor(1)
            row['text'] = row['text'].lower()

        test_n_idx = []  # subsetting test_set to selected normal classes
        test_a_idx = []  # subsetting test_set to selected anomalous classes
        for i, row in enumerate(self.test_set):
            if row['label'] in self.normal_classes:
                test_n_idx.append(i)
            else:
                test_a_idx.append(i)
            row['label'] = torch.tensor(0) if row['label'] in self.normal_classes else torch.tensor(1)
            row['text'] = row['text'].lower()

        # Subset train_set to normal class
        self.train_set = Subset(self.train_set, train_idx_normal)
        # Subset test_set to selected normal classes
        self.test_n_set = Subset(self.test_set, test_n_idx)
        # Subset test_set to selected anomalous classes
        self.test_a_set = Subset(self.test_set, test_a_idx)

        # Make corpus and set encoder
        text_corpus = [
            row['text']
            for row in datasets_iterator(self.train_set, self.test_set)
        ]
        if tokenizer == 'spacy':
            self.encoder = SpacyEncoder(text_corpus,
                                        min_occurrences=3,
                                        append_eos=append_eos)
        if tokenizer == 'bert':
            self.encoder = MyBertTokenizer.from_pretrained('bert-base-uncased',
                                                           cache_dir=root)

        # Encode
        self.max_seq_len = 0
        for row in datasets_iterator(self.train_set, self.test_set):
            if append_sos:
                sos_id = self.encoder.stoi[DEFAULT_SOS_TOKEN]
                row['text'] = torch.cat((torch.tensor(sos_id).unsqueeze(0),
                                         self.encoder.encode(row['text'])))
            else:
                row['text'] = self.encoder.encode(row['text'])
            if len(row['text']) > self.max_seq_len:
                self.max_seq_len = len(row['text'])

        # Compute tf-idf weights
        if use_tfidf_weights:
            compute_tfidf_weights(self.train_set,
                                  self.test_set,
                                  vocab_size=self.encoder.vocab_size)
        else:
            for row in datasets_iterator(self.train_set, self.test_set):
                row['weight'] = torch.empty(0)

        # Get indices after pre-processing
        for i, row in enumerate(self.train_set):
            row['index'] = i
        for i, row in enumerate(self.test_set):
            row['index'] = i

        # length prior
        sent_lengths = [len(row['text']) for row in self.train_set]
        sent_lengths_freq = np.bincount(np.array(sent_lengths))
        sent_lengths_freq = np.concatenate(
            (sent_lengths_freq,
             np.array((max_seq_len_prior - max(sent_lengths)) * [0])),
            axis=0)
        sent_lengths_freq = sent_lengths_freq + 1
        self.length_prior = np.log(sent_lengths_freq) - np.log(
            sent_lengths_freq.sum())
Example No. 8
def load_data(data_type,
              preprocessing=False,
              fine_grained=False,
              verbose=False,
              text_length=5000,
              encode=True):
    if data_type == 'imdb':
        train_data, test_data = imdb_dataset(preprocessing=preprocessing,
                                             verbose=verbose,
                                             text_length=text_length)
    elif data_type == 'newsgroups':
        train_data, test_data = newsgroups_dataset(preprocessing=preprocessing,
                                                   verbose=verbose,
                                                   text_length=text_length)
    elif data_type == 'reuters':
        train_data, test_data = reuters_dataset(preprocessing=preprocessing,
                                                fine_grained=fine_grained,
                                                verbose=verbose,
                                                text_length=text_length)
    elif data_type == 'webkb':
        train_data, test_data = webkb_dataset(preprocessing=preprocessing,
                                              verbose=verbose,
                                              text_length=text_length)
    elif data_type == 'cade':
        train_data, test_data = cade_dataset(preprocessing=preprocessing,
                                             verbose=verbose,
                                             text_length=text_length)
    elif data_type == 'dbpedia':
        train_data, test_data = dbpedia_dataset(preprocessing=preprocessing,
                                                verbose=verbose,
                                                text_length=text_length)
    elif data_type == 'agnews':
        train_data, test_data = agnews_dataset(preprocessing=preprocessing,
                                               verbose=verbose,
                                               text_length=text_length)
    elif data_type == 'yahoo':
        train_data, test_data = yahoo_dataset(preprocessing=preprocessing,
                                              verbose=verbose,
                                              text_length=text_length)
    elif data_type == 'sogou':
        train_data, test_data = sogou_dataset(preprocessing=preprocessing,
                                              verbose=verbose,
                                              text_length=text_length)
    elif data_type == 'yelp':
        train_data, test_data = yelp_dataset(preprocessing=preprocessing,
                                             fine_grained=fine_grained,
                                             verbose=verbose,
                                             text_length=text_length)
    elif data_type == 'amazon':
        train_data, test_data = amazon_dataset(preprocessing=preprocessing,
                                               fine_grained=fine_grained,
                                               verbose=verbose,
                                               text_length=text_length)
    else:
        raise ValueError('{} data type not supported.'.format(data_type))

    if encode:
        sentence_corpus = [row['text'] for row in datasets_iterator(train_data)]
        sentence_encoder = WhitespaceEncoder(
            sentence_corpus,
            reserved_tokens=[DEFAULT_PADDING_TOKEN, DEFAULT_UNKNOWN_TOKEN])
        label_corpus = [row['label'] for row in datasets_iterator(train_data)]
        label_encoder = LabelEncoder(label_corpus, reserved_labels=[])

        # Encode
        for row in datasets_iterator(train_data, test_data):
            row['text'] = sentence_encoder.encode(row['text'])
            row['label'] = label_encoder.encode(row['label'])
        return sentence_encoder, label_encoder, train_data, test_data
    else:
        return train_data, test_data
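A hypothetical call (argument values chosen only for illustration): unlike the earlier variant that returns vocabulary sizes, this version hands back the fitted encoders themselves, so the same vocabulary can be reused later.

# Sketch: build the IMDB splits and keep the encoders for reuse at inference time.
sentence_encoder, label_encoder, train_data, test_data = load_data(
    'imdb', preprocessing=True, encode=True)
print('vocab size:', sentence_encoder.vocab_size)
print('number of labels:', label_encoder.vocab_size)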
Example No. 9
def main():
    ROOT_DIR = os.path.join(str(Path.home()), '.torchtext')

    # define parameters and hyperparameters
    args = {
        'data_dir': ROOT_DIR,
        'use_cuda': True,
        'test_batch_size': 128,
        'dev_size': 0.1,
        'checkpoint': True,
        'early_stopping': False,
        'epochs': 5,
        'd_embedding': 300,
        'word_vectors': 'glove.840B.300d',
        'word_vectors_freeze': True,
        'vector_cache_dir': os.path.join(ROOT_DIR, 'vector_cache'),
        'momentum': .9,
        'seed': 42,
        'visdom_env': 'main',
    }

    args = Args(**args)

    vis = visdom.Visdom()
    if not vis.check_connection():
        raise RuntimeError(
            "Visdom server not running. Please run python -m visdom.server")

    torch.manual_seed(args.seed)

    device = torch.device('cuda' if args.use_cuda else 'cpu')

    # Load dataset splits
    train, test = trec_dataset(train=True, test=True, directory=args.data_dir)

    # Create encoders (TODO: best way to persist those?)
    text_corpus = [row['text'] for row in datasets_iterator(train, test)]
    text_encoder = WhitespaceEncoder(text_corpus)

    label_corpus = [row['label'] for row in datasets_iterator(train, test)]
    label_encoder = LabelEncoder(label_corpus)

    # encode dataset splits
    for row in datasets_iterator(train, test):
        row['text'] = text_encoder.encode(row['text'])
        row['label'] = label_encoder.encode(row['label'])

    # create sampler for train / dev split used in dataloader
    train_sampler, dev_sampler = train_test_split_sampler(
        train, test_size=args.dev_size, random_state=args.seed)

    def delete_checkpoint(path):
        checkpoint_files = list(path.glob('checkpoint_model*.pth'))
        if checkpoint_files:
            os.remove(checkpoint_files[0])

    visdom_logger = VisdomRunSummaryLogger(env=args.visdom_env,
                                           clear_batch_summary=True)

    # TODO: abstract this part
    run_config = {'run': 0}

    # train function
    def train_f(config):

        run_name = 'run_%d' % run_config['run']
        run_config['run'] = run_config['run'] + 1

        visdom_logger.new_run(run_name)

        model_path = Path('/tmp/models/')

        delete_checkpoint(model_path)

        train_batch_sampler = FlexibleBucketBatchSampler(
            train,
            config.batch_size,
            sampler=train_sampler,
            drop_last=True,
            sort_key=lambda r: len(r['text']))

        train_loader = DataLoader(train,
                                  batch_sampler=train_batch_sampler,
                                  collate_fn=collate_fn,
                                  pin_memory=config.use_cuda,
                                  num_workers=0)

        dev_batch_sampler = FlexibleBucketBatchSampler(
            train,
            config.test_batch_size,
            drop_last=True,
            sampler=dev_sampler,
            sort_key=lambda r: len(r['text']))

        dev_loader = DataLoader(train,
                                batch_sampler=dev_batch_sampler,
                                collate_fn=collate_fn,
                                pin_memory=config.use_cuda,
                                num_workers=0)

        test_sampler = BucketBatchSampler(test,
                                          config.test_batch_size,
                                          drop_last=True,
                                          sort_key=lambda r: len(r['text']))

        test_loader = DataLoader(test,
                                 batch_sampler=test_sampler,
                                 collate_fn=collate_fn,
                                 pin_memory=config.use_cuda,
                                 num_workers=0)

        embedding = nn.Embedding(text_encoder.vocab_size, config.d_embedding)

        if config.word_vectors_freeze:
            embedding.weight.requires_grad = False

        if config.word_vectors:
            # Load word vectors
            word_vectors = word_to_vector.aliases[config.word_vectors](
                cache=config.vector_cache_dir)
            for i, token in enumerate(text_encoder.vocab):
                embedding.weight.data[i] = word_vectors[token]
            print(
                'Found vectors for %d tokens in vocabulary' %
                len([t for t in text_encoder.vocab if t in word_vectors.stoi]))

        model = LSTMClassifier(d_in=embedding.embedding_dim,
                               d_out=label_encoder.vocab_size,
                               d_hidden=config.d_hidden,
                               dropout=config.dropout,
                               embedding=embedding)
        model.to(device)

        optimizer_params = list(
            filter(lambda p: p.requires_grad, model.parameters()))

        optimizer = torch.optim.SGD(optimizer_params,
                                    lr=config.lr,
                                    momentum=config.momentum)

        trainer = create_supervised_trainer(model,
                                            optimizer,
                                            F.nll_loss,
                                            device=device)

        evaluator_train = \
            create_supervised_evaluator(model,
                                        metrics={
                                            'accuracy': CategoricalAccuracy(),
                                            'nll': Loss(F.nll_loss)},
                                        device=device)

        evaluator_dev = \
            create_supervised_evaluator(model,
                                        metrics={
                                            'accuracy': CategoricalAccuracy(),
                                            'nll': Loss(F.nll_loss)},
                                        device=device)

        visdom_logger.attach_trainer(trainer)
        visdom_logger.attach_evaluator(evaluator_train, trainer, phase='train')
        visdom_logger.attach_evaluator(evaluator_dev, trainer, phase='dev')

        lr_scheduler = torch.optim.lr_scheduler.LambdaLR(
            optimizer, lambda epoch_: 1. / (1 + config.lr_decay *
                                            (epoch_ - 1)))

        # scoring function for early stopping and checkpointing
        def score_function(engine):
            dev_loss = engine.state.metrics['nll']
            return -dev_loss

        early_stopping = EarlyStopping(patience=15,
                                       score_function=score_function,
                                       trainer=trainer)

        def checkpoint_score_function(engine):
            dev_accuracy = engine.state.metrics['accuracy']
            return dev_accuracy

        checkpoint = ModelCheckpoint('/tmp/models',
                                     'checkpoint',
                                     score_function=checkpoint_score_function,
                                     n_saved=1,
                                     create_dir=True,
                                     score_name="dev_accuracy")

        # lets train!
        train_model(
            model=model,
            trainer=trainer,
            epochs=config.epochs,
            evaluator_train=evaluator_train,
            evaluator_dev=evaluator_dev,
            train_loader=train_loader,
            dev_loader=dev_loader,
            lr_scheduler=lr_scheduler,
            early_stopping=early_stopping if config.early_stopping else None,
            checkpoint=checkpoint if config.checkpoint else None)

        # load checkpointed (best) model and evaluate on test loader
        model = torch.load(list(model_path.glob('checkpoint_model*.pth'))[0])

        test_evaluator = \
            create_supervised_evaluator(model,
                                        metrics={
                                            'accuracy': CategoricalAccuracy(),
                                            'nll': Loss(F.nll_loss)},
                                        device=device)

        test_evaluator.run(test_loader)
        metrics = test_evaluator.state.metrics
        print("Test Results: Avg accuracy: {:.2f} Avg loss: {:.2f}".format(
            metrics['accuracy'], metrics['nll']))

        test_evaluator.run(dev_loader)
        metrics = test_evaluator.state.metrics
        return metrics['nll']

    # hyperparameter tuning!
    hp_opt = HPOptimizer(args=args,
                         strategy='gp',
                         space=[
                             Real(0.1, 0.5, name='dropout'),
                             Categorical([50, 100, 150, 200], name='d_hidden'),
                             Real(1e-4, 1, prior='log-uniform', name='lr'),
                             Real(1e-3,
                                  1,
                                  prior='log-uniform',
                                  name='lr_decay'),
                             Categorical([4, 8, 16, 32, 64, 128],
                                         name='batch_size')
                         ])

    hp_opt.add_callback(visdom_logger.run_summary)

    result = hp_opt.minimize(train_f, n_calls=10)
    print(result)
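The snippet defines main() but stops before calling it; a standard entry-point guard (not part of the original) is the usual way to run it as a script:

if __name__ == '__main__':
    main()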