def test_get_bow_and_categories(self):
    """Check the sizes of the training bag-of-words and its categories.

    Requests a bag-of-words limited to 5000 features and asserts that the
    training matrix has shape (25000, 5000) and that 25000 training
    categories are returned. The test split is unpacked but not inspected.
    """
    vocabulary_size = 5000
    expected_samples = 25000

    dataset = Imdb(config.DATASETS_FOLDER)
    # The right-hand pattern still requires the test split to be a pair.
    train_split, (_, _) = dataset.get_bow_and_categories(
        max_features=vocabulary_size)
    bow_matrix, categories = train_split

    self.assertEqual(bow_matrix.shape, (expected_samples, vocabulary_size))
    self.assertEqual(len(categories), expected_samples)
def __init__(self, image_set, year, use_diff=False):
    """Configure a PASCAL VOC image database for one year and image set.

    The dataset name is ``'voc_<year>_<image_set>'`` with ``'_diff'``
    appended when ``use_diff`` is true; it is passed to ``Imdb.__init__``.

    Args:
        image_set: Image-set name; becomes part of the dataset name and is
            stored as ``_image_set``.
        year: Year as a string; it is concatenated into the dataset name
            and into the ``VOC<year>`` data directory name.
        use_diff: Recorded in ``self.config['use_diff']`` and reflected in
            the dataset name.

    Raises:
        AssertionError: If the devkit path or the data path does not exist.
    """
    diff_suffix = '_diff' if use_diff else ''
    Imdb.__init__(self, 'voc_' + year + '_' + image_set + diff_suffix)

    self._year = year
    self._image_set = image_set

    # Data lives in <devkit>/VOC<year>.
    self._devkit_path = self._get_default_path()
    self._data_path = os.path.join(self._devkit_path, 'VOC' + self._year)

    # Background must stay at index 0; the 20 object classes follow.
    self._classes = (
        '__background__',
        'aeroplane', 'bicycle', 'bird', 'boat',
        'bottle', 'bus', 'car', 'cat',
        'chair', 'cow', 'diningtable', 'dog',
        'horse', 'motorbike', 'person', 'pottedplant',
        'sheep', 'sofa', 'train', 'tvmonitor',
    )
    # Map each class name to its position.
    self._class_to_ind = dict(zip(self.classes, range(self.num_classes)))

    self._image_ext = '.jpg'
    self._image_index = self._load_image_set_index()

    # The ground-truth roidb is the default roidb handler.
    self._roidb_handler = self.gt_roidb

    # A fresh random UUID string per instance.
    self._salt = str(uuid.uuid4())
    self._comp_id = 'comp4'

    # PASCAL specific config options
    self.config = dict(
        cleanup=True,
        use_salt=True,
        use_diff=use_diff,
        matlab_eval=False,
        rpn_file=None,
    )

    # Both directories must already be present on disk.
    assert os.path.exists(self._devkit_path), (
        'VOCdevkit path does not exist: {}'.format(self._devkit_path))
    assert os.path.exists(self._data_path), (
        'Path does not exist: {}'.format(self._data_path))
def rpn_roidb(self):
    """Return the RPN roidb, merged with ground truth where it is loaded.

    For year 2007, or for any image set other than ``'test'``, the
    ground-truth roidb is loaded, handed to ``_load_rpn_roidb`` and the two
    are combined with ``Imdb.merge_roidbs``. Otherwise the RPN roidb is
    loaded with ``None`` in place of the ground truth and returned as is.
    """
    # NOTE(review): presumably non-2007 test sets ship without annotations,
    # hence no ground truth to merge -- confirm against the dataset.
    skip_ground_truth = (int(self._year) != 2007
                         and self._image_set == 'test')
    if skip_ground_truth:
        return self._load_rpn_roidb(None)

    ground_truth = self.gt_roidb()
    proposals = self._load_rpn_roidb(ground_truth)
    return Imdb.merge_roidbs(ground_truth, proposals)
# Accuracy: 0.76628 (with 5k words) 0.80664 (with 50k words) 0.81732 (with 250k words) 0.81828 (with 500k words) 0.81892 (with entire vocab) """
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

import config
from datasets.imdb import Imdb

# Step 1: fetch the train and test features with their labels.
print('Loading dataset...', flush=True)
imdb = Imdb(config.DATASETS_FOLDER)
train_split, test_split = imdb.get_bof_fasttext_wiki_news_300d_1M()
train_x_bof, train_y = train_split
test_x_bof, test_y = test_split

# Step 2: fit a logistic regression with default settings.
print('Training model...', flush=True)
lm = LogisticRegression()
lm.fit(train_x_bof, train_y)

# Step 3: report accuracy of the predictions on the held-out test split.
ps = lm.predict(test_x_bof)
acc = accuracy_score(test_y, ps)
print(f'Accuracy: {acc}', flush=True)
""" from torch import FloatTensor as T from torch.autograd import Variable as V from torch.nn import CrossEntropyLoss from torch import nn import matplotlib.pyplot as plt from sklearn.metrics import accuracy_score import config from datasets.imdb import Imdb # Load dataset print('Loading dataset...') imdb = Imdb(config.DATASETS_FOLDER) (train_x_bow, train_y), (test_x_bow, test_y) = imdb.get_bow_and_categories(max_features=5000) # Pack dataset to torch Variables train_x_bow = V(T(train_x_bow.toarray()), requires_grad=False) test_x_bow = V(T(test_x_bow.toarray()), requires_grad=False) train_y = V(T(train_y), requires_grad=False).long() test_y = V(T(test_y), requires_grad=False).long() # Compute train mean and std train_mean = train_x_bow.mean(0) train_std = train_x_bow.std(0) # Normalize train and test sets
def test_get_texts_and_categories(self):
    """Check that the training texts and categories both number 25000.

    The test split is unpacked but not inspected.
    """
    expected_samples = 25000

    dataset = Imdb(config.DATASETS_FOLDER)
    # The right-hand pattern still requires the test split to be a pair.
    train_split, (_, _) = dataset.get_texts_and_categories()
    texts, categories = train_split

    self.assertEqual(len(texts), expected_samples)
    self.assertEqual(len(categories), expected_samples)
# Accuracy 0.85144 (with 5k words) 0.87032 (with the entire vocab) """
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

import config
from datasets.imdb import Imdb

# Step 1: fetch the bag-of-words features (capped at 5000) and labels.
print('Loading dataset...')
imdb = Imdb(config.DATASETS_FOLDER)
train_split, test_split = imdb.get_bow_and_categories(max_features=5000)
train_x_bow, train_y = train_split
test_x_bow, test_y = test_split

# Step 2: fit a logistic regression with default settings.
print('Training model...')
lm = LogisticRegression()
lm.fit(train_x_bow, train_y)

# Step 3: report accuracy of the predictions on the held-out test split.
ps = lm.predict(test_x_bow)
acc = accuracy_score(test_y, ps)
print(f'Accuracy: {acc}')