Пример #1
0
    def __init__(self, data_dir, txt_fname, verbose=True):
        """Load image paths and integer class labels from a listing file.

        The listing at ``os.path.join(data_dir, txt_fname)`` is parsed by
        ``utils.process_txt``; every entry it yields is unpacked as a
        ``(label, fname)`` pair. The image path is built as
        ``data_dir/label/fname`` and the label is translated to its class id
        through ``classes_dict`` (a name defined outside this method).

        Args:
            data_dir: Directory containing the listing file and the images.
            txt_fname: Name of the listing file inside ``data_dir``.
            verbose: Accepted for signature compatibility; not used here.

        Raises:
            KeyError: If a label is missing from ``classes_dict``.
        """
        listing_path = os.path.join(data_dir, txt_fname)
        records = utils.process_txt(listing_path)

        paths, targets = [], []
        for class_name, file_name in records:
            paths.append(os.path.join(data_dir, class_name, file_name))
            targets.append(classes_dict[class_name])

        self.img_paths = np.asarray(paths)
        self.labels = np.asarray(targets)
        # Sanity check: exactly one label per image path.
        assert self.img_paths.shape[0] == self.labels.shape[0]
 def parse_corpus(self, raw=False):
     """Read the CSV corpus at ``self.corpus_path`` into a dictionary.

     Rows are numbered consecutively starting at 1, and only the first
     column of each row is used as the message.

     :param raw: if True, store the message text unchanged; otherwise store
         the result of ``utils.process_txt`` applied to the message.
     :return: dict mapping the 1-based row number to the stored value.
     """
     documents = {}
     with open(self.corpus_path) as handle:
         for row_number, row in enumerate(csv.reader(handle), start=1):
             message = str(row[0])
             documents[row_number] = message if raw else utils.process_txt(message)
     return documents
Пример #3
0
    def __init__(self, data_dir, txt_fname="data.txt", val_split=0.1, seed=11, verbose=True, convert="L"):
        """Index labelled images and optionally hold out a validation split.

        The listing at ``os.path.join(data_dir, txt_fname)`` is parsed by
        ``utils.process_txt``; every entry it yields is unpacked as a
        ``(label, fname)`` pair and the image path is built as
        ``data_dir/label/fname``. Distinct labels are numbered 0, 1, 2, ...
        in order of first appearance.

        Args:
            data_dir: Directory containing the listing file and the images.
            txt_fname: Name of the listing file inside ``data_dir``.
            val_split: Fraction of the samples held out for validation. When
                it is not greater than 0, all samples go to the training set
                and the validation attributes stay ``None``.
            seed: Value passed to ``np.random.seed`` before shuffling. Only
                used when ``val_split > 0``; note this reseeds NumPy's global
                random state.
            verbose: If true, print the resulting split sizes.
            convert: Stored on ``self.convert``; not otherwise used here.

        Attributes set:
            train_img_paths, train_labels: arrays for the training set.
            val_img_paths, val_labels: arrays for the validation set, or
                ``None`` when no split was requested.
        """
        self.verbose = verbose
        self.val_split = val_split
        self.convert = convert

        file_contents = utils.process_txt(os.path.join(data_dir, txt_fname))
        img_paths = []
        labels = []
        for label, fname in file_contents:
            img_paths.append(os.path.join(data_dir, label, fname))
            labels.append(label)

        # Create a classes dict automatically from the labels: each new label
        # receives the next free integer id.
        siemese_classes_dict = {}
        for label in labels:
            if label not in siemese_classes_dict:
                siemese_classes_dict[label] = len(siemese_classes_dict)

        img_paths = np.asarray(img_paths)
        labels = np.asarray([siemese_classes_dict[x] for x in labels])

        self.train_img_paths = None
        self.train_labels = None
        self.val_img_paths = None
        # Was misspelled ``val_lables``, which left ``val_labels`` undefined
        # whenever no validation split was requested.
        self.val_labels = None

        if val_split > 0.0:
            np.random.seed(seed)

            n_train = int(labels.shape[0] * (1.0 - val_split))
            order = np.random.permutation(labels.shape[0])
            train_idx, val_idx = order[:n_train], order[n_train:]

            self.val_img_paths = img_paths[val_idx]
            self.val_labels = labels[val_idx]

            self.train_img_paths = img_paths[train_idx]
            self.train_labels = labels[train_idx]
            assert self.val_img_paths.shape[0] == self.val_labels.shape[0]

            if self.verbose:
                print("{} validation split from training".format(self.val_labels.shape[0]))
                print("{} training remains".format(self.train_labels.shape[0]))

        else:
            self.train_img_paths = img_paths
            # Was ``img_paths``: the training labels must be the label array.
            self.train_labels = labels

            if self.verbose:
                print("{} training images".format(self.train_labels.shape[0]))
Пример #4
0
 def parse_corpus(self, raw=False):
     '''
     Read the tab-separated corpus file at ``self.corpus_path`` into a single dictionary keyed by document id.

     Each row must hold exactly two columns: the document id and the message. The id is decoded as
     "utf-8-sig" (which drops a leading byte-order mark) and converted to int; the message is decoded
     with undecodable bytes replaced.

     :param raw: if True, store the decoded message unchanged; otherwise store the result of
         ``utils.process_txt`` applied to the decoded message
     :return: a dictionary mapping each integer document id to either the raw message or its processed form
     '''
     documents = {}
     with open(self.corpus_path) as handle:
         for doc_id, text in csv.reader(handle, delimiter='\t'):
             message = unicode(text, errors="replace")
             if not raw:
                 message = utils.process_txt(message)
             documents[int(unicode(doc_id, "utf-8-sig"))] = message
     return documents
Пример #5
0
    def __init__(self, data_dir, val_split=0.1, seed=11, uniform_sampling=False, verbose=True):
        """Index labelled images and optionally hold out a validation split.

        The listing ``data_dir/data.txt`` is parsed by ``utils.process_txt``;
        every entry it yields is unpacked as a ``(label, fname)`` pair. The
        image path is built as ``data_dir/label/fname`` and the label is
        translated to its class id through ``classes_dict`` (a name defined
        outside this method).

        Args:
            data_dir: Directory containing ``data.txt`` and the images.
            val_split: Fraction of the samples held out for validation. When
                it is not greater than 0, all samples go to the training set
                and the validation attributes stay ``None``.
            seed: Value passed to ``np.random.seed`` before shuffling. Only
                used when ``val_split > 0``; note this reseeds NumPy's global
                random state.
            uniform_sampling: Stored on ``self.uniform_sampling``; not
                otherwise used here.
            verbose: If true, print the resulting split sizes.

        Attributes set:
            train_img_paths, train_labels: arrays for the training set.
            val_img_paths, val_labels: arrays for the validation set, or
                ``None`` when no split was requested.

        Raises:
            KeyError: If a label is missing from ``classes_dict``.
        """
        self.uniform_sampling = uniform_sampling
        self.verbose = verbose
        self.val_split = val_split

        file_contents = utils.process_txt(os.path.join(data_dir, "data.txt"))

        img_paths = []
        labels = []

        for label, fname in file_contents:
            img_paths.append(os.path.join(data_dir, label, fname))
            labels.append(classes_dict[label])

        img_paths = np.asarray(img_paths)
        labels = np.asarray(labels)

        self.train_img_paths = None
        self.train_labels = None
        self.val_img_paths = None
        # Was misspelled ``val_lables``, which left ``val_labels`` undefined
        # whenever no validation split was requested.
        self.val_labels = None

        if val_split > 0.0:
            np.random.seed(seed)

            n_train = int(labels.shape[0] * (1.0 - val_split))
            order = np.random.permutation(labels.shape[0])
            train_idx, val_idx = order[:n_train], order[n_train:]

            self.val_img_paths = img_paths[val_idx]
            self.val_labels = labels[val_idx]

            self.train_img_paths = img_paths[train_idx]
            self.train_labels = labels[train_idx]
            assert self.val_img_paths.shape[0] == self.val_labels.shape[0]

            if self.verbose:
                print("{} validation split from training".format(self.val_labels.shape[0]))
                print("{} training remains".format(self.train_labels.shape[0]))

        else:
            self.train_img_paths = img_paths
            # Was ``img_paths``: the training labels must be the label array.
            self.train_labels = labels

            if self.verbose:
                print("{} training images".format(self.train_labels.shape[0]))
Пример #6
0
    def __init__(self, data_dir, txt_fname, verbose=True, convert="L"):
        """Load image paths and automatically numbered labels from a listing.

        The listing at ``os.path.join(data_dir, txt_fname)`` is parsed by
        ``utils.process_txt``; every entry it yields is unpacked as a
        ``(label, fname)`` pair and the image path is built as
        ``data_dir/label/fname``. Distinct labels are numbered 0, 1, 2, ...
        in order of first appearance.

        Args:
            data_dir: Directory containing the listing file and the images.
            txt_fname: Name of the listing file inside ``data_dir``.
            verbose: Accepted for signature compatibility; not used here.
            convert: Stored on ``self.convert``; not otherwise used here.
        """
        self.convert = convert

        listing = utils.process_txt(os.path.join(data_dir, txt_fname))
        paths, names = [], []
        for name, file_name in listing:
            paths.append(os.path.join(data_dir, name, file_name))
            names.append(name)

        # Give every previously unseen label the next free integer id.
        label_to_id = {}
        for name in names:
            label_to_id.setdefault(name, len(label_to_id))

        self.img_paths = np.asarray(paths)
        self.labels = np.asarray([label_to_id[name] for name in names])
        # Sanity check: exactly one label per image path.
        assert self.img_paths.shape[0] == self.labels.shape[0]