def __init__(self, data_dir, txt_fname, verbose=True):
    """Load (label, filename) pairs from *txt_fname* and store image paths/labels.

    :param data_dir: root directory; each image lives at ``data_dir/<label>/<fname>``
    :param txt_fname: name of the listing file parsed by ``utils.process_txt``
    :param verbose: unused in this constructor; kept for interface compatibility
    """
    entries = utils.process_txt(os.path.join(data_dir, txt_fname))
    # Build parallel sequences: full image path and the integer class id
    # looked up in the module-level ``classes_dict``.
    paths = [os.path.join(data_dir, lbl, fn) for lbl, fn in entries]
    ids = [classes_dict[lbl] for lbl, _ in entries]
    self.img_paths = np.asarray(paths)
    self.labels = np.asarray(ids)
    # Sanity check: exactly one label per image path.
    assert self.img_paths.shape[0] == self.labels.shape[0]
def parse_corpus(self, raw=False):
    """Parse the corpus file at ``self.corpus_path`` into a dict keyed by 1-based row number.

    :param raw: if True, store each message verbatim; otherwise pass it
        through ``utils.process_txt`` (tokenize/filter).
    :return: dict mapping 1-based row index -> raw message string, or the
        result of ``utils.process_txt`` on that message.
    """
    corpus = {}
    with open(self.corpus_path) as f:
        # enumerate replaces the manual ``i = i + 1`` counter; keys are
        # already ints, so the original ``int(i)`` cast was redundant.
        for i, row in enumerate(csv.reader(f), start=1):
            msg = str(row[0])
            corpus[i] = msg if raw else utils.process_txt(msg)
    return corpus
def __init__(self, data_dir, txt_fname="data.txt", val_split=0.1, seed=11, verbose=True, convert="L"):
    """Load image paths/labels listed in *txt_fname*, optionally splitting off a validation set.

    :param data_dir: root directory; each image lives at ``data_dir/<label>/<fname>``
    :param txt_fname: listing file of ``(label, fname)`` pairs parsed by ``utils.process_txt``
    :param val_split: fraction of samples held out for validation (<= 0 disables the split)
    :param seed: numpy RNG seed for the shuffle that precedes the split
    :param verbose: print split sizes when True
    :param convert: image-mode string stored on the instance (presumably a
        PIL ``Image.convert`` mode, e.g. "L" = grayscale — confirm at use site)
    """
    self.verbose = verbose
    self.val_split = val_split
    self.convert = convert

    file_contents = utils.process_txt(os.path.join(data_dir, txt_fname))
    img_paths = []
    labels = []
    for label, fname in file_contents:
        img_paths.append(os.path.join(data_dir, label, fname))
        labels.append(label)

    # Build a label-name -> integer-id mapping in order of first appearance.
    classes = {}
    key = 0
    for label in labels:
        if label not in classes:
            classes[label] = key
            key += 1
    img_paths = np.asarray(img_paths)
    labels = np.asarray([classes[x] for x in labels])

    self.train_img_paths = None
    self.train_labels = None
    self.val_img_paths = None
    self.val_labels = None  # fixed: was misspelled ``val_lables``, leaving val_labels unset when val_split <= 0
    if val_split > 0.0:
        # Shuffle once with a fixed seed, then carve off the tail as validation.
        np.random.seed(seed)
        n_train = int(labels.shape[0] * (1.0 - val_split))
        order = np.random.permutation(labels.shape[0])
        self.val_img_paths = np.asarray(img_paths[order[n_train:]])
        self.val_labels = np.asarray(labels[order[n_train:]])
        self.train_img_paths = np.asarray(img_paths[order[:n_train]])
        self.train_labels = np.asarray(labels[order[:n_train]])
        assert self.val_img_paths.shape[0] == self.val_labels.shape[0]
        if self.verbose:
            print("{} validation split from training".format(self.val_labels.shape[0]))
            print("{} training remains".format(self.train_labels.shape[0]))
    else:
        self.train_img_paths = img_paths
        # fixed: original assigned ``img_paths`` to train_labels here,
        # storing file paths where integer labels belong.
        self.train_labels = labels
        if self.verbose:
            print("{} training images".format(self.train_labels.shape[0]))
def parse_corpus(self, raw=False):
    """Parse the tab-delimited corpus file into a dict keyed by document id.

    Each row of ``self.corpus_path`` is ``<id>\\t<msg>``. The file is opened
    with ``utf-8-sig`` so a leading BOM does not corrupt the first id, and
    undecodable bytes are replaced — matching the original per-field
    ``unicode(..., "utf-8-sig")`` / ``unicode(..., errors="replace")``
    behavior, which is Python-2-only and raises NameError on Python 3.

    :param raw: if True, keep each message verbatim; otherwise tokenize and
        filter it with ``utils.process_txt``.
    :return: dict mapping int document id -> raw message string or the
        result of ``utils.process_txt`` on that message.
    """
    corpus = {}
    with open(self.corpus_path, encoding="utf-8-sig", errors="replace") as f:
        reader = csv.reader(f, delimiter='\t')
        for doc_id, msg in reader:  # renamed: ``id`` shadowed the builtin
            corpus[int(doc_id)] = msg if raw else utils.process_txt(msg)
    return corpus
def __init__(self, data_dir, val_split=0.1, seed=11, uniform_sampling=False, verbose=True):
    """Load images listed in ``data.txt``, optionally splitting off a validation set.

    :param data_dir: root directory; each image lives at ``data_dir/<label>/<fname>``
        and labels are mapped to ints via the module-level ``classes_dict``
    :param val_split: fraction of samples held out for validation (<= 0 disables the split)
    :param seed: numpy RNG seed for the shuffle that precedes the split
    :param uniform_sampling: stored on the instance; not used in this constructor
    :param verbose: print split sizes when True
    """
    self.uniform_sampling = uniform_sampling
    self.verbose = verbose
    self.val_split = val_split

    file_contents = utils.process_txt(os.path.join(data_dir, "data.txt"))
    img_paths = []
    labels = []
    for label, fname in file_contents:
        img_paths.append(os.path.join(data_dir, label, fname))
        labels.append(classes_dict[label])
    img_paths = np.asarray(img_paths)
    labels = np.asarray(labels)

    self.train_img_paths = None
    self.train_labels = None
    self.val_img_paths = None
    self.val_labels = None  # fixed: was misspelled ``val_lables``, leaving val_labels unset when val_split <= 0
    if val_split > 0.0:
        # Shuffle once with a fixed seed, then carve off the tail as validation.
        np.random.seed(seed)
        n_train = int(labels.shape[0] * (1.0 - val_split))
        order = np.random.permutation(labels.shape[0])
        self.val_img_paths = np.asarray(img_paths[order[n_train:]])
        self.val_labels = np.asarray(labels[order[n_train:]])
        self.train_img_paths = np.asarray(img_paths[order[:n_train]])
        self.train_labels = np.asarray(labels[order[:n_train]])
        assert self.val_img_paths.shape[0] == self.val_labels.shape[0]
        if self.verbose:
            print("{} validation split from training".format(self.val_labels.shape[0]))
            print("{} training remains".format(self.train_labels.shape[0]))
    else:
        self.train_img_paths = img_paths
        # fixed: original assigned ``img_paths`` to train_labels here,
        # storing file paths where integer labels belong.
        self.train_labels = labels
        if self.verbose:
            print("{} training images".format(self.train_labels.shape[0]))
def __init__(self, data_dir, txt_fname, verbose=True, convert="L"):
    """Load (label, filename) pairs and derive integer labels from an auto-built class dict.

    :param data_dir: root directory; each image lives at ``data_dir/<label>/<fname>``
    :param txt_fname: listing file parsed by ``utils.process_txt``
    :param verbose: unused in this constructor; kept for interface compatibility
    :param convert: image-mode string stored on the instance (presumably a
        PIL ``Image.convert`` mode — confirm at use site)
    """
    self.convert = convert
    entries = utils.process_txt(os.path.join(data_dir, txt_fname))
    paths = [os.path.join(data_dir, lbl, fn) for lbl, fn in entries]
    names = [lbl for lbl, _ in entries]
    # Map each label name to an integer id in order of first appearance.
    class_ids = {}
    for name in names:
        if name not in class_ids:
            class_ids[name] = len(class_ids)
    self.img_paths = np.asarray(paths)
    self.labels = np.asarray([class_ids[n] for n in names])
    # Sanity check: exactly one label per image path.
    assert self.img_paths.shape[0] == self.labels.shape[0]