def load_obj(load_path):
    """
    Loads a saved on-disk representation to a python data structure.

    We currently support the following file formats:

        * python pickle (.pkl), optionally gzip compressed (.gz)

    Arguments:
        load_path (str): where to the load the serialized object
                         (full path and file name)

    Returns:
        object: the deserialized python data structure.

    Raises:
        AttributeError: if the pickled object references an interface
                        that has changed since serialization.
    """
    if isinstance(load_path, str):
        load_path = os.path.expandvars(os.path.expanduser(load_path))
        if load_path.endswith('.gz'):
            import gzip
            # gzip.open defaults to binary mode, which pickle requires
            load_path = gzip.open(load_path)
        else:
            # pickle streams are binary: 'rb' is required on python 3
            load_path = open(load_path, 'rb')
    fname = load_path.name
    logger.debug("deserializing object from: %s", fname)
    try:
        return pickle.load(load_path)
    except AttributeError:
        msg = ("Problems deserializing: %s. Its possible the interface "
               "for this object has changed since being serialized. You "
               "may need to remove and recreate it." % load_path)
        logger.error(msg)
        raise AttributeError(msg)
def load_data(self):
    """
    Load the training/validation arrays, either from an on-disk pickle
    cache or by processing the raw HDF5 input file.

    Side effects:
        Sets self.nfeats, self.train_x, self.train_y, self.valid_x and
        self.valid_y, and writes a pickle cache next to the input data.
    """
    data_file = os.path.join(
        self.path,
        'findata-' + str(self.nlags) + '-' + str(self.quick) + '.pkl')
    if os.path.exists(data_file):
        print("Loading cached data from %s" % data_file)
        # pickle caches are binary; the python 2 file() builtin no
        # longer exists, so use open(..., 'rb') with a context manager
        with open(data_file, 'rb') as fd:
            (self.nfeats, self.train_x, self.train_y,
             self.valid_x, self.valid_y) = pickle.load(fd)
        return
    print("Processing data...")
    full = pd.read_hdf(os.path.join(self.path, self.filename), 'train')
    # impute missing values with per-column medians
    meds = full.median(axis=0)
    full.fillna(meds, inplace=True)
    cols = [col for col in full.columns
            if col not in ['id', 'timestamp', 'y']]
    self.nfeats = len(cols)
    uniq_ts = full['timestamp'].unique()
    # integer (floor) division: a float index raises TypeError on python 3
    mid = uniq_ts[len(uniq_ts) // 2]
    # time-based split: earlier half trains, later half validates
    train = full[full.timestamp < mid].reset_index()
    valid = full[full.timestamp >= mid].reset_index()
    if self.quick:
        # quick mode: restrict to a small subset of ids for fast iteration
        train = train[train.id < 200].reset_index()
        valid = valid[valid.id < 200].reset_index()
    train_x, train_y = self.process(train, cols, self.nlags)
    valid_x, valid_y = self.process(valid, cols, self.nlags)
    self.train_x, self.train_y = self.shuffle(train_x, train_y)
    self.valid_x, self.valid_y = valid_x, valid_y
    with open(data_file, 'wb') as fd:
        pickle.dump((self.nfeats, self.train_x, self.train_y,
                     self.valid_x, self.valid_y), fd)
    print("Saved data to %s" % data_file)
def load_obj(load_path, verbose=True):
    """
    Loads a saved on-disk representation to a python data structure.

    We currently support the following file formats:

        * python pickle (.pkl), optionally gzip compressed (.gz)

    Arguments:
        load_path (str): where to the load the serialized object
                         (full path and file name)
        verbose (bool): whether to log the file being deserialized.
                        Defaults to True.

    Returns:
        object: the deserialized python data structure.

    Raises:
        AttributeError: if the pickled object references an interface
                        that has changed since serialization.
    """
    if isinstance(load_path, str):
        load_path = os.path.expandvars(os.path.expanduser(load_path))
        if load_path.endswith('.gz'):
            import gzip
            # gzip.open defaults to binary mode, which pickle requires
            load_path = gzip.open(load_path)
        else:
            # pickle streams are binary: 'rb' is required on python 3
            load_path = open(load_path, 'rb')
    fname = load_path.name
    if verbose:
        # logger.warn is a deprecated alias for logger.warning
        logger.warning("deserializing object from: %s", fname)
    try:
        return pickle.load(load_path)
    except AttributeError:
        msg = ("Problems deserializing: %s. Its possible the interface "
               "for this object has changed since being serialized. You "
               "may need to remove and recreate it." % load_path)
        logger.error(msg)
        raise AttributeError(msg)
def pad_data(path, vocab_size=20000, sentence_length=100, oov=2,
             start=1, index_from=3, seed=113, test_split=0.2):
    """
    Load a pickled (X, y) text dataset, shuffle it, remap word indices,
    split into train/test sets and pad sentences to a fixed length.

    Arguments:
        path (str): path to a pickle file containing (X, y), where X is
                    a list of word-index lists and y the labels.
        vocab_size (int): words with index >= vocab_size map to oov.
                          If falsy, inferred from the data.
        sentence_length (int): length sentences are padded/truncated to.
        oov (int): index used for out-of-vocabulary words (None disables).
        start (int): index prepended to each sentence (None disables).
        index_from (int): offset added to every word index, reserving
                          low indices: 0 (padding), 1 (start), 2 (OOV).
        seed (int): RNG seed used for the shuffle.
        test_split (float): fraction of the data held out for test.

    Returns:
        tuple: (X_train, y_train), (X_test, y_test), nclass
    """
    # context manager guarantees the handle is closed even on error
    with open(path, 'rb') as f:
        X, y = pickle.load(f)

    # shuffle X and y in unison by re-seeding between the two shuffles
    np.random.seed(seed)
    np.random.shuffle(X)
    np.random.seed(seed)
    np.random.shuffle(y)

    if start is not None:
        X = [[start] + [w + index_from for w in x] for x in X]
    else:
        X = [[w + index_from for w in x] for x in X]

    if not vocab_size:
        vocab_size = max(max(x) for x in X)

    # by convention, use 2 as OOV word
    # reserve 'index_from' (=3 by default) characters:
    # 0 (padding), 1 (start), 2 (OOV)
    if oov is not None:
        X = [[oov if w >= vocab_size else w for w in x] for x in X]

    # compute the split point once instead of four times
    split = int(len(X) * (1 - test_split))
    X_train, X_test = X[:split], X[split:]
    y_train, y_test = y[:split], y[split:]

    X_train = pad_sentences(X_train, sentence_length=sentence_length)
    y_train = np.array(y_train).reshape((len(y_train), 1))
    X_test = pad_sentences(X_test, sentence_length=sentence_length)
    y_test = np.array(y_test).reshape((len(y_test), 1))

    nclass = 1 + max(np.max(y_train), np.max(y_test))
    return (X_train, y_train), (X_test, y_test), nclass
def compare_files(file1, file2):
    """
    Helper function to compare two serialized model files

    This is only comparing the model weights and states and layer
    config parameters

    Arguments:
        file1 (str): path to the first serialized model file.
        file2 (str): path to the second serialized model file.

    Returns:
        bool: True if the two file match
    """
    models = []
    for fn in [file1, file2]:
        assert os.path.exists(fn), 'Could not find file %s' % fn
        # pickle streams are binary: 'rb' (not 'r') is required on python 3
        with open(fn, 'rb') as fid:
            models.append(ModelDescription(pickle.load(fid)))
    return models[0] == models[1]
def deserialize(load_path, verbose=True):
    """
    Converts a serialized object into a python data structure.

    We currently support reading from the following file formats
    (expected filename extension in brackets):

        * python pickle (.pkl)
        * YAML (.yaml)

    Arguments:
        load_path (str, File): path and name of the serialized on-disk
                               file to load (or an already loaded file
                               object). The type to write is inferred
                               based on filename extension. If no
                               extension given, pickle format is
                               attempted.
        verbose (bool): whether to log the file being deserialized.
                        Defaults to True.

    Returns:
        object: Converted in-memory python data structure.

    See Also:
        serialize
    """
    # the python 2 'file' builtin no longer exists; duck-type file
    # objects via a 'read' attribute instead
    if not hasattr(load_path, 'read'):
        # 'rb' is required for pickle; yaml.safe_load also accepts
        # binary streams
        load_path = open(
            os.path.expandvars(os.path.expanduser(load_path)), 'rb')
    fname = load_path.name
    if verbose:
        # logger.warn is a deprecated alias for logger.warning
        logger.warning("deserializing object from: %s", fname)
    if (fname.lower().endswith('.yaml') or fname.lower().endswith('.yml')):
        initialize_yaml()
        return yaml.safe_load(load_path)
    else:
        try:
            return pickle.load(load_path)
        except AttributeError:
            msg = ("Problems deserializing: %s. Its possible the interface "
                   "for this object has changed since being serialized. You "
                   "may need to remove and recreate it." % load_path)
            logger.error(msg)
            raise AttributeError(msg)
def convert_file(iopair, keylist):
    """
    Function for converting from an imageset batch cpickle file into a
    flat binary with a choice of keys.

    Input file is cpickled dict with the following fields:
        dict['data']:   list of jpeg strings
        dict['labels']: dict of integer lists, default is 'l_id' for the
                        category label of the corresponding jpeg.

    The following condition should be true (a label for each jpeg)
        len(dict['data']) == len(dict['labels']['l_id'])

    Arguments:
        iopair(tuple): Names of input and output files.
        keylist(list): A list of keys to be used in the flat binary file.
    """
    ifname, ofname = iopair
    with open(ifname, 'rb') as ifp:
        neon_logger.display("Converting {}".format(ifname))
        tdata = pickle.load(ifp)
        jpegs = tdata['data']
        labels = tdata['labels']
        num_imgs = len(jpegs)
        with open(ofname, 'wb') as f:
            # header: image count, then number of label keys (uint32 each)
            f.write(struct.pack('I', num_imgs))
            f.write(struct.pack('I', len(keylist)))
            for key in keylist:
                # per-key record: length-prefixed key bytes, followed by
                # one uint32 label per image for that key
                ksz = len(key)
                # NOTE(review): bytearray(key) requires key to be bytes
                # (or a py2 str); a py3 unicode str would raise -- confirm
                f.write(struct.pack('L' + 'B' * ksz, ksz, *bytearray(key)))
                f.write(struct.pack('I' * num_imgs, *labels[key]))
            for i in range(num_imgs):
                # per-image record: uint32 jpeg length, then the raw bytes
                jsz = len(jpegs[i])
                bin = struct.pack('I' + 'B' * jsz, jsz, *bytearray(jpegs[i]))
                f.write(bin)
def deserialize(self, load_path):
    """
    Load a pickled object from disk.

    Arguments:
        load_path (str): path to the pickle file to read.

    Returns:
        object: the unpickled python object.
    """
    # pickle streams are binary: 'rb' is required on python 3, and the
    # context manager guarantees the handle is closed even on error
    with open(load_path, 'rb') as fd:
        return pickle.load(fd)
# training hyperparameters
gradient_limit = 5
clip_gradients = True
num_epochs = args.epochs
embedding_update = True

# setup backend
be = gen_backend(**extract_valid_args(args, gen_backend))

# get the preprocessed and tokenized data
fname_h5, fname_vocab = build_data_train(filepath=args.review_file,
                                         vocab_file=args.vocab_file,
                                         skip_headers=True)

# play around with google-news word vectors for init
if args.use_w2v:
    w2v_file = args.w2v
    vocab, rev_vocab = pickle.load(open(fname_vocab, "rb"))
    init_emb_np, embedding_dim, _ = get_google_word2vec_W(
        w2v_file, vocab, vocab_size=vocab_size, index_from=3)
    neon_logger.display(
        "Done loading the Word2Vec vectors: embedding size - {}".format(embedding_dim))
    embedding_update = True
    init_emb = Array(val=be.array(init_emb_np))
else:
    # NOTE(review): embedding_dim is only assigned above inside the w2v
    # branch; it must be defined earlier in the script for this path to
    # work -- confirm
    init_emb = Uniform(-0.1 / embedding_dim, 0.1 / embedding_dim)

# open the preprocessed review data and its train/valid splits
h5f = h5py.File(fname_h5, "r")
reviews, h5train, h5valid = h5f["reviews"], h5f["train"], h5f["valid"]
ntrain, nvalid, nclass = reviews.attrs["ntrain"], reviews.attrs["nvalid"], reviews.attrs["nclass"]

# make train dataset
Xy = h5train[:ntrain]
# training hyperparameters
clip_gradients = True
num_epochs = args.epochs
embedding_update = True

# setup backend
be = gen_backend(**extract_valid_args(args, gen_backend))

# get the preprocessed and tokenized data
fname_h5, fname_vocab = build_data_train(filepath=args.review_file,
                                         vocab_file=args.vocab_file,
                                         skip_headers=True)

# play around with google-news word vectors for init
if args.use_w2v:
    w2v_file = args.w2v
    vocab, rev_vocab = pickle.load(open(fname_vocab, 'rb'))
    init_emb_np, embedding_dim, _ = get_google_word2vec_W(
        w2v_file, vocab, vocab_size=vocab_size, index_from=3)
    neon_logger.display(
        "Done loading the Word2Vec vectors: embedding size - {}".format(embedding_dim))
    embedding_update = True
    init_emb = Array(val=be.array(init_emb_np))
else:
    # NOTE(review): embedding_dim is only assigned above inside the w2v
    # branch; it must be defined earlier in the script for this path to
    # work -- confirm
    init_emb = Uniform(-0.1 / embedding_dim, 0.1 / embedding_dim)

# open the preprocessed review data and its train/valid splits
h5f = h5py.File(fname_h5, 'r')
reviews, h5train, h5valid = h5f['reviews'], h5f['train'], h5f['valid']
ntrain, nvalid, nclass = reviews.attrs[
    'ntrain'], reviews.attrs['nvalid'], reviews.attrs['nclass']
# load the weights print("Initialized the models - ") model_new = Model(layers=layers) print("Loading the weights from {0}".format(args.model_weights)) model_new.load_params(args.model_weights) model_new.initialize(dataset=(sentence_length, batch_size)) # setup buffers before accepting reviews xdev = be.zeros((sentence_length, 1), dtype=np.int32) # bsz is 1, feature size xbuf = np.zeros((1, sentence_length), dtype=np.int32) oov = 2 start = 1 index_from = 3 pad_char = 0 vocab, rev_vocab = pickle.load(open(args.vocab_file, 'rb')) while True: line = input('Enter a Review from testData.tsv file \n') # clean the input tokens = clean_string(line).strip().split() # check for oov and add start sent = [len(vocab) + 1 if t not in vocab else vocab[t] for t in tokens] sent = [start] + [w + index_from for w in sent] sent = [oov if w >= vocab_size else w for w in sent] # pad sentences xbuf[:] = 0 trunc = sent[-sentence_length:]
# training hyperparameters
clip_gradients = True
num_epochs = args.epochs
embedding_update = True

# setup backend
be = gen_backend(**extract_valid_args(args, gen_backend))

# get the preprocessed and tokenized data
fname_h5, fname_vocab = build_data_train(filepath=args.review_file,
                                         vocab_file=args.vocab_file,
                                         skip_headers=True)

# play around with google-news word vectors for init
if args.use_w2v:
    w2v_file = args.w2v
    vocab, rev_vocab = pickle.load(open(fname_vocab, 'rb'))
    init_emb_np, embedding_dim, _ = get_google_word2vec_W(
        w2v_file, vocab, vocab_size=vocab_size, index_from=3)
    neon_logger.display(
        "Done loading the Word2Vec vectors: embedding size - {}".format(
            embedding_dim))
    embedding_update = True
    init_emb = Array(val=be.array(init_emb_np))
else:
    # NOTE(review): embedding_dim is only assigned above inside the w2v
    # branch; it must be defined earlier in the script for this path to
    # work -- confirm
    init_emb = Uniform(-0.1 / embedding_dim, 0.1 / embedding_dim)

# open the preprocessed review data and its train/valid splits
h5f = h5py.File(fname_h5, 'r')
reviews, h5train, h5valid = h5f['reviews'], h5f['train'], h5f['valid']
ntrain, nvalid, nclass = reviews.attrs['ntrain'], reviews.attrs[
    'nvalid'], reviews.attrs['nclass']
def voc_eval(detpath, annopath, imagesetfile, classname, cachedir,
             ovthresh=0.5, use_07_metric=False):
    """rec, prec, ap = voc_eval(detpath, annopath, imagesetfile, classname,
                                [ovthresh], [use_07_metric])

    Top level function that does the PASCAL VOC evaluation.

    detpath: Path to detections
        detpath.format(classname) should produce the detection results file.
    annopath: Path to annotations
        annopath.format(imagename) should be the xml annotations file.
    imagesetfile: Text file containing the list of images, one image per
        line.
    classname: Category name (duh)
    cachedir: Directory for caching the annotations
    [ovthresh]: Overlap threshold (default = 0.5)
    [use_07_metric]: Whether to use VOC07's 11 point AP computation
        (default False)
    """
    # assumes detections are in detpath.format(classname)
    # assumes annotations are in annopath.format(imagename)
    # assumes imagesetfile is a text file with each line an image name
    # cachedir caches the annotations in a pickle file

    # first load gt
    if not os.path.isdir(cachedir):
        os.mkdir(cachedir)
    cachefile = os.path.join(cachedir, 'annots.pkl')
    # read list of images; text mode ('r', not 'rb') so the names come
    # back as str -- bytes would not format correctly into annopath on
    # python 3
    with open(imagesetfile, 'r') as f:
        lines = f.readlines()
    imagenames = [x.strip() for x in lines]

    if not os.path.isfile(cachefile):
        # load annots
        recs = {}
        for i, imagename in enumerate(imagenames):
            recs[imagename] = parse_rec(annopath.format(imagename))
            if i % 100 == 0:
                neon_logger.display('Reading annotation for {:d}/{:d}'.format(
                    i + 1, len(imagenames)))
        # save
        neon_logger.display(
            'Saving cached annotations to {:s}'.format(cachefile))
        with open(cachefile, 'wb') as f:
            pickle.dump(recs, f, 2)
    else:
        # load
        with open(cachefile, 'rb') as f:
            recs = pickle.load(f)

    # extract gt objects for this class
    class_recs = {}
    npos = 0
    for imagename in imagenames:
        R = [obj for obj in recs[imagename] if obj['name'] == classname]
        bbox = np.array([x['bbox'] for x in R])
        # np.bool was removed in numpy >= 1.24; the builtin bool is the
        # documented replacement and behaves identically here
        difficult = np.array([x['difficult'] for x in R]).astype(bool)
        det = [False] * len(R)
        # only non-difficult ground-truth boxes count toward recall
        npos = npos + sum(~difficult)
        class_recs[imagename] = {'bbox': bbox,
                                 'difficult': difficult,
                                 'det': det}

    # read dets; text mode for the whitespace-separated detection file
    detfile = detpath.format(classname)
    with open(detfile, 'r') as f:
        lines = f.readlines()
    splitlines = [x.strip().split(' ') for x in lines]
    image_ids = [x[0] for x in splitlines]
    confidence = np.array([float(x[1]) for x in splitlines])
    BB = np.array([[float(z) for z in x[2:]] for x in splitlines])

    # sort by confidence (descending)
    sorted_ind = np.argsort(-confidence)
    BB = BB[sorted_ind, :]
    image_ids = [image_ids[x] for x in sorted_ind]

    # go down dets and mark TPs and FPs
    nd = len(image_ids)
    tp = np.zeros(nd)
    fp = np.zeros(nd)
    for d in range(nd):
        R = class_recs[image_ids[d]]
        bb = BB[d, :].astype(float)
        ovmax = -np.inf
        BBGT = R['bbox'].astype(float)

        if BBGT.size > 0:
            # compute overlaps
            # intersection
            ixmin = np.maximum(BBGT[:, 0], bb[0])
            iymin = np.maximum(BBGT[:, 1], bb[1])
            ixmax = np.minimum(BBGT[:, 2], bb[2])
            iymax = np.minimum(BBGT[:, 3], bb[3])
            iw = np.maximum(ixmax - ixmin + 1., 0.)
            ih = np.maximum(iymax - iymin + 1., 0.)
            inters = iw * ih
            # union
            uni = ((bb[2] - bb[0] + 1.) * (bb[3] - bb[1] + 1.) +
                   (BBGT[:, 2] - BBGT[:, 0] + 1.) *
                   (BBGT[:, 3] - BBGT[:, 1] + 1.) - inters)
            overlaps = inters / uni
            ovmax = np.max(overlaps)
            jmax = np.argmax(overlaps)

        if ovmax > ovthresh:
            if not R['difficult'][jmax]:
                if not R['det'][jmax]:
                    # first match of this GT box: true positive
                    tp[d] = 1.
                    R['det'][jmax] = 1
                else:
                    # GT box already matched: duplicate detection
                    fp[d] = 1.
        else:
            fp[d] = 1.

    # compute precision recall
    fp = np.cumsum(fp)
    tp = np.cumsum(tp)
    rec = tp / float(npos)
    # small epsilon avoids divide by zero when tp + fp == 0
    prec = tp / (tp + fp + 1e-10)
    ap = voc_ap(rec, prec, use_07_metric)

    return rec, prec, ap
def voc_eval(detpath, annopath, imagesetfile, classname, cachedir,
             ovthresh=0.5, use_07_metric=False):
    """rec, prec, ap = voc_eval(detpath, annopath, imagesetfile, classname,
                                [ovthresh], [use_07_metric])

    Top level function that does the PASCAL VOC evaluation.

    detpath: Path to detections
        detpath.format(classname) should produce the detection results file.
    annopath: Path to annotations
        annopath.format(imagename) should be the xml annotations file.
    imagesetfile: Text file containing the list of images, one image per
        line.
    classname: Category name (duh)
    cachedir: Directory for caching the annotations
    [ovthresh]: Overlap threshold (default = 0.5)
    [use_07_metric]: Whether to use VOC07's 11 point AP computation
        (default False)
    """
    # assumes detections are in detpath.format(classname)
    # assumes annotations are in annopath.format(imagename)
    # assumes imagesetfile is a text file with each line an image name
    # cachedir caches the annotations in a pickle file

    # first load gt
    if not os.path.isdir(cachedir):
        os.mkdir(cachedir)
    cachefile = os.path.join(cachedir, 'annots.pkl')
    # read list of images; text mode ('r', not 'rb') so the names come
    # back as str -- bytes would not format correctly into annopath on
    # python 3
    with open(imagesetfile, 'r') as f:
        lines = f.readlines()
    imagenames = [x.strip() for x in lines]

    if not os.path.isfile(cachefile):
        # load annots
        recs = {}
        for i, imagename in enumerate(imagenames):
            recs[imagename] = parse_rec(annopath.format(imagename))
            if i % 100 == 0:
                neon_logger.display('Reading annotation for {:d}/{:d}'.format(
                    i + 1, len(imagenames)))
        # save
        neon_logger.display(
            'Saving cached annotations to {:s}'.format(cachefile))
        with open(cachefile, 'wb') as f:
            pickle.dump(recs, f, 2)
    else:
        # load
        with open(cachefile, 'rb') as f:
            recs = pickle.load(f)

    # extract gt objects for this class
    class_recs = {}
    npos = 0
    for imagename in imagenames:
        R = [obj for obj in recs[imagename] if obj['name'] == classname]
        bbox = np.array([x['bbox'] for x in R])
        # np.bool was removed in numpy >= 1.24; the builtin bool is the
        # documented replacement and behaves identically here
        difficult = np.array([x['difficult'] for x in R]).astype(bool)
        det = [False] * len(R)
        # only non-difficult ground-truth boxes count toward recall
        npos = npos + sum(~difficult)
        class_recs[imagename] = {'bbox': bbox,
                                 'difficult': difficult,
                                 'det': det}

    # read dets; text mode for the whitespace-separated detection file
    detfile = detpath.format(classname)
    with open(detfile, 'r') as f:
        lines = f.readlines()
    splitlines = [x.strip().split(' ') for x in lines]
    image_ids = [x[0] for x in splitlines]
    confidence = np.array([float(x[1]) for x in splitlines])
    BB = np.array([[float(z) for z in x[2:]] for x in splitlines])

    # sort by confidence (descending)
    sorted_ind = np.argsort(-confidence)
    BB = BB[sorted_ind, :]
    image_ids = [image_ids[x] for x in sorted_ind]

    # go down dets and mark TPs and FPs
    nd = len(image_ids)
    tp = np.zeros(nd)
    fp = np.zeros(nd)
    for d in range(nd):
        R = class_recs[image_ids[d]]
        bb = BB[d, :].astype(float)
        ovmax = -np.inf
        BBGT = R['bbox'].astype(float)

        if BBGT.size > 0:
            # compute overlaps
            # intersection
            ixmin = np.maximum(BBGT[:, 0], bb[0])
            iymin = np.maximum(BBGT[:, 1], bb[1])
            ixmax = np.minimum(BBGT[:, 2], bb[2])
            iymax = np.minimum(BBGT[:, 3], bb[3])
            iw = np.maximum(ixmax - ixmin + 1., 0.)
            ih = np.maximum(iymax - iymin + 1., 0.)
            inters = iw * ih
            # union
            uni = ((bb[2] - bb[0] + 1.) * (bb[3] - bb[1] + 1.) +
                   (BBGT[:, 2] - BBGT[:, 0] + 1.) *
                   (BBGT[:, 3] - BBGT[:, 1] + 1.) - inters)
            overlaps = inters / uni
            ovmax = np.max(overlaps)
            jmax = np.argmax(overlaps)

        if ovmax > ovthresh:
            if not R['difficult'][jmax]:
                if not R['det'][jmax]:
                    # first match of this GT box: true positive
                    tp[d] = 1.
                    R['det'][jmax] = 1
                else:
                    # GT box already matched: duplicate detection
                    fp[d] = 1.
        else:
            fp[d] = 1.

    # compute precision recall
    fp = np.cumsum(fp)
    tp = np.cumsum(tp)
    rec = tp / float(npos)
    # small epsilon avoids divide by zero when tp + fp == 0
    prec = tp / (tp + fp + 1e-10)
    ap = voc_ap(rec, prec, use_07_metric)

    return rec, prec, ap