import itertools
import os
from collections import namedtuple

import numpy as np
import pandas as pd
from keras.models import load_model

# Project-local dependencies assumed from the surrounding repo (import paths
# not shown here): create_images_df, map_categories, MemmapIterator,
# MultiMemmapIterator, and the constants LOAD_MODEL and
# MAX_PREDICTIONS_AT_TIME.


def create_train_df(prod_info, category_idx, shuffle=None):
    images_df = create_images_df(prod_info, False)
    # Map raw category_id values to contiguous class indices.
    prod_info['category_idx'] = map_categories(category_idx,
                                               prod_info['category_id'])
    images_df = images_df.merge(prod_info, on='product_id', how='left')[[
        'product_id', 'category_idx', 'img_idx', 'num_imgs'
    ]]
    if shuffle:
        # `shuffle` doubles as the RNG seed, so the permutation is
        # reproducible for a given value.
        np.random.seed(shuffle)
        perm = np.random.permutation(images_df.shape[0])
        images_df = images_df.reindex(perm)
        images_df.reset_index(drop=True, inplace=True)
    return images_df
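
# --- Illustrative sketch (not part of the original module) ------------------
# `map_categories` is defined elsewhere in the repo; the sketch below shows
# the behaviour this module appears to rely on, assuming `category_idx` is a
# DataFrame with 'category_id' and 'category_idx' columns. The name
# map_categories_sketch is hypothetical.
def map_categories_sketch(category_idx, category_ids):
    # Translate raw category_id values into contiguous class indices.
    lookup = category_idx.set_index('category_id')['category_idx']
    return category_ids.map(lookup).values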
def train_data(memmap_path, memmap_len, prod_info, sample_prod_info,
               train_split, category_idx, batch_size, shuffle=None,
               batch_seed=123, use_side_input=False):
    images_df = create_images_df(prod_info, False)
    prod_info['category_idx'] = map_categories(category_idx,
                                               prod_info['category_id'])
    # `train_split` carries a boolean 'train' flag per product_id.
    prod_info = prod_info.merge(train_split, on='product_id', how='left')
    images_df = images_df.merge(prod_info, on='product_id', how='left')[[
        'product_id', 'category_idx', 'img_idx', 'num_imgs', 'train'
    ]]
    if shuffle:
        np.random.seed(shuffle)
        perm = np.random.permutation(images_df.shape[0])
        images_df = images_df.reindex(perm)
        images_df.reset_index(drop=True, inplace=True)
    images_df = images_df[images_df.product_id.isin(
        sample_prod_info.product_id)]
    train_df = images_df[images_df['train']]
    valid_df = images_df[~images_df['train']]
    num_classes = np.unique(images_df['category_idx']).size
    # Note: 'pool_wrokers' matches the (misspelled) keyword that
    # MemmapIterator actually exposes, so it is kept as-is.
    train_it = MemmapIterator(memmap_path=memmap_path,
                              memmap_shape=(memmap_len, 512, 2, 2),
                              images_df=train_df,
                              num_classes=num_classes,
                              seed=batch_seed,
                              batch_size=batch_size,
                              pool_wrokers=4,
                              use_side_input=use_side_input,
                              shuffle=True)
    valid_it = MemmapIterator(memmap_path=memmap_path,
                              memmap_shape=(memmap_len, 512, 2, 2),
                              images_df=valid_df,
                              num_classes=num_classes,
                              seed=batch_seed,
                              batch_size=batch_size,
                              pool_wrokers=4,
                              use_side_input=use_side_input,
                              shuffle=False)
    return train_it, valid_it, num_classes
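
# --- Illustrative sketch (not part of the original module) ------------------
# `train_data` expects `train_split` to map each product_id to a boolean
# 'train' flag. A minimal way to build such a table, assuming a 90/10
# train/validation ratio (the ratio and helper name are assumptions, not from
# the source):
def make_train_split_sketch(prod_info, valid_frac=0.1, seed=123):
    rng = np.random.RandomState(seed)
    product_ids = prod_info['product_id'].unique()
    # Mark ~(1 - valid_frac) of products as training data.
    train_flags = rng.rand(product_ids.size) >= valid_frac
    return pd.DataFrame({'product_id': product_ids, 'train': train_flags})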
def predict(memmap_path, memmap_len, prod_info, sample_prod_info, models_dir,
            use_side_input=False, batch_size=200, shuffle=None, top_k=10):
    model_file = os.path.join(models_dir, LOAD_MODEL)
    if os.path.exists(model_file):
        model = load_model(model_file)
    else:
        raise ValueError("Model doesn't exist")
    images_df = create_images_df(prod_info, False)
    images_df = images_df.merge(prod_info, on='product_id', how='left')[[
        'product_id', 'img_idx', 'num_imgs'
    ]]
    if shuffle:
        np.random.seed(shuffle)
        perm = np.random.permutation(images_df.shape[0])
        images_df = images_df.reindex(perm)
        images_df.reset_index(drop=True, inplace=True)
    if sample_prod_info is not None:
        images_df = images_df[images_df.product_id.isin(
            sample_prod_info.product_id)]
    images_df.sort_values('product_id', inplace=True)
    dfs = []
    offset = 0
    while offset < images_df.shape[0]:
        # Start a few rows short of the cap, then extend the chunk boundary
        # so that all images of a product land in the same chunk.
        end_idx = min(images_df.shape[0],
                      offset + MAX_PREDICTIONS_AT_TIME - 5)
        while end_idx < images_df.shape[0]:
            if images_df.iloc[end_idx - 1].product_id == \
                    images_df.iloc[end_idx].product_id:
                end_idx += 1
            else:
                break
        it = MemmapIterator(memmap_path=memmap_path,
                            memmap_shape=(memmap_len, 512, 2, 2),
                            images_df=images_df.iloc[offset:end_idx],
                            batch_size=batch_size,
                            pool_wrokers=1,
                            use_side_input=use_side_input,
                            shuffle=False)
        # Steps must be an integer; round up so the final partial batch is
        # still predicted.
        preds = model.predict_generator(it,
                                        int(np.ceil(it.samples / batch_size)),
                                        verbose=1,
                                        max_queue_size=10)
        it.terminate()
        del it
        product_start = 0
        prev_product_id = 0
        chunk = []
        # The trailing namedtuple is a sentinel row that flushes the last
        # product group out of the loop.
        for i, row in enumerate(
                itertools.chain(
                    images_df.iloc[offset:(offset + preds.shape[0])]
                    .itertuples(),
                    [namedtuple('Pandas', ['product_id', 'img_idx'])(1, 0)])):
            if prev_product_id != 0 and prev_product_id != row.product_id:
                # Pool per-image predictions by multiplying class
                # probabilities across the product's images, then
                # renormalize into a distribution.
                prods = preds[product_start:i].prod(axis=-2)
                prods = prods / prods.sum()
                top_k_preds = np.argpartition(prods, -top_k)[-top_k:]
                for pred_idx in range(top_k):
                    chunk.append((prev_product_id, 0, top_k_preds[pred_idx],
                                  prods[top_k_preds[pred_idx]]))
                product_start = i
            prev_product_id = row.product_id
        chunk_df = pd.DataFrame(
            chunk, columns=['product_id', 'img_idx', 'category_idx', 'prob'])
        dfs.append(chunk_df)
        offset += preds.shape[0]
        del preds
        del chunk
    return pd.concat(dfs)
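
# --- Illustrative sketch (not part of the original module) ------------------
# The per-product pooling above multiplies class probabilities across a
# product's images and renormalizes. A small self-contained example of the
# same arithmetic:
def _pooling_demo():
    per_image = np.array([[0.6, 0.4],    # image 1: class probabilities
                          [0.7, 0.3]])   # image 2: class probabilities
    pooled = per_image.prod(axis=0)      # [0.42, 0.12]
    pooled = pooled / pooled.sum()       # ~[0.778, 0.222] after renormalizing
    return pooled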
def predict(memmap_path, memmap_len, prod_info, sample_prod_info, models_dir,
            batch_size=200, shuffle=None, top_k=10, use_img_idx=True):
    # Variant of `predict` for the product-level model fed by
    # MultiMemmapIterator; since it shares a name with the function above,
    # the two presumably live in separate modules in the repo.
    model_file = os.path.join(models_dir, LOAD_MODEL)
    if os.path.exists(model_file):
        model = load_model(model_file)
    else:
        raise ValueError("Model doesn't exist")
    images_df = create_images_df(prod_info, False)
    images_df = images_df.merge(prod_info, on='product_id', how='left')[[
        'product_id', 'img_idx', 'num_imgs'
    ]]
    if shuffle:
        np.random.seed(shuffle)
        perm = np.random.permutation(images_df.shape[0])
        images_df = images_df.reindex(perm)
        images_df.reset_index(drop=True, inplace=True)
    if sample_prod_info is not None:
        images_df = images_df[images_df.product_id.isin(
            sample_prod_info.product_id)]
    images_df.sort_values('product_id', inplace=True)
    dfs = []
    offset = 0
    while offset < images_df.shape[0]:
        # Extend the chunk boundary so that all images of a product land in
        # the same chunk.
        end_idx = min(images_df.shape[0],
                      offset + MAX_PREDICTIONS_AT_TIME - 5)
        while end_idx < images_df.shape[0]:
            if images_df.iloc[end_idx - 1].product_id == \
                    images_df.iloc[end_idx].product_id:
                end_idx += 1
            else:
                break
        it = MultiMemmapIterator(memmap_path=memmap_path,
                                 memmap_shape=(memmap_len, 2048),
                                 images_df=images_df.iloc[offset:end_idx],
                                 batch_size=batch_size,
                                 pool_wrokers=1,
                                 only_single=False,
                                 include_singles=False,
                                 max_images=4,
                                 shuffle=False,
                                 use_side_input=use_img_idx)
        # Steps must be an integer; round up so the final partial batch is
        # still predicted.
        preds = model.predict_generator(it,
                                        int(np.ceil(it.samples / batch_size)),
                                        verbose=1,
                                        max_queue_size=10)
        it.terminate()
        del it
        chunk = []
        # Here the iterator emits one prediction per product, so rows of
        # `preds` line up with the chunk's unique product_ids.
        for i, product_id in enumerate(
                images_df.iloc[offset:end_idx].product_id.unique()):
            top_k_preds = np.argpartition(preds[i], -top_k)[-top_k:]
            for pred_idx in range(top_k):
                chunk.append((product_id, 0, top_k_preds[pred_idx],
                              preds[i, top_k_preds[pred_idx]]))
        chunk_df = pd.DataFrame(
            chunk, columns=['product_id', 'img_idx', 'category_idx', 'prob'])
        dfs.append(chunk_df)
        offset = end_idx
        del preds
        del chunk
    return pd.concat(dfs)
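
# --- Illustrative sketch (not part of the original module) ------------------
# Both predict variants use np.argpartition, which returns the top-k class
# indices in arbitrary order. If a ranked output were ever needed, the slice
# could be sorted afterwards (the helper name is hypothetical):
def _top_k_demo(probs, top_k=3):
    top = np.argpartition(probs, -top_k)[-top_k:]   # unordered top-k indices
    return top[np.argsort(probs[top])[::-1]]        # sorted, highest first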