import itertools
import os
from collections import namedtuple

import numpy as np
import pandas as pd
from keras.models import load_model

# Project-local dependencies (defined elsewhere in this repo): create_images_df,
# map_categories, MemmapIterator, MultiMemmapIterator, LOAD_MODEL,
# MAX_PREDICTIONS_AT_TIME.


def create_train_df(prod_info, category_idx, shuffle=None):
    images_df = create_images_df(prod_info, False)
    # Note: adds category_idx to the caller's prod_info in place.
    prod_info['category_idx'] = map_categories(category_idx,
                                               prod_info['category_id'])
    images_df = images_df.merge(prod_info, on='product_id', how='left')[[
        'product_id', 'category_idx', 'img_idx', 'num_imgs'
    ]]
    if shuffle:
        # `shuffle` doubles as the RNG seed, so pass a non-zero int.
        np.random.seed(shuffle)
        perm = np.random.permutation(images_df.shape[0])
        images_df = images_df.reindex(perm)
        images_df.reset_index(drop=True, inplace=True)
    return images_df
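
# Usage sketch (illustrative only; the CSV paths and the seed are assumptions,
# not part of this module):
#
#     prod_info = pd.read_csv('train_product_info.csv')
#     category_idx = pd.read_csv('category_idx.csv')
#     train_images_df = create_train_df(prod_info, category_idx, shuffle=456)
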
def train_data(memmap_path,
               memmap_len,
               prod_info,
               sample_prod_info,
               train_split,
               category_idx,
               batch_size,
               shuffle=None,
               batch_seed=123,
               use_side_input=False):
    images_df = create_images_df(prod_info, False)
    prod_info['category_idx'] = map_categories(category_idx,
                                               prod_info['category_id'])
    prod_info = prod_info.merge(train_split, on='product_id', how='left')
    images_df = images_df.merge(prod_info, on='product_id', how='left')[[
        'product_id', 'category_idx', 'img_idx', 'num_imgs', 'train'
    ]]
    if shuffle:
        np.random.seed(shuffle)
        perm = np.random.permutation(images_df.shape[0])
        images_df = images_df.reindex(perm)
        images_df.reset_index(drop=True, inplace=True)
    images_df = images_df[images_df.product_id.isin(
        sample_prod_info.product_id)]
    # train_split is expected to flag every product, so 'train' has no NaNs.
    train_df = images_df[images_df['train']]
    valid_df = images_df[~images_df['train']]
    num_classes = np.unique(images_df['category_idx']).size

    train_it = MemmapIterator(memmap_path=memmap_path,
                              memmap_shape=(memmap_len, 512, 2, 2),
                              images_df=train_df,
                              num_classes=num_classes,
                              seed=batch_seed,
                              batch_size=batch_size,
                              pool_workers=4,
                              use_side_input=use_side_input,
                              shuffle=True)
    valid_it = MemmapIterator(memmap_path=memmap_path,
                              memmap_shape=(memmap_len, 512, 2, 2),
                              images_df=valid_df,
                              num_classes=num_classes,
                              seed=batch_seed,
                              batch_size=batch_size,
                              pool_workers=4,
                              use_side_input=use_side_input,
                              shuffle=False)
    return train_it, valid_it, num_classes
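
# Wiring sketch (illustrative; `model`, the memmap path/length and the seeds
# are assumptions): the iterators plug straight into Keras generator training.
#
#     train_it, valid_it, num_classes = train_data(
#         memmap_path='vgg_features.memmap', memmap_len=1000000,
#         prod_info=prod_info, sample_prod_info=prod_info,
#         train_split=train_split, category_idx=category_idx,
#         batch_size=64, shuffle=123)
#     model.fit_generator(train_it,
#                         steps_per_epoch=int(np.ceil(train_it.samples / 64)),
#                         validation_data=valid_it,
#                         validation_steps=int(np.ceil(valid_it.samples / 64)))
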
def predict(memmap_path,
            memmap_len,
            prod_info,
            sample_prod_info,
            models_dir,
            use_side_input=False,
            batch_size=200,
            shuffle=None,
            top_k=10):
    # Per-image model: each image gets its own prediction, and per-product
    # probabilities are combined further down by multiplying across images.
    model_file = os.path.join(models_dir, LOAD_MODEL)
    if not os.path.exists(model_file):
        raise ValueError("Model doesn't exist: " + model_file)
    model = load_model(model_file)
    images_df = create_images_df(prod_info, False)
    images_df = images_df.merge(prod_info, on='product_id', how='left')[[
        'product_id', 'img_idx', 'num_imgs'
    ]]
    if shuffle:
        np.random.seed(shuffle)
        perm = np.random.permutation(images_df.shape[0])
        images_df = images_df.reindex(perm)
        images_df.reset_index(drop=True, inplace=True)
    if sample_prod_info is not None:
        images_df = images_df[images_df.product_id.isin(
            sample_prod_info.product_id)]
    # Keep each product's images contiguous so a chunk never splits a product.
    images_df.sort_values('product_id', inplace=True)
    dfs = []
    offset = 0
    while offset < images_df.shape[0]:
        # Leave a little headroom below the cap, then extend the chunk to the
        # next product boundary so one product never spans two chunks.
        end_idx = min(images_df.shape[0], offset + MAX_PREDICTIONS_AT_TIME - 5)
        while (end_idx < images_df.shape[0]
               and images_df.iloc[end_idx - 1].product_id
               == images_df.iloc[end_idx].product_id):
            end_idx += 1
        it = MemmapIterator(memmap_path=memmap_path,
                            memmap_shape=(memmap_len, 512, 2, 2),
                            images_df=images_df[offset:end_idx],
                            batch_size=batch_size,
                            pool_workers=1,
                            use_side_input=use_side_input,
                            shuffle=False)
        preds = model.predict_generator(it,
                                        int(np.ceil(it.samples / batch_size)),
                                        verbose=1,
                                        max_queue_size=10)
        it.terminate()
        del it
        product_start = 0
        prev_product_id = 0
        chunk = []
        # A sentinel row is chained on so the final product is flushed too;
        # its product_id of 1 only needs to differ from the last real id.
        for i, row in enumerate(
                itertools.chain(
                    images_df[offset:(offset + preds.shape[0])].itertuples(),
                    [namedtuple('Pandas', ['product_id', 'img_idx'])(1, 0)])):
            if prev_product_id != 0 and prev_product_id != row.product_id:
                # Combine a product's images by multiplying their per-image
                # class probabilities, then renormalise.
                prods = preds[product_start:i].prod(axis=-2)
                prods = prods / prods.sum()
                # argpartition yields the top_k class indices, unordered.
                top_k_preds = np.argpartition(prods, -top_k)[-top_k:]
                for pred_idx in range(top_k):
                    chunk.append((prev_product_id, 0, top_k_preds[pred_idx],
                                  prods[top_k_preds[pred_idx]]))
                product_start = i
            prev_product_id = row.product_id
        chunk_df = pd.DataFrame(
            chunk, columns=['product_id', 'img_idx', 'category_idx', 'prob'])
        dfs.append(chunk_df)
        offset += preds.shape[0]
        del preds
        del chunk
    return pd.concat(dfs)
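
# The aggregation above multiplies per-image class probabilities per product
# and renormalises. A standalone sketch of that step with made-up numbers:
#
#     per_image = np.array([[0.7, 0.2, 0.1],
#                           [0.6, 0.3, 0.1]])  # 2 images x 3 classes
#     combined = per_image.prod(axis=-2)       # [0.42, 0.06, 0.01]
#     combined = combined / combined.sum()     # [0.857, 0.122, 0.020]
#     combined.argmax()                        # -> class 0 wins
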
def predict_multi(memmap_path,
                  memmap_len,
                  prod_info,
                  sample_prod_info,
                  models_dir,
                  batch_size=200,
                  shuffle=None,
                  top_k=10,
                  use_img_idx=True):
    # Variant of predict() for a model that consumes all of a product's
    # images at once (MultiMemmapIterator yields one sample per product),
    # so no per-image aggregation is needed afterwards.
    model_file = os.path.join(models_dir, LOAD_MODEL)
    if not os.path.exists(model_file):
        raise ValueError("Model doesn't exist: " + model_file)
    model = load_model(model_file)
    images_df = create_images_df(prod_info, False)
    images_df = images_df.merge(prod_info, on='product_id', how='left')[[
        'product_id', 'img_idx', 'num_imgs'
    ]]
    if shuffle:
        np.random.seed(shuffle)
        perm = np.random.permutation(images_df.shape[0])
        images_df = images_df.reindex(perm)
        images_df.reset_index(drop=True, inplace=True)
    if sample_prod_info is not None:
        images_df = images_df[images_df.product_id.isin(
            sample_prod_info.product_id)]
    # Keep each product's images contiguous so a chunk never splits a product.
    images_df.sort_values('product_id', inplace=True)
    dfs = []
    offset = 0
    while offset < images_df.shape[0]:
        # Leave a little headroom below the cap, then extend the chunk to the
        # next product boundary so one product never spans two chunks.
        end_idx = min(images_df.shape[0], offset + MAX_PREDICTIONS_AT_TIME - 5)
        while (end_idx < images_df.shape[0]
               and images_df.iloc[end_idx - 1].product_id
               == images_df.iloc[end_idx].product_id):
            end_idx += 1
        it = MultiMemmapIterator(memmap_path=memmap_path,
                                 memmap_shape=(memmap_len, 2048),
                                 images_df=images_df[offset:end_idx],
                                 batch_size=batch_size,
                                 pool_workers=1,
                                 only_single=False,
                                 include_singles=False,
                                 max_images=4,
                                 shuffle=False,
                                 use_side_input=use_img_idx)

        preds = model.predict_generator(it,
                                        int(np.ceil(it.samples / batch_size)),
                                        verbose=1,
                                        max_queue_size=10)
        it.terminate()
        del it
        chunk = []
        # preds already has one row per product (the iterator groups all of a
        # product's images into a single sample), so prediction rows line up
        # one-to-one with the unique product ids in this chunk.
        for i, product_id in enumerate(
                images_df[offset:end_idx].product_id.unique()):
            # argpartition yields the top_k class indices, unordered.
            top_k_preds = np.argpartition(preds[i], -top_k)[-top_k:]
            for pred_idx in range(top_k):
                chunk.append((product_id, 0, top_k_preds[pred_idx],
                              preds[i, top_k_preds[pred_idx]]))

        chunk_df = pd.DataFrame(
            chunk, columns=['product_id', 'img_idx', 'category_idx', 'prob'])
        dfs.append(chunk_df)
        offset = end_idx
        del preds
        del chunk
    return pd.concat(dfs)
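
# Post-processing sketch (illustrative): collapse the long-format output to a
# single best category per product, assuming `preds_df` came from predict_multi.
#
#     preds_df = predict_multi(...)
#     best = (preds_df.sort_values('prob', ascending=False)
#                     .groupby('product_id', as_index=False)
#                     .first())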