Example No. 1
def download_movielens():
    filepath = os.path.join(DATASETS_DIR, ML_20M_ALT + '.zip')
    if not glob(filepath):
        download_file(DOWNLOAD_URL[ML_20M], filepath)

    LOG.info("Extracting")
    extract_file(filepath, DATASETS_DIR)
Example No. 2
def download_file(url, filename):
    if not os.path.isdir(DATASETS_DIR):
        os.makedirs(DATASETS_DIR)

    u = urllib.request.urlopen(url)
    with open(filename, 'wb') as f:
        meta = u.info()
        if meta.get_all("Content-Length"):
            file_size = int(meta.get_all("Content-Length")[0])
            pbar = tqdm(total=file_size,
                        desc=basename(normpath(filename)),
                        unit='B',
                        unit_scale=True)

            file_size_dl = 0
            block_sz = 8192
            while True:
                buff = u.read(block_sz)
                if not buff:
                    break
                pbar.update(len(buff))
                file_size_dl += len(buff)
                f.write(buff)
            pbar.close()
        else:
            LOG.warning("No content length information")
            file_size_dl = 0
            block_sz = 8192
            for cyc in itertools.cycle('/-\\|'):
                buff = u.read(block_sz)
                if not buff:
                    break
                print(cyc, end='\r')
                file_size_dl += len(buff)
                f.write(buff)
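For reference, a hedged usage sketch of this helper; DOWNLOAD_URL, ML_20M and DATASETS_DIR are module constants assumed to be defined elsewhere in the project, as the other examples on this page suggest.

import os

# Hypothetical call: fetch the ML-20M archive into the datasets
# directory, with a tqdm progress bar whenever the server reports a
# Content-Length header.
download_file(DOWNLOAD_URL[ML_20M],
              os.path.join(DATASETS_DIR, ML_20M + '.zip'))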
Example No. 3
    def save_response_content(response, filename):
        CHUNK_SIZE = 8192

        if 'Content-Length' in response.headers:
            filesize = int(response.headers['Content-Length'])
            pbar = tqdm(total=filesize,
                        desc=basename(normpath(filename)),
                        unit='B',
                        unit_scale=True)

            with open(filename, "wb") as f:
                for chunk in response.iter_content(CHUNK_SIZE):
                    if chunk:  # filter out keep-alive new chunks
                        f.write(chunk)
                        pbar.update(len(chunk))

            pbar.close()
        else:
            LOG.warning("No content length information")
            with open(filename, "wb") as f:
                for chunk, cyc in zip(response.iter_content(CHUNK_SIZE),
                                      itertools.cycle('/-\\|')):
                    if chunk:  # filter out keep-alive new chunks
                        f.write(chunk)
                        print(cyc, end='\r')
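download_file_from_google_drive, which this helper serves, is called in Example No. 5 but not shown on this page. A minimal sketch, assuming the requests library and the common Google Drive confirm-token flow; the project's real implementation may differ, and save_response_content (Example No. 3) would be nested inside it, as its indentation suggests.

import requests

def download_file_from_google_drive(file_id, filename):
    # Hypothetical reconstruction of the enclosing function. The
    # confirm-token handling below is the widely used pattern for
    # large Google Drive files, assumed rather than copied from
    # this project.
    URL = "https://docs.google.com/uc?export=download"
    session = requests.Session()
    response = session.get(URL, params={'id': file_id}, stream=True)

    # Large files trigger a virus-scan interstitial; the token needed
    # to skip it arrives in a cookie and must be echoed back.
    token = None
    for key, value in response.cookies.items():
        if key.startswith('download_warning'):
            token = value
    if token:
        response = session.get(URL,
                               params={'id': file_id, 'confirm': token},
                               stream=True)

    save_response_content(response, filename)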
Example No. 4
def parse_movielens(threshold=4, **kwargs):
    if os.path.isfile(BIN_DATA[ML_20M_ALT]):
        LOG.info("Already processed, skipping.")
        return

    source_file = os.path.join(DOWNLOAD[ML_20M_ALT], "ratings.csv")
    if not glob(source_file):
        download_movielens()

    LOG.info("Parsing movielens.")
    df = pd.read_csv(source_file)
    df.drop('timestamp', axis=1, inplace=True)
    df["rating"] = make_feedback_implicit(df["rating"], threshold)

    map_user_id = {u: i for i, u in enumerate(df.userId.unique())}
    map_movie_id = {m: i for i, m in enumerate(df.movieId.unique())}

    m_sp = sp.csr_matrix(
        (df.rating,
         ([map_user_id[u] for u in df.userId],
          [map_movie_id[m] for m in df.movieId])),
        shape=(len(map_user_id), len(map_movie_id))
    )

    m_sp.eliminate_zeros()
    save_as_npz(m_sp, BIN_DATA[ML_20M_ALT])
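make_feedback_implicit is called here and in Example No. 13 but is not among the examples on this page. A plausible reconstruction, assuming it simply binarizes explicit ratings at the given threshold; the resulting zeros are then dropped by eliminate_zeros().

def make_feedback_implicit(ratings, threshold):
    # Hypothetical helper: 1 for ratings at or above the threshold,
    # 0 otherwise. float() also tolerates the string ratings produced
    # by the Netflix parser in Example No. 13.
    return [1 if float(rating) >= threshold else 0 for rating in ratings]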
Example No. 5
def download_pinterest():
    filepath = os.path.join(DATASETS_DIR, PINTEREST + '.zip')
    if not glob(filepath):
        download_file_from_google_drive(DOWNLOAD_GOOGLE_DRIVE_ID[PINTEREST],
                                        filepath)
    LOG.info("Extracting")
    extract_file(filepath, DATASETS_DIR)
    os.rename(os.path.join(DATASETS_DIR, 'pinterest_iccv'),
              DOWNLOAD[PINTEREST])
Example No. 6
def download_lastfm():
    filepath = os.path.join(DATASETS_DIR, 'lastfm-dataset-360K.tar.gz')
    if not glob(filepath):
        download_file(DOWNLOAD_URL[LASTFM], filepath)

    LOG.info("Extracting")
    extract_file(filepath, DATASETS_DIR)
    os.rename(os.path.join(DATASETS_DIR, 'lastfm-dataset-360K'),
              os.path.join(DATASETS_DIR, 'lastfm'))
Example No. 7
def download_netflix():
    filepath = os.path.join(DATASETS_DIR, NETFLIX + '.tar.gz')
    if not glob(filepath):
        download_file(DOWNLOAD_URL[NETFLIX], filepath)

    LOG.info("Extracting 1/2")
    extract_file(filepath, tempfile.gettempdir())
    LOG.info("Extracting 2/2")
    extract_file(
        os.path.join(tempfile.gettempdir(), 'download', 'training_set.tar'),
        DATASETS_DIR)
    os.rename(os.path.join(DATASETS_DIR, 'training_set'), DOWNLOAD[NETFLIX])
Example No. 8
    def train(
        self,
        n_epochs: int,
        train_data: sparse.csr_matrix,
        validation_data_input: sparse.csr_matrix,
        validation_data_true: sparse.csr_matrix,
        batch_size_train: int,
        batch_size_validation: int,
        metrics: dict,  # Dict[str, matrix -> matrix -> float]
        validation_step: int = 10,
    ):
        """
        Train the model.
        :param n_epochs: number of epochs
        :param train_data: train matrix of shape users count x items count
        :param validation_data_input: validation input matrix
        :param validation_data_true: validation ground-truth matrix
        :param batch_size_train: batch size used for training
        :param batch_size_validation: batch size used for validation
        :param metrics: dictionary of metric names to metric functions
        :param validation_step: if set to n, validation runs once every n epochs
        """

        self.metrics_history = defaultdict(list)
        self.time_elapsed_training_history = []
        self.time_elapsed_validation_history = []

        self.session.run(self.iter.initializer)
        for epoch in range(1, n_epochs + 1):

            self.log_which_epoch(epoch, n_epochs)
            init_time = time.time()

            for _ in range(self.n_batch_per_train):
                self.session.run(self.optimizer)

            training_duration = time.time() - init_time
            self.time_elapsed_training_history.append(training_duration)
            LOG.info("Train time:\t{}".format(training_duration))

            if epoch % validation_step == 0 or epoch == n_epochs:
                init_time = time.time()
                metrics_scores = self.test(validation_data_input,
                                           validation_data_true, metrics)

                for name, score in metrics_scores.items():
                    self.metrics_history[name].append(score)

                validation_duration = time.time() - init_time
                self.time_elapsed_validation_history.append(
                    validation_duration)
                LOG.info("Valid time:\t{}".format(validation_duration))
                self.log_metrics(epoch, metrics_scores, n_epochs)

        self.log_training_time()
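A hedged usage sketch of this method; model, the four matrices and ndcg_at_100 stand in for objects built elsewhere in the project, and the numbers are illustrative only.

model.train(
    n_epochs=200,
    train_data=train_matrix,
    validation_data_input=valid_in,
    validation_data_true=valid_out,
    batch_size_train=500,
    batch_size_validation=500,
    metrics={'NDCG@100': ndcg_at_100},  # name -> (input, true) -> float
    validation_step=10,  # validate every 10 epochs
)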
Example No. 9
    def log_training_time(self):
        LOG.info("Total elapsed train time: {}".format(
            np.sum(self.time_elapsed_training_history)))
        LOG.info("Total elapsed valid time: {}".format(
            np.sum(self.time_elapsed_validation_history)))
        LOG.info("Epoch average train time: {}".format(
            np.mean(self.time_elapsed_training_history)))
        LOG.info("Epoch average valid time: {}".format(
            np.mean(self.time_elapsed_validation_history)))
Example No. 10
def parse_pinterest(**kwargs):
    if os.path.isfile(BIN_DATA[PINTEREST]):
        LOG.info("Already processed, skipping.")
        return

    data_file = 'subset_iccv_board_pins.bson'
    source_file = os.path.join(DOWNLOAD[PINTEREST], data_file)
    if not glob(source_file):
        raise FileNotFoundError("Cannot find pinterest dataset")

    LOG.info("Parsing pinterest")

    with open(source_file, 'rb') as f:
        bsob = bson.decode_all(f.read())

    map_id_pin = dict()
    map_pin_id = dict()
    map_board_id = dict()
    map_id_board = dict()
    n_pins = 0

    board_pin_pairs = []
    for i, board in enumerate(bsob):
        map_id_board[i] = board
        map_board_id[board['board_id']] = i
        for pin in board['pins']:
            if pin not in map_pin_id:
                map_pin_id[pin] = n_pins
                map_id_pin[n_pins] = pin
                n_pins += 1
            board_pin_pairs.append(
                (map_board_id[board['board_id']], map_pin_id[pin]))
    boards = [board for (board, pin) in board_pin_pairs]
    pins = [pin for (board, pin) in board_pin_pairs]

    m_sp = sp.csr_matrix(([1] * len(boards), (boards, pins)),
                         shape=(len(map_board_id), len(map_pin_id)))

    save_as_npz(m_sp, BIN_DATA[PINTEREST])
Example No. 11
def parse_lastfm(**kwargs):
    if os.path.isfile(BIN_DATA[LASTFM]):
        LOG.info("Already processed, skipping.")
        return

    data_file = 'usersha1-artmbid-artname-plays.tsv'
    source_file = os.path.join(DOWNLOAD[LASTFM], data_file)
    if not glob(source_file):
        download_lastfm()

    LOG.info("Parsing lastfm")
    df = pd.read_csv(source_file, delimiter='\t',
                     names=["User", "Artist id", "Artist name", "Plays"],
                     dtype=str)

    artist_column = list(zip([str(i) for i in df['Artist id']],
                             [str(i) for i in df['Artist name']]))
    user_column = df['User']

    map_artist_id = {artist: i for i, artist in enumerate(sorted(set(artist_column)))}
    map_user_id = {user: i for i, user in enumerate(sorted(set(user_column)))}

    user_ids = [map_user_id[user] for user in user_column]
    artist_ids = [map_artist_id[artist] for artist in artist_column]

    m_sp = sp.csr_matrix(([1] * df.shape[0], (user_ids, artist_ids)),
                         shape=(len(map_user_id), len(map_artist_id)))
    save_as_npz(m_sp, BIN_DATA[LASTFM])
Example No. 12
def load_dataset(dataset: str, *args, **kwargs):
    """
    Generic data loader.
    :param dataset: name of dataset to be loaded
    :return: 5 csr_matrices {train, valid_in, valid_out, test_in, test_out}
    """
    assert dataset in DATASETS, "Wrong dataset name"

    if dataset == ML_20M:
        out = load_and_parse_ML_20M(DOWNLOAD[ML_20M], *args, **kwargs)
        LOG.info("Done")
        return out

    handler_map_parse = {
        NETFLIX: parse_netflix,
        ML_20M_ALT: parse_movielens,
        LASTFM: parse_lastfm,
        PINTEREST: parse_pinterest
    }
    handler_map_parse[dataset]()

    out = load_data(BIN_DATA[dataset], *args, **kwargs)
    LOG.info("Done")
    return out
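Usage follows directly from the docstring; ML_20M is one of the dataset names the examples above rely on.

# Unpack the five CSR matrices in the documented order.
train, valid_in, valid_out, test_in, test_out = load_dataset(ML_20M)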
Example No. 13
def parse_netflix(threshold=3, **kwargs):
    if os.path.isfile(BIN_DATA[NETFLIX]):
        LOG.info("Already processed, skipping.")
        return

    files = glob(os.path.join(DOWNLOAD[NETFLIX], '*'))
    if not files:
        download_netflix()

    LOG.info("Parsing netflix")
    users = get_users(files)
    map_user_id = {u: i for i, u in enumerate(users)}

    csr_rows = []
    csr_columns = []
    csr_data = []

    LOG.info("Parsing netflix, step 2/2")
    for movie_id, file_path in tqdm(enumerate(files), total=len(files)):
        df = pd.read_csv(file_path, names=['User', 'Rating', 'Date'])
        df.drop(0, inplace=True)

        df['Rating'] = make_feedback_implicit(df['Rating'], threshold)

        rows = [map_user_id[user] for user in df['User']]
        columns = [movie_id] * len(rows)
        data = list(df['Rating'])

        assert len(rows) == len(columns) == len(data)

        csr_rows += rows
        csr_columns += columns
        csr_data += data

    m_sp = sp.csr_matrix((csr_data, (csr_rows, csr_columns)),
                         shape=(len(users), len(files)))
    m_sp.eliminate_zeros()
    save_as_npz(m_sp, BIN_DATA[NETFLIX])
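get_users is not shown on this page; a plausible sketch, consistent with how the per-movie files are read above.

def get_users(files):
    # Hypothetical reconstruction: collect every user id appearing in
    # the per-movie rating files, in a deterministic order.
    users = set()
    for file_path in files:
        df = pd.read_csv(file_path, names=['User', 'Rating', 'Date'])
        df.drop(0, inplace=True)  # first row holds the "movie_id:" line
        users.update(df['User'])
    return sorted(users)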
Example No. 14
import logging

from vae.config import LOG

_log_format = logging.Formatter("[%(name)s| %(levelname)s]: %(message)s")
_log_handler = logging.StreamHandler()
_log_handler.setFormatter(_log_format)
LOG.addHandler(_log_handler)
Example No. 15
    def log_which_epoch(self, epoch, n_epochs):
        LOG.info("Epoch: {}".format(epoch))
Example No. 16
    def log_metrics(self, epoch, metrics_scores, n_epochs):
        for name, score in metrics_scores.items():
            LOG.info("Mean {}:\t{}".format(name, score))