Example #1
    def __init__(self, data: DataFrame, user_review_dict: Dict[str, DataFrame],
                 item_review_dict: Dict[str, DataFrame], config: NarreConfig):
        """
        Initialize a NarreDataset.
        :param data: the original data, with columns ["userID", "itemID", "review", "rating"]
        :param user_review_dict: reviews grouped by userID
        :param item_review_dict: reviews grouped by itemID
        :param config: the config of the NARRE model.
        """

        super().__init__()
        self.data = data
        self.user_review_dict = user_review_dict
        self.item_review_dict = item_review_dict
        self.config = config

        logger.info("Loading dataset...")

        self.user_review, self.user_id, self.item_ids_per_review = \
            self.load_user_review_data()
        self.item_review, self.item_id, self.user_ids_per_review = \
            self.load_item_review_data()

        ratings = self.data["rating"].to_list()
        self.ratings = torch.Tensor(ratings).view(-1, 1)

        logger.info("Dataset loaded.")
Example #2
def save_model(model: torch.nn.Module, train_time: time.struct_time):
    path = "model/checkpoints/%s_%s.pt" % (
        model.__class__.__name__, time.strftime("%Y%m%d%H%M%S", train_time)
    )
    path = ROOT_DIR.joinpath(path)
    torch.save(model, path)
    logger.info(f"model saved: {path}")
Example #3
def save_embedding_weights(word_vec, out_path="data/embedding_weight.pt"):
    """
    Save the weights of the pre-trained word embedding model to a file,
    so we don't have to load the full word2vec model when training our model.
    This saves RAM and model init time.
    """

    weight = torch.Tensor(word_vec.vectors)
    torch.save(weight, ROOT_DIR.joinpath(out_path))
    logger.info("Word embedding weight saved.")
Example #4
def get_word_vec(path='data/GoogleNews-vectors-negative300.bin'):
    """
    Read the pre-trained word embedding model, and add "<pad>" to it with a zero vector.
    """

    logger.info("loading word2vec model...")
    path = ROOT_DIR.joinpath(path)
    word_vec = KeyedVectors.load_word2vec_format(path, binary=True)
    word_vec.add([PAD_WORD], np.zeros([1, 300]))
    logger.critical(f"PAD_WORD_ID is {word_vec.vocab[PAD_WORD].index}.")
    logger.info("word2vec model loaded.")
    return word_vec
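
KeyedVectors.add and the .vocab mapping used here are gensim 3.x APIs. If you run this under gensim >= 4.0 (an assumption about your environment), the equivalents are:

word_vec.add_vectors([PAD_WORD], np.zeros([1, 300]))  # replaces KeyedVectors.add
pad_word_id = word_vec.key_to_index[PAD_WORD]         # replaces word_vec.vocab[PAD_WORD].index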
Example #5
def get_max_review_length(data: DataFrame, percentile: float = 0.85) -> int:
    """
    By default, we set the max review length to the 85th percentile of all review lengths.
    """

    review_lengths = data["review"] \
        .apply(lambda review: len(review.split()))
    # max_length = review_lengths.max()
    max_length = int(review_lengths.quantile(percentile,
                                             interpolation="lower"))
    logger.info(f"Max review length = {max_length}.")
    return max_length
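
A quick worked example on hypothetical data: with word counts [3, 2, 5], the 0.85 quantile with "lower" interpolation lands on 3.

import pandas

toy = pandas.DataFrame({"review": ["a b c", "a b", "a b c d e"]})
assert get_max_review_length(toy) == 3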
Example #6
def train_model(model: BaseModel, train_data: DataFrame, dev_data: DataFrame):
    model_name = model.__class__.__name__
    train_time = time.localtime()
    add_log_file(logger, "log/%s_%s.log" % (model_name, time.strftime("%Y%m%d%H%M%S", train_time)))
    logger.info("Training %s..." % model_name)

    config: BaseConfig = model.config
    logger.info(config.__dict__)
    model.to(config.device)

    opt = torch.optim.Adam(model.parameters(), config.learning_rate, weight_decay=config.l2_regularization)
    lr_s = lr_scheduler.ExponentialLR(opt, gamma=config.learning_rate_decay)
    loss = torch.nn.MSELoss()

    last_progress = 0.
    last_loss = float("inf")
    train_data_iter = get_data_loader(train_data, config)
    dev_data_iter = get_data_loader(dev_data, config)
    batches_num = math.ceil(len(train_data) / float(config.batch_size))

    while model.current_epoch < config.num_epochs:

        model.train()

        for batch_id, iter_i in enumerate(train_data_iter):
            user_review, item_review, rating = iter_i
            user_review = user_review.to(config.device)
            item_review = item_review.to(config.device)
            rating = rating.to(config.device)
            opt.zero_grad()
            predict = model(user_review, item_review)
            li = loss(predict, rating)
            li.backward()
            opt.step()

            # log progress
            current_batches = model.current_epoch * batches_num + (batch_id + 1.0)
            total_batches = config.num_epochs * batches_num
            progress = current_batches / total_batches
            if progress - last_progress > 0.001:
                logger.debug("epoch %d, batch %d, loss: %f (%.2f%%)" %
                             (model.current_epoch, batch_id, li.item(), 100.0 * progress))
                last_progress = progress

        # complete one epoch
        train_loss = eval_model(model, train_data_iter, loss)
        dev_loss = eval_model(model, dev_data_iter, loss)
        logger.info("Epoch %d complete. Total loss(train/dev)=%f/%f"
                    % (model.current_epoch, train_loss, dev_loss))

        # save best model
        if train_loss < last_loss:
            last_loss = train_loss
            save_model(model, train_time)

        lr_s.step(model.current_epoch)
        model.current_epoch += 1

    logger.info("%s trained!" % model_name)
    remove_log_file(logger)
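
Two design notes on this version: the checkpoint is selected by training loss, while the dev loss is computed but only logged; and passing an epoch number to lr_s.step() has been deprecated since PyTorch 1.4, where a bare lr_s.step() once per epoch is the equivalent.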
Example #7
def get_max_review_count(data: DataFrame, percentile: float = 0.85):
    """
    By default, we set the max review count to the 85th percentile of the per-user and per-item review counts.
    """

    review_count_user = data["review"].groupby([data["userID"]]).count()
    review_count_user = int(
        review_count_user.quantile(percentile, interpolation="lower"))

    review_count_item = data["review"].groupby([data["itemID"]]).count()
    review_count_item = int(
        review_count_item.quantile(percentile, interpolation="lower"))

    max_count = max(review_count_item, review_count_user)
    logger.info(f"Max review count = {max_count}.")
    return max_count
Example #8
def get_data_loader(data: DataFrame, config: BaseConfig):
    logger.info("Generating data iter...")
    review_by_user, review_by_item = get_review_dict()

    user_reviews = [torch.LongTensor(load_reviews(review_by_user, userID, itemID, config.max_review_length))
                    for userID, itemID in zip(data["userID"], data["itemID"])]
    user_reviews = torch.stack(user_reviews)

    item_reviews = [torch.LongTensor(load_reviews(review_by_item, itemID, userID, config.max_review_length))
                    for userID, itemID in zip(data["userID"], data["itemID"])]
    item_reviews = torch.stack(item_reviews)

    ratings = torch.Tensor(data["rating"].to_list()).view(-1, 1)

    dataset = torch.utils.data.TensorDataset(user_reviews, item_reviews, ratings)
    pin_memory = config.device not in ["cpu", "CPU"]
    data_iter = torch.utils.data.DataLoader(dataset, batch_size=config.batch_size, shuffle=True, pin_memory=pin_memory)
    logger.info("Data iter loaded.")
    return data_iter
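
Design note: pin_memory places batches in page-locked host memory so host-to-GPU copies are faster; it is switched off here when the target device is the CPU, where pinning buys nothing.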
Example #9
def process_raw_data(in_path="data/Digital_Music_5.json",
                     out_path="data/reviews.json"):
    """
    Read the raw data, drop unused columns, and clean the review text.
    Then save the result to the file system.
    """

    logger.info("reading raw data...")
    df = pandas.read_json(ROOT_DIR.joinpath(in_path), lines=True)
    df = df[["reviewerID", "asin", "reviewText", "overall"]]
    df.columns = ["userID", "itemID", "review", "rating"]
    stop_words = get_stop_words()
    punctuations = get_punctuations()
    lemmatizer = nltk.WordNetLemmatizer()

    def clean_review(review: str):
        review = review.lower()
        assert "'" not in punctuations
        for p in punctuations:
            review = review.replace(p, " ")
        tokens = review.split()
        tokens = [word for word in tokens if word not in stop_words]
        tokens = [lemmatizer.lemmatize(word) for word in tokens]
        return " ".join(tokens)

    logger.info("cleaning review text...")
    df["review"] = df["review"].apply(clean_review)
    df.to_json(ROOT_DIR.joinpath(out_path), orient="records", lines=True)
    logger.info("Processed data saved.")
Example #10
def get_word_vec(path='data/GoogleNews-vectors-negative300.bin'):
    """
    Read the pre-trained word embedding model, and add "<pad>" to it with a zero vector.
    """

    logger.info("loading word2vec model...")
    path = ROOT_DIR.joinpath(path)
    word_vec = KeyedVectors.load_word2vec_format(path, binary=True)

    if PAD_WORD not in word_vec:
        word_vec.add([PAD_WORD], np.zeros([1, 300]))
        logger.info(f"Add PAD_WORD to word embedding.")

    assert PAD_WORD_ID == word_vec.vocab[PAD_WORD].index, \
        f"PAD_WORD_ID should be {word_vec.vocab[PAD_WORD].index} but not {PAD_WORD_ID}."

    logger.info("word2vec model loaded.")
    return word_vec
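
Compared with Example #4, this revision is idempotent: it only appends "<pad>" when the token is missing, and it asserts that the stored PAD_WORD_ID still matches, so a vocabulary saved with the pad token already in place loads safely.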
Example #11
def train_model(model: BaseModel,
                train_data: DataFrame,
                dev_data: DataFrame,
                is_save_model: bool = True):
    model_name = model.__class__.__name__
    train_time = time.localtime()
    add_log_file(
        logger, "log/%s_%s.log" %
        (model_name, time.strftime("%Y%m%d%H%M%S", train_time)))
    logger.info("Training %s..." % model_name)

    config: BaseConfig = model.config
    logger.info(config.__dict__)
    model.to(config.device)

    opt = torch.optim.Adam(model.parameters(),
                           config.learning_rate,
                           weight_decay=config.l2_regularization)
    lr_s = lr_scheduler.ExponentialLR(opt, gamma=config.learning_rate_decay)
    loss = torch.nn.MSELoss()

    last_progress = 0.
    min_loss = float("inf")

    pin_memory = config.device not in ["cpu", "CPU"]
    review_by_user, review_by_item = get_review_dict("train")
    dataset = NarreDataset(train_data, review_by_user, review_by_item, config)
    train_data_iter = DataLoader(dataset,
                                 batch_size=config.batch_size,
                                 shuffle=True,
                                 pin_memory=pin_memory)
    dataset = NarreDataset(dev_data, review_by_user, review_by_item, config)
    dev_data_iter = DataLoader(dataset,
                               batch_size=config.batch_size,
                               shuffle=True,
                               pin_memory=pin_memory)

    batches_num = math.ceil(len(train_data) / config.batch_size)
    while model.current_epoch < config.num_epochs:

        model.train()

        for batch_id, iter_i in enumerate(train_data_iter):
            user_review, user_id, item_id_per_review, item_review, item_id, user_id_per_review, rating = iter_i

            user_review = user_review.to(config.device)
            user_id = user_id.to(config.device)
            item_id_per_review = item_id_per_review.to(config.device)

            item_review = item_review.to(config.device)
            item_id = item_id.to(config.device)
            user_id_per_review = user_id_per_review.to(config.device)

            rating = rating.to(config.device)

            opt.zero_grad()
            predict = model(user_review, user_id, item_id_per_review,
                            item_review, item_id, user_id_per_review)
            li = loss(predict, rating)
            li.backward()
            opt.step()

            # log progress
            current_batches = model.current_epoch * batches_num + batch_id + 1
            total_batches = config.num_epochs * batches_num
            progress = current_batches / total_batches
            if progress - last_progress > 0.001:
                logger.debug(
                    "epoch %d, batch %d, loss: %f (%.2f%%)" %
                    (model.current_epoch, batch_id, li.item(), 100 * progress))
                last_progress = progress

        # complete one epoch
        train_loss = eval_model(model, train_data_iter, loss)
        dev_loss = eval_model(model, dev_data_iter, loss)
        logger.info("Epoch %d complete. Total loss(train/dev)=%f/%f" %
                    (model.current_epoch, train_loss, dev_loss))

        # save best model
        if train_loss < min_loss:
            min_loss = train_loss
            logger.info(f"Get min loss: {train_loss}")
            if is_save_model:
                save_model(model, train_time)

        lr_s.step(model.current_epoch)
        model.current_epoch += 1

    logger.info("%s trained!" % model_name)
    remove_log_file(logger)
    return min_loss
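
A usage sketch (model construction elided; data loading as in Example #12). The return value is the minimum training loss, and is_save_model=False allows dry runs, e.g. inside a hyperparameter search:

train_data, dev_data, _ = get_train_dev_test_data()
min_train_loss = train_model(model, train_data, dev_data, is_save_model=False)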
Example #12
import torch
from torch.utils.data import DataLoader

from utils.data_reader import get_train_dev_test_data, get_review_dict
from utils.data_set import NarreDataset
from utils.log_hepler import logger
from utils.train_helper import load_model, eval_model

train_data, dev_data, test_data = get_train_dev_test_data()
model = load_model("model/checkpoints/NarreModel_20200606153827.pt")
model.config.device = "cuda:1"
model.to(model.config.device)
loss = torch.nn.MSELoss()

review_by_user, review_by_item = get_review_dict("test")
dataset = NarreDataset(test_data, review_by_user, review_by_item, model.config)
data_iter = DataLoader(dataset,
                       batch_size=model.config.batch_size,
                       shuffle=True)

logger.info(f"Loss on test dataset: {eval_model(model, data_iter, loss)}")
Example #13
             "wb"))


def get_review_dict(data_type: str):
    user_review = pickle.load(
        open(ROOT_DIR.joinpath(f"data/user_review_word_idx_{data_type}.p"),
             "rb"))
    item_review = pickle.load(
        open(ROOT_DIR.joinpath(f"data/item_review_word_idx_{data_type}.p"),
             "rb"))
    return user_review, item_review
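
The truncated lines at the top of this example are the tail of the matching writer. A hypothetical sketch of that side, assuming the same file naming as get_review_dict (not the author's exact code; the grouping helper is invented for illustration):

def save_review_dict(data: DataFrame, word_vec, data_type: str):
    # build {userID: reviews-as-word-indices} and the itemID counterpart
    user_review = group_reviews_by(data, word_vec, key="userID")  # hypothetical helper
    item_review = group_reviews_by(data, word_vec, key="itemID")  # hypothetical helper
    pickle.dump(user_review,
                open(ROOT_DIR.joinpath(f"data/user_review_word_idx_{data_type}.p"), "wb"))
    pickle.dump(item_review,
                open(ROOT_DIR.joinpath(f"data/item_review_word_idx_{data_type}.p"), "wb"))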


if __name__ == "__main__":
    process_raw_data()
    logger.info(f"Max user id = {get_max_user_id()}")
    logger.info(f"Max item id = {get_max_item_id()}")

    train_data, dev_data, test_data = get_train_dev_test_data()
    known_data = pandas.concat([train_data, dev_data])
    all_data = pandas.concat([train_data, dev_data, test_data])

    logger.info(f"Max review length = {get_max_review_length(all_data)}")
    logger.info(f"Max review count = {get_max_review_count(all_data)}")

    word_vec = get_word_vec()
    save_embedding_weights(word_vec)

    save_review_dict(known_data, word_vec, "train")
    save_review_dict(all_data, word_vec, "test")
Example #14
        ids = ids[:config.review_count]
        # pad review count
        pad_length = config.review_count - len(reviews)
        pad_review = [config.pad_word_id] * config.review_length
        reviews += [pad_review] * pad_length
        ids += [pad_id] * pad_length

        return reviews, ids


if __name__ == '__main__':
    train_data, _, _ = get_train_dev_test_data()
    review_by_user, review_by_item = get_review_dict("train")

    config = NarreConfig(review_length=50,
                         review_count=20,
                         pad_word_id=PAD_WORD_ID,
                         pad_item_id=get_max_item_id() + 1,
                         pad_user_id=get_max_user_id() + 1)

    dataset = NarreDataset(train_data, review_by_user, review_by_item, config)
    loader = DataLoader(dataset, batch_size=128, shuffle=True)
    for user_review, user_id, item_id_per_review, item_review, item_id, user_id_per_review, rating in loader:
        logger.info(f"{user_review.shape}, "
                    f"{user_id.shape}"
                    f"{item_id_per_review.shape}, "
                    f"{item_review.shape}, "
                    f"{item_id.shape}"
                    f"{user_id_per_review.shape}, "
                    f"{rating.shape}")