예제 #1
0
def random_algorithm(user, candidate_news, num_timestep=1000, top_N=1):
    '''
    Baseline recommender: show randomly drawn articles and log the reactions.

    On each of ``num_timestep`` rounds ``random_select`` is asked for
    ``top_N`` keys from the pool of not-yet-shown articles. Only the first
    returned key is actually presented to the user; the rest are ignored.

    Args:
        user: Passed through unchanged to ``user_interaction``.
        candidate_news: Mapping from article key to an article record. Each
            record must provide an ``"article_id"`` entry, which is what gets
            removed from the pool after the article is shown.
        num_timestep: Number of recommendation rounds.
        top_N: How many keys to request from ``random_select`` per round.

    Returns:
        A dict with two lists of article records: ``"clicked"`` (rounds for
        which ``user_interaction`` returned a truthy value) and ``"shown"``
        (every presented article, in presentation order).

    Raises:
        KeyError: If a shown record's ``"article_id"`` is not (or is no
            longer) in the pool of remaining keys.
    '''
    clicked = []
    shown = []

    # Keys that have not been presented yet.
    remaining = set(candidate_news.keys())

    for _ in range(num_timestep):
        drawn = random_select(remaining, top_N)
        article = candidate_news[drawn[0]]

        if user_interaction(user, article, ranked=False):
            clicked.append(article)
        shown.append(article)

        # Removal goes through the record's own id rather than the drawn key;
        # presumably the two coincide -- verify against the data loader.
        remaining.remove(article["article_id"])

    return {"clicked": clicked, "shown": shown}
예제 #2
0
def oracle_algorithm(user, candidate_news, num_timestep=1000, top_N=1):
    '''
    Oracle recommender: show articles in a ranking computed once up front.

    ``highest(user, candidate_news, num_timestep)`` is called a single time
    before the loop; its result is then indexed by round number, so round
    ``i`` presents the article whose key is at position ``i``. According to
    the original author's note the ranking is sorted in descending order.

    Args:
        user: Passed through unchanged to ``highest`` and
            ``user_interaction``.
        candidate_news: Mapping from article key to an article record. Each
            record must provide an ``"article_id"`` entry.
        num_timestep: Number of recommendation rounds; also forwarded to
            ``highest`` as the number of articles to rank.
        top_N: Accepted for signature parity with the other algorithms; it is
            not used here.

    Returns:
        A dict with two lists of article records: ``"clicked"`` (rounds for
        which ``user_interaction`` returned a truthy value) and ``"shown"``
        (every presented article, in presentation order).

    Raises:
        KeyError: If a shown record's ``"article_id"`` is not (or is no
            longer) among the remaining keys, e.g. when the ranking repeats
            an article.
    '''
    # Only used as a guard: removing an id twice raises KeyError.
    remaining = set(candidate_news.keys())

    ranking = highest(user, candidate_news, num_timestep)

    clicked = []
    shown = []

    for step in range(num_timestep):
        article = candidate_news[ranking[step]]

        if user_interaction(user, article, ranked=False):
            clicked.append(article)
        shown.append(article)

        remaining.remove(article["article_id"])

    return {"clicked": clicked, "shown": shown}
예제 #3
0
def random_bootstrap_v2(user_vector,
                        issue_mapping,
                        candidate_news,
                        budget,
                        seed=42):
    '''
    Label ``budget`` uniformly shuffled articles by simulated user feedback.

    All article keys are shuffled in place with ``np.random.shuffle`` (i.e.
    using NumPy's global random state) and the first ``budget`` of them are
    presented to the user via ``user_interaction``.

    Args:
        user_vector: Passed through unchanged to ``user_interaction``.
        issue_mapping: Unused; kept so the signature matches
            ``random_bootstrap``.
        candidate_news: Mapping from article key to an article record.
        budget: Maximum number of articles to label (slicing means fewer are
            labelled when there are fewer candidates).
        seed: Unused; the function does not reseed the random generator.

    Returns:
        A ``(pos_pks, neg_pks)`` tuple of key lists: keys for which
        ``user_interaction`` returned a truthy value, and the rest.
    '''
    shuffled = list(candidate_news.keys())
    np.random.shuffle(shuffled)

    pos_pks, neg_pks = [], []

    for pk in shuffled[:budget]:
        clicked = user_interaction(user_vector, candidate_news[pk])
        target = pos_pks if clicked else neg_pks
        target.append(pk)

    return pos_pks, neg_pks
예제 #4
0
def cbnf_algorithm(user_vector,
                   candidate_news,
                   ran_idx,
                   partisan_weights,
                   randomness=0.0,
                   bootstrap=700,
                   num_rec=1000):
    '''
    Content-based recommender driven by an incrementally trained classifier.

    Workflow:
      1. Seed NumPy's global random state with ``ran_idx``.
      2. Label ``bootstrap`` articles through ``random_bootstrap``.
      3. Fit an ``SGDClassifier`` on those labels, using balanced class
         weights and per-sample weights from ``build_train``.
      4. For ``num_rec`` rounds, pick one unlabeled article -- uniformly at
         random with probability ``randomness``, otherwise the one with the
         highest predicted probability for class 1 -- show it, and update the
         classifier with the observed label via ``partial_fit``.

    Args:
        user_vector: Passed through unchanged to ``random_bootstrap`` and
            ``user_interaction``.
        candidate_news: Mapping from article key to an article record. Shown
            records must provide ``"feature_vector"`` and
            ``'source_partisan_score'`` entries.
        ran_idx: Seed for ``np.random.seed``.
        partisan_weights: Mapping from a record's
            ``'source_partisan_score'`` to the sample weight used when
            updating the classifier.
        randomness: Probability of recommending a random unlabeled article.
        bootstrap: Labelling budget forwarded to ``random_bootstrap``.
        num_rec: Number of recommendation rounds.

    Returns:
        A dict with keys ``"bootstrap_pos"`` / ``"bootstrap_neg"`` (copies of
        the bootstrap key lists), ``"clicked"`` and ``"shown"`` (lists of
        article records from the recommendation rounds), and ``'clf'`` (the
        trained classifier).
    '''
    np.random.seed(ran_idx)

    labeled_pos, labeled_neg = random_bootstrap(
        user_vector, get_issue_mapping(candidate_news), candidate_news,
        bootstrap)

    # Everything that was not labelled during the bootstrap.
    pool = set(candidate_news.keys())
    pool.difference_update(labeled_pos, labeled_neg)

    # Snapshot the bootstrap labels now: the working lists keep growing below.
    history = {
        "bootstrap_pos": list(labeled_pos),
        "bootstrap_neg": list(labeled_neg),
        "clicked": [],
        "shown": [],
    }

    x_train, y_train, w_train = build_train(candidate_news, labeled_pos,
                                            labeled_neg, partisan_weights)
    # x_test rows and x_test_pk entries are kept aligned: the same position
    # is deleted from both after every recommendation.
    x_test, x_test_pk = build_test(candidate_news, pool)

    balanced = compute_class_weight('balanced',
                                    classes=np.unique(y_train),
                                    y=y_train)

    # NOTE(review): newer scikit-learn releases spell this loss "log_loss";
    # confirm against the version this project pins.
    cls = SGDClassifier(loss="log",
                        class_weight={0: balanced[0], 1: balanced[1]})
    cls.fit(x_train, y_train, sample_weight=w_train)

    for _ in range(num_rec):

        if np.random.rand() < randomness:
            # Exploration: any remaining article, uniformly.
            pick = np.random.randint(0, len(x_test_pk))
        else:
            # Exploitation: last position of the ascending sort, i.e. the
            # highest predicted probability for class 1.
            click_prob = cls.predict_proba(x_test)[:, 1]
            pick = np.argsort(click_prob)[-1]

        picked_pk = x_test_pk[pick]
        article = candidate_news[picked_pk]

        pool.remove(picked_pk)

        clicked = user_interaction(user_vector, article, ranked=True)

        features = np.array([article["feature_vector"]])

        if clicked:
            label = [1]
            labeled_pos.append(picked_pk)
            history["clicked"].append(article)
        else:
            label = [0]
            labeled_neg.append(picked_pk)

        # Drop the shown article from the unlabeled matrix and its key list.
        x_test = np.delete(x_test, pick, 0)
        x_test_pk.pop(pick)

        weight = [partisan_weights[article['source_partisan_score']]]

        cls.partial_fit(features, label, sample_weight=weight)

        history["shown"].append(article)

    history['clf'] = cls

    return history
예제 #5
0
def random_bootstrap(user_vector,
                     issue_mapping,
                     candidate_news,
                     budget,
                     seed=42):
    '''
    Label roughly the same number of random articles from each chosen topic.

    ``budget`` is split evenly over a fixed list of topics (the last topic
    absorbs the remainder). For every topic, the article keys of all its
    leanings are pooled, shuffled with ``np.random.shuffle`` (NumPy's global
    random state) and drawn one by one until the topic's quota is met. Each
    drawn article is shown to the user via ``user_interaction``.

    An article is labelled at most once: a key already drawn for any topic is
    skipped when it comes up again. If a topic runs out of unseen articles
    before its quota is met, drawing simply moves on to the next topic, so
    the result can hold fewer than ``budget`` keys.

    Args:
        user_vector: Passed through unchanged to ``user_interaction``.
        issue_mapping: Mapping ``topic -> {leaning -> list of article keys}``.
            It is not modified.
        candidate_news: Mapping from article key to an article record.
        budget: Total number of articles to label.
        seed: Unused; kept for backward compatibility. The function does not
            reseed the random generator.

    Returns:
        A ``(pos_pks, neg_pks)`` tuple of key lists: keys for which
        ``user_interaction`` returned a truthy value, and the rest.

    Raises:
        KeyError: If one of the chosen topics is missing from
            ``issue_mapping``.
    '''
    chosen_topic = [
        "abortion",
        "environment",
        "guns",
        "health care",
        "immigration",
        "LGBTQ",
        "racism",
        "taxes",
        "technology",
        "trade",
        "trump impeachment",
        "us military",
        "us 2020 election",
        "welfare",
    ]

    l_topic = len(chosen_topic)

    # Equal share per topic; the last topic takes whatever is left over.
    size_list = [budget // l_topic] * (l_topic - 1)
    size_list.append(budget - sum(size_list))

    # Flatten every topic's leanings into a fresh list, so the shuffling and
    # popping below never touch the caller's issue_mapping.
    topic_mapping = {
        topic: [pk for pks in by_lean.values() for pk in pks]
        for topic, by_lean in issue_mapping.items()
    }

    pos_pks = []
    neg_pks = []

    # Every key labelled so far, across all topics.
    overall = set()

    for size, topic in zip(size_list, chosen_topic):

        candidate_pks = topic_mapping[topic]
        np.random.shuffle(candidate_pks)

        taken = 0

        # Stop at the quota, or earlier when the topic has nothing left.
        while taken < size and candidate_pks:

            pk = candidate_pks.pop()

            if pk in overall:
                continue

            # Record the key immediately so it can never be labelled twice,
            # neither within this topic nor by a later one.
            overall.add(pk)

            if user_interaction(user_vector, candidate_news[pk]):
                pos_pks.append(pk)
            else:
                neg_pks.append(pk)

            taken += 1

    return pos_pks, neg_pks
def cf_algorithm(users,
                 candidate_news,
                 random_seed,
                 randomness,
                 hidden_dim=40,
                 bootstrap_size=700,
                 num_timestep=1000):
    '''
    Collaborative-filtering recommender based on matrix factorization.

    A user-by-article click matrix is initialised from a per-user bootstrap
    (``random_bootstrap``). Then, for ``num_timestep`` rounds, the matrix is
    factorized with ``non_negative_factorization`` and every user is shown
    one article they have not seen yet: a random one with probability
    ``randomness``, otherwise the unseen article with the highest
    reconstructed score. Clicks are written back into the matrix before the
    next round.

    A user who has already seen every article is skipped for the round
    instead of being shown a repeated article.

    Args:
        users: Mapping indexed by ``0 .. len(users) - 1``; each entry must
            provide ``'vec'`` (forwarded to ``random_bootstrap`` and
            ``user_interaction``) and ``'ptt'`` (copied into the history).
        candidate_news: Mapping from article key to an article record.
        random_seed: ``random_state`` for the factorization.
        randomness: Probability of recommending a random unseen article.
        hidden_dim: Number of factorization components.
        bootstrap_size: Labelling budget per user for ``random_bootstrap``.
        num_timestep: Number of recommendation rounds.

    Returns:
        A dict mapping each user index to a dict with keys ``'prototype'``
        (the user's ``'ptt'`` value), ``'shown'`` and ``'clicked'`` (lists of
        article records from the recommendation rounds).
    '''
    issue_mapping = get_issue_mapping(candidate_news)

    # Two-way mapping between article keys and matrix column indices.
    pk2col = {}
    col2pk = {}
    for col, pk in enumerate(candidate_news):
        pk2col[pk] = col
        col2pk[col] = pk

    history = {
        idx: {
            'prototype': users[idx]['ptt'],
            'shown': [],
            'clicked': [],
        }
        for idx in range(len(users))
    }

    matrix = np.zeros((len(users), len(candidate_news)))

    # Per user: every article key already presented (bootstrap included).
    users_clicked_pool = {}

    # Initialize the bootstrap for each user
    for idx, user in sorted(users.items()):

        pos_pks, neg_pks = random_bootstrap(user['vec'], issue_mapping,
                                            candidate_news, bootstrap_size)

        users_clicked_pool[idx] = set(pos_pks)
        users_clicked_pool[idx].update(neg_pks)

        for pk in pos_pks:
            matrix[idx, pk2col[pk]] = 1

    for _ in range(num_timestep):

        # Refit on the current click matrix every round.
        W, H, _n_iter = non_negative_factorization(matrix,
                                                   n_components=hidden_dim,
                                                   init='random',
                                                   random_state=random_seed,
                                                   max_iter=250)
        new_matrix = np.matmul(W, H)

        for idx, row in enumerate(new_matrix):

            user_vector = users[idx]['vec']
            seen = users_clicked_pool[idx]

            if np.random.rand() < randomness:
                # Exploration: walk the articles in a fresh random order.
                # The list is rebuilt each time so the shuffle always starts
                # from the same base order.
                ordering = list(candidate_news.keys())
                np.random.shuffle(ordering)
            else:
                # Exploitation: walk the articles from the highest to the
                # lowest reconstructed score.
                ordering = (col2pk[col] for col in np.argsort(row)[::-1])

            for pk in ordering:
                if pk not in seen:
                    candidate_pk = pk
                    break
            else:
                # Nothing left to show this user; without this guard a stale
                # candidate from an earlier iteration would be reused.
                continue

            # Remove from the pool
            seen.add(candidate_pk)

            top_news = candidate_news[candidate_pk]
            flag = user_interaction(user_vector, top_news, ranked=True)

            top_col_idx = pk2col[candidate_pk]

            if flag:
                matrix[idx, top_col_idx] = 1
                history[idx]['clicked'].append(top_news)
            else:
                # Invariant: an unseen article cannot already be marked as
                # clicked.
                assert matrix[idx, top_col_idx] == 0

            history[idx]['shown'].append(top_news)

    return history