示例#1
0
def user_rep_to_answers_and_questions(k, user_lines, posts_lines):
    """
    Classifies users based on the following:
        - user reputation
        - number of posts with PostTypeId 2 (answers) the user owns
        - number of posts with PostTypeId 1 (questions) the user owns

    The data sets are combined with ``join``, so only users that occur in
    every one of them contribute a feature tuple.

    :param k: Number of clusters
    :param user_lines: RDD with the lines of the users XML file
    :param posts_lines: RDD with the lines of the posts XML file
    :return: Result of ``KMeans(k).fit`` on (rep, answers, questions) tuples
    """
    def posts_per_user(owner_and_type, wanted_type):
        # (user_id, number of owned posts whose type equals wanted_type)
        return (owner_and_type
                .filter(lambda post: post[1] == wanted_type)
                .map(lambda post: (post[0], 1))
                .reduceByKey(add))

    def flatten(record):
        # (user_id, (rep, (answers, questions))) -> (rep, answers, questions)
        rep, (n_answers, n_questions) = record[1]
        return (rep, n_answers, n_questions)

    # (user_id, user_rep)
    user_rep = (user_lines
                .map(lambda line: xml_parser.extract_attributes(
                    line, ['Id', 'Reputation'], int))
                .filter(lambda pair: helpers.is_valid_tuple(pair, 2)))

    # (user_id, post_type)
    owner_and_type = (posts_lines
                      .map(lambda line: xml_parser.extract_attributes(
                          line, ['OwnerUserId', 'PostTypeId'], int))
                      .filter(lambda pair: helpers.is_valid_tuple(pair, 2)))

    question_counts = posts_per_user(owner_and_type, 1)
    answer_counts = posts_per_user(owner_and_type, 2)

    # (user_id, (user_rep, (answer_count, question_count)))
    joined = user_rep.join(answer_counts.join(question_counts))

    return KMeans(k).fit(joined.map(flatten))
示例#2
0
def user__reputation__to__own_questions_answered(k, user_lines, post_lines):
    """
    Classifies users based on the following:
        - user reputation
        - number of the user's own questions that the user also answered

    A question counts once per user no matter how many answers that user
    posted on it, because the matching is done with ``intersection``.

    :param k: Number of clusters
    :param user_lines: RDD with the lines of the users XML file
    :param post_lines: RDD with the lines of the posts XML file
    :return: Result of ``KMeans(k).fit`` on (rep, self_answered) tuples
    """
    def keyed_owner(key_attribute, post_type):
        # (value of key_attribute, owner_user_id) for posts of post_type
        return (post_lines
                .map(lambda line: xml_parser.extract_attributes(
                    line, [key_attribute, 'OwnerUserId', 'PostTypeId'], int))
                .filter(lambda row: helpers.is_valid_tuple(row, 3)
                        and row[2] == post_type)
                .map(lambda row: (row[0], row[1])))

    # (user_id, rep)
    rep_by_user = (user_lines
                   .map(lambda line: xml_parser.extract_attributes(
                       line, ['Id', 'Reputation'], int))
                   .filter(lambda pair: helpers.is_valid_tuple(pair, 2)))

    # (question_id, asker_user_id)
    question_and_asker = keyed_owner('Id', 1)

    # (question_id, answerer_user_id) -- ParentId of an answer is its question
    question_and_answerer = keyed_owner('ParentId', 2)

    # Pairs present on both sides are questions answered by their own asker.
    self_answered = question_and_asker.intersection(question_and_answerer)

    # (user_id, n_questions_self_answered)
    self_answered_by_user = (self_answered
                             .map(lambda pair: (pair[1], 1))
                             .reduceByKey(add))

    # (rep, n_questions_self_answered)
    features = (rep_by_user
                .join(self_answered_by_user)
                .map(lambda record: (record[1][0], record[1][1])))

    return KMeans(k).fit(features)
示例#3
0
def user__upvotes_cast__to__average_post_length__to__profile_views(
        k, users, posts):
    """
    Classifies users based on the following:
        - amount of upvotes a user has cast
        - user's profile views
        - user's average post length (length of the ``Body`` attribute)

    The features are emitted in the order listed above.

    :param k: Number of clusters
    :param users: RDD with XML file with users
    :param posts: RDD with XML file with posts
    :return: Result of ``KMeans(k).fit`` on the feature tuples
    """
    def add_length(running, length):
        # running is (total_length, post_count) for one user
        return (running[0] + length, running[1] + 1)

    def merge_partials(left, right):
        # combine two (total_length, post_count) partial results
        return (left[0] + right[0], left[1] + right[1])

    # (user_id, body_length)
    body_lengths = (posts
                    .map(lambda line: xml_parser.extract_attributes(
                        line, ['OwnerUserId', 'Body']))
                    .filter(lambda pair: helpers.is_valid_tuple(pair, 2))
                    .map(lambda pair: (int(pair[0]), len(pair[1]))))

    # (user_id, average_post_length)
    average_length = (body_lengths
                      .aggregateByKey((0, 0), add_length, merge_partials)
                      .mapValues(lambda totals: totals[0] / totals[1]))

    # (user_id, (upvotes_cast, profile_views))
    votes_and_views = (users
                       .map(lambda line: xml_parser.extract_attributes(
                           line, ['Id', 'UpVotes', 'Views'], int))
                       .filter(lambda row: helpers.is_valid_tuple(row, 3))
                       .map(lambda row: (row[0], (row[1], row[2]))))

    def flatten(record):
        # (user_id, ((upvotes, views), avg_length)) -> flat feature tuple
        (upvotes, views), avg_length = record[1]
        return (upvotes, views, avg_length)

    features = votes_and_views.join(average_length).map(flatten)

    return KMeans(k).fit(features)
示例#4
0
def user_questions_answered(k, posts_lines):
    """
    Classifies users based on the number of posts with PostTypeId 2 they own.

    :param k: Number of clusters
    :param posts_lines: RDD with the lines of the posts XML file
    :return: Result of ``KMeans(k).fit`` on one-element (count,) tuples
    """
    # (user_id, post_type), restricted to post_type == 2
    owned_answers = (posts_lines
                     .map(lambda line: xml_parser.extract_attributes(
                         line, ['OwnerUserId', 'PostTypeId'], int))
                     .filter(lambda pair: helpers.is_valid_tuple(pair, 2)
                             and pair[1] == 2))

    # (user_id, n_answers)
    answers_per_user = (owned_answers
                        .map(lambda pair: (pair[0], 1))
                        .reduceByKey(add))

    # The user id is dropped; only the count is used as a feature.
    features = answers_per_user.map(lambda pair: (pair[1],))

    return KMeans(k).fit(features)
示例#5
0
def user__reputation__to__distinct_post_tags(k, user_lines, post_lines):
    """
    Classifies users based on the following:
        - user reputation
        - number of distinct tags across the posts the user owns

    :param k: Number of clusters
    :param user_lines: RDD with the lines of the users XML file
    :param post_lines: RDD with the lines of the posts XML file
    :return: Result of ``KMeans(k).fit`` on (rep, distinct_tags) tuples
    """
    def split_tags(pair):
        # "<a><b>" -> "<a<b" -> "a<b" -> ["a", "b"]
        return (int(pair[0]), pair[1].replace(">", "")[1:].split("<"))

    # (user_id, reputation)
    rep_by_user = (user_lines
                   .map(lambda line: xml_parser.extract_attributes(
                       line, ['Id', 'Reputation'], int))
                   .filter(lambda pair: helpers.is_valid_tuple(pair, 2)))

    # (user_id, tag), one record per distinct pair
    user_tag_pairs = (post_lines
                      .map(lambda line: xml_parser.extract_attributes(
                          line, ['OwnerUserId', 'Tags']))
                      .filter(lambda pair: helpers.is_valid_tuple(pair, 2))
                      .map(split_tags)
                      .flatMapValues(lambda tags: tags)
                      .distinct())

    # (user_id, number_distinct_tags)
    tags_per_user = (user_tag_pairs
                     .map(lambda pair: (pair[0], 1))
                     .reduceByKey(add))

    # (rep, number_distinct_tags)
    features = (rep_by_user
                .join(tags_per_user)
                .map(lambda record: (record[1][0], record[1][1])))

    return KMeans(k).fit(features)
示例#6
0
def user__badges__to__signup__to__answers_and_questions(
        k, user_lines, badges_lines, posts_lines):
    """
    Classifies users based on the following:
        - number of posts with PostTypeId 1 (questions) the user owns
        - number of posts with PostTypeId 2 (answers) the user owns
        - number of badge rows that reference the user
        - signup date, converted by ``helpers.datetime_to_timestamp``

    The features are emitted in the order listed above. All data sets are
    combined with ``join``, so only users that occur in every one of them
    contribute a feature tuple.

    :param k: Number of clusters
    :param user_lines: RDD with the lines of the users XML file
    :param badges_lines: RDD with the lines of the badges XML file
    :param posts_lines: RDD with the lines of the posts XML file
    :return: Result of ``KMeans(k).fit`` on the feature tuples
    """
    def count_per_user(keyed):
        # keyed holds tuples whose first element is a user id
        return keyed.map(lambda row: (row[0], 1)).reduceByKey(add)

    def to_features(record):
        # record: (user_id, (((signup, n_badges), n_answers), n_questions))
        (signup_and_badges, n_answers), n_questions = record[1]
        signup, n_badges = signup_and_badges
        return (n_questions, n_answers, n_badges, signup)

    # (user_id, signup)
    signup_by_user = (user_lines
                      .map(lambda line: xml_parser.extract_attributes(
                          line, ['Id', 'CreationDate']))
                      .filter(lambda pair: helpers.is_valid_tuple(pair, 2))
                      .map(lambda pair: (
                          int(pair[0]),
                          helpers.datetime_to_timestamp(pair[1]))))

    # (user_id, number_badges)
    badges_by_user = count_per_user(
        badges_lines
        .map(lambda line: xml_parser.extract_attributes(
            line, ['UserId'], int))
        .filter(lambda row: helpers.is_valid_tuple(row, 1)))

    # (user_id, post_type)
    owner_and_type = (posts_lines
                      .map(lambda line: xml_parser.extract_attributes(
                          line, ['OwnerUserId', 'PostTypeId'], int))
                      .filter(lambda pair: helpers.is_valid_tuple(pair, 2)))

    # (user_id, n_answers)
    answers_by_user = count_per_user(
        owner_and_type.filter(lambda pair: pair[1] == 2))

    # (user_id, n_asked)
    questions_by_user = count_per_user(
        owner_and_type.filter(lambda pair: pair[1] == 1))

    # (n_questions, n_answers, n_badges, signup)
    features = (signup_by_user
                .join(badges_by_user)
                .join(answers_by_user)
                .join(questions_by_user)
                .map(to_features))

    return KMeans(k).fit(features)
示例#7
0
def user__signup__to__distinct_post_tags(k, user_lines, post_lines):
    """
    Classifies users based on the following:
        - signup date, converted by ``helpers.datetime_to_timestamp``
        - number of distinct tags across the posts the user owns

    :param k: Number of clusters
    :param user_lines: RDD with the lines of the users XML file
    :param post_lines: RDD with the lines of the posts XML file
    :return: Result of ``KMeans(k).fit`` on (signup, distinct_tags) tuples
    """
    def split_tags(pair):
        # "<a><b>" -> "<a<b" -> "a<b" -> ["a", "b"]
        return (int(pair[0]), pair[1].replace(">", "")[1:].split("<"))

    # (user_id, signup_as_timestamp)
    signup_by_user = (user_lines
                      .map(lambda line: xml_parser.extract_attributes(
                          line, ['Id', 'CreationDate']))
                      .filter(lambda pair: helpers.is_valid_tuple(pair, 2))
                      .map(lambda pair: (
                          int(pair[0]),
                          helpers.datetime_to_timestamp(pair[1]))))

    # (user_id, tag), one record per distinct pair
    user_tag_pairs = (post_lines
                      .map(lambda line: xml_parser.extract_attributes(
                          line, ['OwnerUserId', 'Tags']))
                      .filter(lambda pair: helpers.is_valid_tuple(pair, 2))
                      .map(split_tags)
                      .flatMapValues(lambda tags: tags)
                      .distinct())

    # (user_id, number_distinct_tags)
    tags_per_user = (user_tag_pairs
                     .map(lambda pair: (pair[0], 1))
                     .reduceByKey(add))

    # (signup_as_timestamp, number_distinct_tags)
    features = (signup_by_user
                .join(tags_per_user)
                .map(lambda record: (record[1][0], record[1][1])))

    return KMeans(k).fit(features)
示例#8
0
def user__membership_time__to__closed_questions(k, users, posts, post_history):
    """
    Classifies users based on the following:
        - the user's signup date, converted by
          ``helpers.datetime_to_timestamp``
        - number of post-history rows with PostHistoryTypeId 10 or 12 that
          refer to any post the user owns (presumably close and delete
          events -- confirm against the data dump schema)

    :param k: Number of clusters
    :param users: RDD with XML file with users
    :param posts: RDD with XML file with posts
    :param post_history: RDD with XML file with post history
    :return: Result of ``KMeans(k).fit`` on (timestamp, count) tuples
    """
    # (user_id, timestamp)
    signup_by_user = (users
                      .map(lambda line: xml_parser.extract_attributes(
                          line, ['Id', 'CreationDate']))
                      .filter(lambda pair: helpers.is_valid_tuple(pair, 2))
                      .map(lambda pair: (
                          int(pair[0]),
                          helpers.datetime_to_timestamp(pair[1]))))

    # (post_id, author_user_id)
    author_by_post = (posts
                      .map(lambda line: xml_parser.extract_attributes(
                          line, ['Id', 'OwnerUserId'], int))
                      .filter(lambda pair: helpers.is_valid_tuple(pair, 2)))

    # (post_id, number of history rows of type 10 or 12)
    events_by_post = (post_history
                      .map(lambda line: xml_parser.extract_attributes(
                          line, ['PostId', 'PostHistoryTypeId'], int))
                      .filter(lambda pair: helpers.is_valid_tuple(pair, 2)
                              and pair[1] in [10, 12])
                      .map(lambda pair: (pair[0], 1))
                      .reduceByKey(add))

    # (post_id, (author_user_id, count)) -> (author_user_id, count),
    # then summed over all posts of the same author
    events_by_user = (author_by_post
                      .join(events_by_post)
                      .map(lambda record: record[1])
                      .reduceByKey(add))

    # (timestamp, count)
    features = (signup_by_user
                .join(events_by_user)
                .map(lambda record: record[1]))

    return KMeans(k).fit(features)
示例#9
0
def post__edits__average__to__user_rep(k, user_lines, post_history_lines):
    """
    Classifies users based on the following:
        - user's reputation
        - number of post-history rows with PostHistoryTypeId 5 whose
          ``UserId`` is the user

    NOTE(review): the count is keyed on the history row's ``UserId``; whether
    that is the editor or the post owner is not visible here -- confirm
    against the data dump schema. Despite the function name, the value is a
    plain count, not an average.

    :param k: Number of clusters
    :param user_lines: RDD with XML file with users
    :param post_history_lines: RDD with XML file with post history
    :return: Result of ``KMeans(k).fit`` on (reputation, count) tuples
    """
    # (user_id, reputation)
    rep_by_user = (user_lines
                   .map(lambda line: xml_parser.extract_attributes(
                       line, ['Id', 'Reputation'], int))
                   .filter(lambda pair: helpers.is_valid_tuple(pair, 2)))

    # (user_id, number of type-5 history rows)
    edits_by_user = (post_history_lines
                     .map(lambda line: xml_parser.extract_attributes(
                         line, ['UserId', 'PostHistoryTypeId'], int))
                     .filter(lambda pair: helpers.is_valid_tuple(pair, 2)
                             and pair[1] == 5)
                     .map(lambda pair: (pair[0], 1))
                     .reduceByKey(add))

    # (reputation, number of type-5 history rows)
    features = (rep_by_user
                .join(edits_by_user)
                .map(lambda record: record[1]))

    return KMeans(k).fit(features)
示例#10
0
def user__reputation__to__upvotes_cast(k, user_lines):
    """
    Classifies users based on the following:
        - user reputation
        - user upvotes cast

    :param k: Number of clusters
    :param user_lines: PythonRDD containing the lines in the users XML file
    :return: Result of ``KMeans(k).fit`` on (reputation, upvotes) tuples
    """
    def parse(line):
        # (reputation, upvotes), both cast to int by the parser
        return xml_parser.extract_attributes(
            line, ['Reputation', 'UpVotes'], int)

    def is_complete(pair):
        return helpers.is_valid_tuple(pair, 2)

    features = user_lines.map(parse).filter(is_complete)

    return KMeans(k).fit(features)
示例#11
0
def length__aboutme__to__user_rep(k, user_lines):
    """
    Classifies users based on the following:
        - user reputation
        - length of about me

    The features are emitted in the order listed above.

    :param k: Number of clusters
    :param user_lines: RDD with XML file with users
    :return: Result of ``KMeans(k).fit`` on (reputation, length) tuples
    """
    # (reputation, about_me), both still strings at this point
    raw_pairs = (user_lines
                 .map(lambda line: xml_parser.extract_attributes(
                     line, ['Reputation', 'AboutMe']))
                 .filter(lambda pair: helpers.is_valid_tuple(pair, 2)))

    # (reputation as int, length of about_me)
    features = raw_pairs.map(lambda pair: (int(pair[0]), len(pair[1])))

    return KMeans(k).fit(features)
示例#12
0
 def test_extract_attributes(self):
     # The requested attributes are expected back in request order, with
     # the whitespace padding inside the quoted values removed.
     element = '<user id="19" date="  2009-01-12T19:07:38.647  " name="Tassos  " />'
     wanted = ["date", "name"]
     expected = ("2009-01-12T19:07:38.647", "Tassos")
     actual = xml_parser.extract_attributes(element, wanted)
     self.assertEqual(actual, expected)
示例#13
0
 def test_extract_attributes_casts_values(self):
     # Passing int as the third argument is expected to yield integers
     # instead of the raw attribute strings.
     element = '<user id="19" rep="90050" year="2017" />'
     wanted = ["id", "rep", "year"]
     expected = (19, 90050, 2017)
     actual = xml_parser.extract_attributes(element, wanted, int)
     self.assertEqual(actual, expected)
示例#14
0
def user_downvotes_cast(k, user_lines):
    """
    Classifies users based on the ``DownVotes`` attribute of their user row.

    :param k: Number of clusters
    :param user_lines: RDD with the lines of the users XML file
    :return: Result of ``KMeans(k).fit`` on one-element (downvotes,) tuples
    """
    def parse(line):
        return xml_parser.extract_attributes(line, ['DownVotes'], int)

    def is_complete(row):
        return helpers.is_valid_tuple(row, 1)

    features = user_lines.map(parse).filter(is_complete)

    return KMeans(k).fit(features)
示例#15
0
def user_rep(k, user_lines):
    """
    Classifies users based on the ``Reputation`` attribute of their user row.

    :param k: Number of clusters
    :param user_lines: RDD with the lines of the users XML file
    :return: Result of ``KMeans(k).fit`` on one-element (reputation,) tuples
    """
    def parse(line):
        return xml_parser.extract_attributes(line, ['Reputation'], int)

    def is_complete(row):
        return helpers.is_valid_tuple(row, 1)

    features = user_lines.map(parse).filter(is_complete)

    return KMeans(k).fit(features)