示例#1
0
def read_user_features(about, user_ids, user_id_mapping):
    """Build a ``Features`` object describing users.

    ``user_id_mapping`` is stored as the ``item_ids`` of the result and is
    iterated to enumerate the users that receive per-user features.

    :param about: when truthy, add an ``'intercept'`` feature for every id in
        ``user_id_mapping`` and one ``'about:<token>'`` feature per token
        obtained from each ``(user_id, about_me)`` record yielded by
        ``read_user_data()``.
    :param user_ids: when truthy, add a ``'user_id:<uid>'`` feature for every
        id in ``user_id_mapping``.
    :param user_id_mapping: iterable of user ids; ids are concatenated onto a
        string prefix, so they are presumably strings -- confirm with caller.
    :returns: the populated ``Features`` instance, after ``set_shape()``.
    """
    user_features = Features()
    user_features.item_ids = user_id_mapping

    if user_ids:
        for uid in user_id_mapping:
            user_features.add_feature(uid, 'user_id:' + uid)

    if about:
        # Every known user gets an intercept feature.
        for uid in user_id_mapping:
            user_features.add_feature(uid, 'intercept')

        for user_id, about_me in read_user_data():
            # Strip markup, flatten newlines and lowercase before the text
            # is handed to _process_post_body.
            flattened = strip_tags(about_me).replace('\n', ' ').lower()

            for token in _process_post_body(flattened):
                user_features.add_feature(user_id, 'about:' + token)

    user_features.set_shape()

    return user_features
示例#2
0
def read_user_features(about, user_ids, user_id_mapping):
    """Assemble user-side features into a ``Features`` container.

    The container's ``item_ids`` attribute is set to ``user_id_mapping``.

    :param about: if truthy, every id in ``user_id_mapping`` gets an
        ``'intercept'`` feature, and each ``(user_id, about_me)`` record from
        ``read_user_data()`` contributes ``'about:<token>'`` features.
    :param user_ids: if truthy, every id in ``user_id_mapping`` gets a
        ``'user_id:<uid>'`` indicator feature.
    :param user_id_mapping: iterable of user ids (presumably strings, since
        they are appended to a string prefix -- confirm with caller).
    :returns: the ``Features`` container after ``set_shape()`` was called.
    """
    result = Features()
    result.item_ids = user_id_mapping
    add_feature = result.add_feature

    if user_ids:
        for uid in user_id_mapping:
            add_feature(uid, 'user_id:' + uid)

    if about:
        # Add intercepts
        for uid in user_id_mapping:
            add_feature(uid, 'intercept')

        for user_id, about_me in read_user_data():
            # Tags removed, newlines turned into spaces, text lowercased.
            text = strip_tags(about_me)
            text = text.replace('\n', ' ')
            text = text.lower()
            tokens = _process_post_body(text)

            for token in tokens:
                add_feature(user_id, 'about:' + token)

    result.set_shape()

    return result
示例#3
0
def read_post_features(tags, post_ids, post_text):
    """Build a ``Features`` object for the question posts.

    Records come from ``read_post_data()`` as
    ``(post_id, user_id, post_tags, parent_post_id, body)`` tuples. Only
    records whose ``parent_post_id`` is ``None`` are registered as items.

    :param tags: when truthy, add a ``'tag:<tag>'`` feature per entry of the
        post's tags.
    :param post_ids: when truthy, add a ``'post_id:<post_id>'`` feature.
    :param post_text: when truthy, add a ``'body:<token>'`` feature per
        element of ``body`` (presumably already tokenised -- confirm against
        ``read_post_data``).
    :returns: the populated ``Features`` instance, after ``set_shape()``.
    """
    post_features = Features()

    for post_id, _user_id, post_tags, parent_post_id, body in read_post_data():
        # Only get features for questions, not answers.
        if parent_post_id is not None:
            continue

        post_features.add_item(post_id)

        if post_ids:
            post_features.add_feature(post_id, 'post_id:' + post_id)

        if tags:
            for tag in post_tags:
                post_features.add_feature(post_id, 'tag:' + tag)

        if post_text:
            for token in body:
                post_features.add_feature(post_id, 'body:' + token)

    post_features.set_shape()

    return post_features
示例#4
0
def read_post_features(tags, post_ids, post_text):
    """Collect item features for posts that have no parent post.

    Iterates the ``(post_id, user_id, post_tags, parent_post_id, body)``
    tuples produced by ``read_post_data()``; records with a non-``None``
    ``parent_post_id`` are skipped entirely.

    :param tags: if truthy, emit ``'tag:<tag>'`` for each of the post's tags.
    :param post_ids: if truthy, emit ``'post_id:<post_id>'``.
    :param post_text: if truthy, emit ``'body:<token>'`` for each element of
        ``body`` (presumably a token sequence -- confirm against
        ``read_post_data``).
    :returns: the ``Features`` container after ``set_shape()`` was called.
    """
    result = Features()
    add_feature = result.add_feature

    for record in read_post_data():
        post_id, _author_id, post_tags, parent_post_id, body = record

        # Only get features for questions, not answers.
        is_question = parent_post_id is None
        if not is_question:
            continue

        result.add_item(post_id)

        if post_ids:
            add_feature(post_id, 'post_id:' + post_id)

        if tags:
            for tag in post_tags:
                add_feature(post_id, 'tag:' + tag)

        if post_text:
            for token in body:
                add_feature(post_id, 'body:' + token)

    result.set_shape()

    return result
示例#5
0
def read_movie_features(titles=False, genres=False, genome_tag_threshold=1.0, tag_popularity_threshold=30):
    """Build a ``Features`` object for the movies listed in ``movies.dat``.

    Each line of ``DATA_DIR/movies.dat`` is split on ``SEPARATOR`` into an
    item id, a title and a ``|``-separated genre list. Every movie is
    registered as an item and its title recorded via ``add_title``.

    :param titles: when truthy, add a ``"title:<lowercased title>"`` feature.
    :param genres: when truthy, add a ``"genre:<lowercased genre>"`` feature
        per genre.
    :param genome_tag_threshold: minimum relevance a genome tag needs in
        order to be added as a ``"genome:<lowercased tag>"`` feature.
    :param tag_popularity_threshold: currently unused; it is only referenced
        by the disabled user-tag code kept as a comment below.
    :returns: the populated ``Features`` instance, after ``set_shape()``.
    """
    movie_features = Features()
    movies_path = os.path.join(DATA_DIR, "movies.dat")

    with open(movies_path, "r") as movie_file:
        for record in movie_file:
            movie_id, title, genre_field = record.split(SEPARATOR)

            movie_features.add_item(movie_id)

            if genres:
                for genre in genre_field.split("|"):
                    # The last genre still carries the line's newline.
                    genre_name = genre.lower().replace("\n", "")
                    movie_features.add_feature(movie_id, "genre:" + genre_name)

            if titles:
                movie_features.add_feature(movie_id, "title:" + title.lower())

            movie_features.add_title(movie_id, title)

    for movie_id, tag, relevance in read_genome_tags():
        # Do not include any tags for movies not in the 10M dataset
        is_relevant = relevance >= genome_tag_threshold
        if is_relevant and movie_id in movie_features.item_ids:
            movie_features.add_feature(movie_id, "genome:" + tag.lower())

    # Tags applied by users (disabled):
    ## for iid, tag, count in read_tags():
    ##     if count >= tag_popularity_threshold and iid in features.item_ids:
    ##         features.add_feature(iid, 'tag:' + tag)

    movie_features.set_shape()

    return movie_features
示例#6
0
def read_movie_features(titles=False,
                        genres=False,
                        genome_tag_threshold=1.0,
                        tag_popularity_threshold=30):
    """Read movie metadata and genome tags into a ``Features`` container.

    ``DATA_DIR/movies.dat`` is read line by line; each line is split on
    ``SEPARATOR`` into ``(item id, title, genre list)``. Every movie is added
    as an item and its title is stored with ``add_title``. Genome tags from
    ``read_genome_tags()`` are then attached to movies already registered.

    :param titles: if truthy, emit ``'title:<lowercased title>'``.
    :param genres: if truthy, emit ``'genre:<lowercased genre>'`` for each
        ``|``-separated genre.
    :param genome_tag_threshold: a genome tag is emitted as
        ``'genome:<lowercased tag>'`` only if its relevance is at least this.
    :param tag_popularity_threshold: not used by the active code; only the
        commented-out user-tag loop below refers to it.
    :returns: the ``Features`` container after ``set_shape()`` was called.
    """
    result = Features()
    add_feature = result.add_feature

    with open(os.path.join(DATA_DIR, 'movies.dat'), 'r') as source:
        for raw_line in source:
            fields = raw_line.split(SEPARATOR)
            (iid, title, genre_field) = fields

            result.add_item(iid)

            if genres:
                for raw_genre in genre_field.split('|'):
                    # Lowercase and drop the newline left on the final genre.
                    add_feature(
                        iid, 'genre:' + raw_genre.lower().replace('\n', ''))

            if titles:
                add_feature(iid, 'title:' + title.lower())

            result.add_title(iid, title)

    for iid, tag, relevance in read_genome_tags():
        # Do not include any tags for movies not in the 10M dataset
        if relevance >= genome_tag_threshold:
            if iid in result.item_ids:
                add_feature(iid, 'genome:' + tag.lower())

    # Tags applied by users (disabled):
    ## for iid, tag, count in read_tags():
    ##     if count >= tag_popularity_threshold and iid in features.item_ids:
    ##         features.add_feature(iid, 'tag:' + tag)

    result.set_shape()

    return result