def read_user_features(about, user_ids, user_id_mapping):
    """Build the user-side ``Features`` object.

    Args:
        about: When truthy, every id in ``user_id_mapping`` receives an
            ``'intercept'`` feature, and each ``(user_id, about_me)`` pair
            yielded by ``read_user_data()`` contributes one
            ``'about:<token>'`` feature per token of its cleaned text.
        user_ids: When truthy, every id in ``user_id_mapping`` receives a
            ``'user_id:<uid>'`` indicator feature.
        user_id_mapping: Iterable of user ids; also stored verbatim as
            ``item_ids`` on the returned object. Ids are concatenated onto a
            string prefix, so they are presumably ``str`` -- TODO confirm
            against the caller.

    Returns:
        The populated ``Features`` instance, after ``set_shape()`` has been
        called on it.
    """
    user_features = Features()
    user_features.item_ids = user_id_mapping

    if user_ids:
        for uid in user_id_mapping:
            user_features.add_feature(uid, 'user_id:' + uid)

    if about:
        # Give every known user an 'intercept' feature before adding the
        # text-derived ones.
        for uid in user_id_mapping:
            user_features.add_feature(uid, 'intercept')

        for user_id, about_me in read_user_data():
            # Strip markup, flatten newlines to spaces, lowercase, then hand
            # the text to the shared post-body processing helper.
            text = strip_tags(about_me)
            text = text.replace('\n', ' ')
            text = text.lower()
            tokens = _process_post_body(text)

            for token in tokens:
                user_features.add_feature(user_id, 'about:' + token)

    user_features.set_shape()
    return user_features
def read_user_features(about, user_ids, user_id_mapping):
    """Assemble per-user features into a ``Features`` object.

    Args:
        about: If truthy, add an ``'intercept'`` feature for each id in
            ``user_id_mapping`` and ``'about:<token>'`` features derived
            from the ``about_me`` text of every record returned by
            ``read_user_data()``.
        user_ids: If truthy, add a ``'user_id:<uid>'`` feature for each id
            in ``user_id_mapping``.
        user_id_mapping: Collection of user ids to iterate over; it is also
            assigned to ``item_ids`` of the result. NOTE(review): ids are
            joined to a ``str`` prefix with ``+``, so they look like
            strings -- confirm.

    Returns:
        The ``Features`` object once ``set_shape()`` has been invoked.
    """
    result = Features()
    result.item_ids = user_id_mapping

    if user_ids:
        id_prefix = 'user_id:'
        for uid in user_id_mapping:
            result.add_feature(uid, id_prefix + uid)

    if about:
        # Intercept feature for every user in the mapping.
        for uid in user_id_mapping:
            result.add_feature(uid, 'intercept')

        about_prefix = 'about:'
        for user_id, about_me in read_user_data():
            # Markup removed, newlines turned into spaces, all lowercase.
            normalized = strip_tags(about_me).replace('\n', ' ').lower()

            for token in _process_post_body(normalized):
                result.add_feature(user_id, about_prefix + token)

    result.set_shape()
    return result
def read_post_features(tags, post_ids, post_text):
    """Build the post-side ``Features`` object from ``read_post_data()``.

    Only records whose ``parent_post_id`` is ``None`` (questions, not
    answers) are registered as items and given features.

    Args:
        tags: When truthy, add one ``'tag:<tag>'`` feature per tag of the
            post.
        post_ids: When truthy, add a ``'post_id:<post_id>'`` indicator
            feature.
        post_text: When truthy, add one ``'body:<token>'`` feature per
            element of the post body. The body is iterated directly, so it
            is presumably already tokenized -- TODO confirm against
            ``read_post_data``.

    Returns:
        The populated ``Features`` instance, after ``set_shape()`` has been
        called on it.
    """
    post_features = Features()

    for record in read_post_data():
        post_id, _user_id, post_tags, parent_post_id, body = record

        # Only get features for questions, not answers.
        if parent_post_id is not None:
            continue

        post_features.add_item(post_id)

        if post_ids:
            post_features.add_feature(post_id, 'post_id:' + post_id)

        if tags:
            for tag in post_tags:
                post_features.add_feature(post_id, 'tag:' + tag)

        if post_text:
            for token in body:
                post_features.add_feature(post_id, 'body:' + token)

    post_features.set_shape()
    return post_features
def read_movie_features(titles=False, genres=False, genome_tag_threshold=1.0,
                        tag_popularity_threshold=30):
    """Build the movie-side ``Features`` object.

    Movies are read line by line from ``movies.dat`` under ``DATA_DIR``;
    each line is split on ``SEPARATOR`` into id, title and a ``|``-separated
    genre field. Every movie is registered as an item and has its title
    recorded via ``add_title``. Genome tags from ``read_genome_tags()`` are
    then added for movies that were registered.

    Args:
        titles: When truthy, add a ``"title:<lowercased title>"`` feature
            per movie.
        genres: When truthy, add a ``"genre:<lowercased genre>"`` feature
            per genre, with newline characters removed.
        genome_tag_threshold: A genome tag is added only if its relevance
            is ``>=`` this value.
        tag_popularity_threshold: Referenced only by the disabled user-tag
            code below; no live code in this function reads it.

    Returns:
        The populated ``Features`` instance, after ``set_shape()`` has been
        called on it.
    """
    movie_features = Features()
    movies_path = os.path.join(DATA_DIR, "movies.dat")

    with open(movies_path, "r") as movie_file:
        for row in movie_file:
            movie_id, title, genre_field = row.split(SEPARATOR)
            movie_features.add_item(movie_id)

            if genres:
                for genre in genre_field.split("|"):
                    # The last genre on a line still carries the line's
                    # trailing newline, hence the replace.
                    genre_name = genre.lower().replace("\n", "")
                    movie_features.add_feature(movie_id, "genre:" + genre_name)

            if titles:
                movie_features.add_feature(movie_id, "title:" + title.lower())

            movie_features.add_title(movie_id, title)

    for movie_id, tag, relevance in read_genome_tags():
        if relevance >= genome_tag_threshold:
            # Do not include any tags for movies not in the 10M dataset
            if movie_id in movie_features.item_ids:
                movie_features.add_feature(movie_id, "genome:" + tag.lower())

    # Tags applied by users (disabled)
    ## for movie_id, tag, count in read_tags():
    ##     if (count >= tag_popularity_threshold
    ##             and movie_id in movie_features.item_ids):
    ##         movie_features.add_feature(movie_id, 'tag:' + tag)

    movie_features.set_shape()
    return movie_features
def read_movie_features(titles=False, genres=False, genome_tag_threshold=1.0,
                        tag_popularity_threshold=30):
    """Assemble per-movie features into a ``Features`` object.

    Each line of ``movies.dat`` (located under ``DATA_DIR``) is split on
    ``SEPARATOR`` into a movie id, a title and a ``|``-separated genre
    string. All movies are added as items and their titles stored with
    ``add_title``; genome tags returned by ``read_genome_tags()`` are
    attached afterwards to movies present in ``item_ids``.

    Args:
        titles: If truthy, add a ``'title:<lowercased title>'`` feature.
        genres: If truthy, add a ``'genre:<lowercased genre>'`` feature for
            each genre, stripping newline characters.
        genome_tag_threshold: Minimum relevance (inclusive) a genome tag
            needs in order to be added.
        tag_popularity_threshold: Only mentioned in the commented-out
            user-tag code below; currently has no effect.

    Returns:
        The ``Features`` object once ``set_shape()`` has been invoked.
    """
    result = Features()

    with open(os.path.join(DATA_DIR, 'movies.dat'), 'r') as handle:
        for raw_line in handle:
            fields = raw_line.split(SEPARATOR)
            (movie_id, movie_title, raw_genres) = fields
            genre_names = raw_genres.split('|')

            result.add_item(movie_id)

            if genres:
                for name in genre_names:
                    # The final genre keeps the line's trailing newline, so
                    # it is removed here.
                    result.add_feature(
                        movie_id, 'genre:' + name.lower().replace('\n', ''))

            if titles:
                result.add_feature(movie_id, 'title:' + movie_title.lower())

            result.add_title(movie_id, movie_title)

    for movie_id, genome_tag, relevance in read_genome_tags():
        # Do not include any tags for movies not in the 10M dataset
        if relevance >= genome_tag_threshold and movie_id in result.item_ids:
            result.add_feature(movie_id, 'genome:' + genome_tag.lower())

    # Tags applied by users (disabled)
    ## for movie_id, tag, count in read_tags():
    ##     if count >= tag_popularity_threshold and movie_id in result.item_ids:
    ##         result.add_feature(movie_id, 'tag:' + tag)

    result.set_shape()
    return result