def parse_dataset(papers_feature_window, feature_begin, feature_end, counter):
    """Extract the bibliographic relations for papers inside the feature window.

    Selects the papers whose sort position falls in the half-open window
    (feature_begin, feature_end] — `papers_feature_window` is assumed sorted so
    that `bisect` with sentinel `Paper` objects locates the slice — and builds
    four sparse relation matrices:

      W: author  -> paper   (writes)
      C: paper   -> paper   (cites)
      I: paper   -> term    (includes)
      P: paper   -> venue   (published in)

    Matrix dimensions come from the entity totals in `counter`.
    Returns the tuple (W, C, I, P).
    """
    logging.info('parsing dataset ...')

    # Locate the window slice with sentinel Paper objects; bisect_right on the
    # lower bound makes the window exclusive at feature_begin and inclusive at
    # feature_end.
    lo = bisect.bisect_right(papers_feature_window, Paper(feature_begin))
    hi = bisect.bisect_right(papers_feature_window, Paper(feature_end))
    window = papers_feature_window[lo:hi]

    # Collect the (row, col) index pairs for each relation.
    write = [(author_id, p.id) for p in window for author_id in p.authors]
    cite = [(ref_id, p.id) for p in window for ref_id in p.references]
    include = [(p.id, term_id) for p in window for term_id in p.terms]
    published = [(p.id, p.venue) for p in window]

    W = create_sparse(write, counter['author'], counter['paper'])
    C = create_sparse(cite, counter['paper'], counter['paper'])
    I = create_sparse(include, counter['paper'], counter['term'])
    P = create_sparse(published, counter['paper'], counter['venue'])
    return W, C, I, P
def parse_dataset(usr_dataset, usr_bm_tg, feature_begin, feature_end, indexer):
    """Extract user/bookmark/tag relations inside the feature window.

    Both inputs are lists of tab-separated text rows with a header row.
    Timestamps are stored in milliseconds in the raw data, so they are
    divided by 1000 before comparison; a row is kept when its timestamp
    lies in the half-open window (feature_begin, feature_end].

    Returns three sparse matrices built by `create_sparse`:
      contact: user -> user       (user1 contacted user2)
      save:    user -> bookmark   (user saved bookmark)
      attach:  tag  -> bookmark   (tag attached to bookmark)
    """
    logging.info('parsing dataset ...')
    contact = []
    save = []
    attach = []

    # user-user contact events; index 0 skips the header row
    for row in usr_dataset[1:]:
        fields = row.split('\t')
        ts = float(fields[2]) / 1000  # milliseconds -> seconds
        if feature_begin < ts <= feature_end:
            u1 = indexer.get_index('user', fields[0])
            u2 = indexer.get_index('user', fields[1])
            contact.append((u1, u2))

    # user-tag-bookmark assignments in the same window
    for row in usr_bm_tg[1:]:
        fields = row.split('\t')
        ts = float(fields[3]) / 1000
        if feature_begin < ts <= feature_end:
            user = indexer.get_index('user', fields[0])
            bookmark = indexer.get_index('bookmark', fields[1])
            tag = indexer.get_index('tag', fields[2])
            save.append((user, bookmark))
            attach.append((tag, bookmark))

    n_users = indexer.indices['user']
    n_tags = indexer.indices['tag']
    n_bookmarks = indexer.indices['bookmark']

    contact_sparse = create_sparse(contact, n_users, n_users)
    save_sparse = create_sparse(save, n_users, n_bookmarks)
    attach_sparse = create_sparse(attach, n_tags, n_bookmarks)
    return contact_sparse, save_sparse, attach_sparse
def parse_dataset(user_rates_movies_ds, user_tags_movies_ds, movie_actor_ds,
                  movie_director_ds, movie_genre_ds, movie_countries_ds,
                  feature_begin, feature_end, indexer):
    """Extract the MovieLens-style relations inside the feature window.

    All inputs are lists of tab-separated text rows with a header row.
    Timestamped relations (rate, attach) keep only rows whose timestamp
    (milliseconds in the raw files, divided by 1000) falls in the half-open
    window (feature_begin, feature_end]; static relations (actor, director,
    genre, country) are taken whole. Uses the module-level thresholds
    `rating_threshold` (minimum score for a rate edge) and `actor_threshold`
    (maximum billing rank for a played_by edge).

    Returns six sparse matrices built by `create_sparse`:
      rate:        user  -> movie
      attach:      tag   -> movie
      played_by:   movie -> actor
      directed_by: movie -> director
      has:         movie -> genre
      produced_in: movie -> country
    """
    logging.info('parsing dataset ...')
    rate = []
    attach = []
    played_by = []

    # user rates movie: keep only high-enough ratings in the window.
    # NOTE(review): unlike the loops below, no None check is done on the
    # looked-up indices here — presumably get_index creates them on demand
    # for these entity kinds; confirm against the indexer implementation.
    for row in user_rates_movies_ds[1:]:  # skip header
        fields = row.split('\t')
        score = float(fields[2])
        ts = float(fields[3]) / 1000  # milliseconds -> seconds
        if feature_begin < ts <= feature_end and score > rating_threshold:
            user = indexer.get_index('user', fields[0])
            movie = indexer.get_index('movie', fields[1])
            rate.append((user, movie))

    # tag attached to movie (the user column is ignored on purpose)
    for row in user_tags_movies_ds[1:]:
        fields = row.split('\t')
        ts = float(fields[3]) / 1000
        if feature_begin < ts <= feature_end:
            movie = indexer.get_index('movie', fields[1])
            tag = indexer.get_index('tag', fields[2])
            attach.append((tag, movie))

    # movie played_by actor: only top-billed actors (rank below threshold)
    for row in movie_actor_ds[1:]:
        fields = row.split('\t')
        if int(fields[3]) < actor_threshold:
            movie = indexer.get_index('movie', fields[0])
            actor = indexer.get_index('actor', fields[1])
            if movie is not None and actor is not None:
                played_by.append((movie, actor))

    def _movie_links(rows, entity):
        # (movie, entity) pairs for rows where both ids resolve to an index
        pairs = []
        for row in rows:
            fields = row.split('\t')
            movie = indexer.get_index('movie', fields[0])
            other = indexer.get_index(entity, fields[1])
            if movie is not None and other is not None:
                pairs.append((movie, other))
        return pairs

    directed_by = _movie_links(movie_director_ds[1:], 'director')
    has = _movie_links(movie_genre_ds[1:], 'genre')
    produced_in = _movie_links(movie_countries_ds[1:], 'country')

    n_users = indexer.indices['user']
    n_tags = indexer.indices['tag']
    n_movies = indexer.indices['movie']
    n_actors = indexer.indices['actor']
    n_directors = indexer.indices['director']
    n_genres = indexer.indices['genre']
    n_countries = indexer.indices['country']

    rate_sparse = create_sparse(rate, n_users, n_movies)
    attach_sparse = create_sparse(attach, n_tags, n_movies)
    played_by_sparse = create_sparse(played_by, n_movies, n_actors)
    directed_by_sparse = create_sparse(directed_by, n_movies, n_directors)
    has_genre_sparse = create_sparse(has, n_movies, n_genres)
    produced_in_sparse = create_sparse(produced_in, n_movies, n_countries)
    return (rate_sparse, attach_sparse, played_by_sparse, directed_by_sparse,
            has_genre_sparse, produced_in_sparse)