Пример #1
0
def parse_dataset(papers_feature_window, feature_begin, feature_end, counter):
    """Build the relation structures for papers inside the feature window.

    The window is located by bisecting ``papers_feature_window`` with two
    sentinel ``Paper`` objects built from ``feature_begin`` and
    ``feature_end``.  Because ``bisect_right`` is used on both ends, the
    selected slice holds the papers that compare greater than the lower
    sentinel and not greater than the upper one.
    NOTE(review): this presumes the list is already sorted under ``Paper``'s
    own ordering -- confirm against the caller.

    Args:
        papers_feature_window: sequence of ``Paper`` objects; each one is
            read through its ``id``, ``authors``, ``references``, ``terms``
            and ``venue`` attributes.
        feature_begin: value used to build the lower (exclusive) sentinel.
        feature_end: value used to build the upper (inclusive) sentinel.
        counter: mapping read under the keys ``'author'``, ``'paper'``,
            ``'venue'`` and ``'term'``; the values are handed to
            ``create_sparse`` (presumably as matrix dimensions).

    Returns:
        Tuple ``(W, C, I, P)`` with the ``create_sparse`` results for the
        (author, paper), (referenced paper, citing paper), (paper, term)
        and (paper, venue) pair lists, in that order.
    """
    logging.info('parsing dataset ...')

    lower_sentinel = Paper(feature_begin)
    upper_sentinel = Paper(feature_end)
    start = bisect.bisect_right(papers_feature_window, lower_sentinel)
    stop = bisect.bisect_right(papers_feature_window, upper_sentinel)

    author_paper = []
    reference_paper = []
    paper_term = []
    paper_venue = []

    for paper in papers_feature_window[start:stop]:
        author_paper.extend((author, paper.id) for author in paper.authors)
        # The referenced paper comes first, the citing paper second.
        reference_paper.extend((ref, paper.id) for ref in paper.references)
        paper_term.extend((paper.id, term) for term in paper.terms)
        paper_venue.append((paper.id, paper.venue))

    n_authors = counter['author']
    n_papers = counter['paper']
    n_venues = counter['venue']
    n_terms = counter['term']

    return (
        create_sparse(author_paper, n_authors, n_papers),
        create_sparse(reference_paper, n_papers, n_papers),
        create_sparse(paper_term, n_papers, n_terms),
        create_sparse(paper_venue, n_papers, n_venues),
    )
Пример #2
0
def parse_dataset(usr_dataset, usr_bm_tg, feature_begin, feature_end, indexer):
    """Extract user/bookmark/tag relations that fall in the feature window.

    Both inputs are sequences of tab-separated text lines whose first entry
    is a header and is skipped.  A row is kept only when its timestamp
    column, divided by 1000, satisfies
    ``feature_begin < timestamp <= feature_end``.
    NOTE(review): the division by 1000 presumably converts milliseconds to
    seconds -- confirm against the dataset description.

    Args:
        usr_dataset: lines with two user ids in columns 0 and 1 and the
            contact timestamp in column 2.
        usr_bm_tg: lines with user id, bookmark id and tag id in columns
            0-2 and the assignment timestamp in column 3.
        feature_begin: exclusive lower bound of the window.
        feature_end: inclusive upper bound of the window.
        indexer: object providing ``get_index(kind, raw_id)`` and an
            ``indices`` mapping read under ``'user'``, ``'tag'`` and
            ``'bookmark'``.

    Returns:
        Tuple of the ``create_sparse`` results for the (user, user) contact
        pairs, the (user, bookmark) save pairs and the (tag, bookmark)
        attach pairs, in that order.
    """
    logging.info('parsing dataset ...')

    user_user = []
    user_bookmark = []
    tag_bookmark = []

    # Contact rows: user-to-user links observed inside the window.
    for row in usr_dataset[1:]:
        fields = row.split('\t')
        contacted_at = float(fields[2]) / 1000
        if not (feature_begin < contacted_at <= feature_end):
            continue
        first_user = indexer.get_index('user', fields[0])
        second_user = indexer.get_index('user', fields[1])
        user_user.append((first_user, second_user))

    # Tagging rows: each one yields a user->bookmark and a tag->bookmark pair.
    for row in usr_bm_tg[1:]:
        fields = row.split('\t')
        tagged_at = float(fields[3]) / 1000
        if not (feature_begin < tagged_at <= feature_end):
            continue
        user_idx = indexer.get_index('user', fields[0])
        bookmark_idx = indexer.get_index('bookmark', fields[1])
        tag_idx = indexer.get_index('tag', fields[2])
        user_bookmark.append((user_idx, bookmark_idx))
        tag_bookmark.append((tag_idx, bookmark_idx))

    n_users = indexer.indices['user']
    n_tags = indexer.indices['tag']
    n_bookmarks = indexer.indices['bookmark']

    return (
        create_sparse(user_user, n_users, n_users),
        create_sparse(user_bookmark, n_users, n_bookmarks),
        create_sparse(tag_bookmark, n_tags, n_bookmarks),
    )
Пример #3
0
def parse_dataset(user_rates_movies_ds, user_tags_movies_ds, movie_actor_ds,
                  movie_director_ds, movie_genre_ds, movie_countries_ds,
                  feature_begin, feature_end, indexer):
    """Extract the movie-centred relations used for feature extraction.

    Every dataset is a sequence of tab-separated text lines whose first
    entry is a header and is skipped.  Ratings and tag assignments are
    limited to the window ``feature_begin < timestamp <= feature_end``,
    where the timestamp column is divided by 1000 first (the dataset stores
    it in milliseconds).  The actor, director, genre and country datasets
    are not filtered by time.

    Args:
        user_rates_movies_ds: lines with user id, movie id, rating and
            timestamp in columns 0-3; only ratings above the module-level
            ``rating_threshold`` are kept.
        user_tags_movies_ds: lines with movie id, tag id and timestamp in
            columns 1-3 (column 0 is not read).
        movie_actor_ds: lines with movie id and actor id in columns 0-1 and
            a ranking in column 3; only rankings below the module-level
            ``actor_threshold`` are kept.
        movie_director_ds: lines with movie id and director id in columns
            0-1.
        movie_genre_ds: lines with movie id and genre in columns 0-1.
        movie_countries_ds: lines with movie id and country in columns 0-1.
        feature_begin: exclusive lower bound of the window.
        feature_end: inclusive upper bound of the window.
        indexer: object providing ``get_index(kind, raw_id)`` and an
            ``indices`` mapping read under ``'user'``, ``'tag'``,
            ``'movie'``, ``'actor'``, ``'director'``, ``'genre'`` and
            ``'country'``.

    Returns:
        Tuple of the ``create_sparse`` results for the (user, movie) rate,
        (tag, movie) attach, (movie, actor), (movie, director),
        (movie, genre) and (movie, country) pair lists, in that order.
    """
    logging.info('parsing dataset ...')

    def movie_pairs(rows, kind):
        """Collect (movie, <kind>) index pairs from columns 0 and 1.

        Rows for which either lookup yields ``None`` are dropped.
        """
        pairs = []
        for row in rows[1:]:
            fields = row.split('\t')
            movie_idx = indexer.get_index('movie', fields[0])
            other_idx = indexer.get_index(kind, fields[1])
            if movie_idx is not None and other_idx is not None:
                pairs.append((movie_idx, other_idx))
        return pairs

    # Ratings: keep those inside the window whose value exceeds the threshold.
    user_movie = []
    for row in user_rates_movies_ds[1:]:
        fields = row.split('\t')
        score = float(fields[2])
        rated_at = float(fields[3]) / 1000
        inside_window = feature_begin < rated_at <= feature_end
        if inside_window and score > rating_threshold:
            user_idx = indexer.get_index('user', fields[0])
            movie_idx = indexer.get_index('movie', fields[1])
            user_movie.append((user_idx, movie_idx))

    # Tag assignments inside the window.  Only the tag->movie pair is
    # collected; the user->tag relation is not built here.
    tag_movie = []
    for row in user_tags_movies_ds[1:]:
        fields = row.split('\t')
        tagged_at = float(fields[3]) / 1000
        if not (feature_begin < tagged_at <= feature_end):
            continue
        movie_idx = indexer.get_index('movie', fields[1])
        tag_idx = indexer.get_index('tag', fields[2])
        tag_movie.append((tag_idx, movie_idx))

    # Cast: only actors ranked below the threshold are indexed at all.
    movie_actor = []
    for row in movie_actor_ds[1:]:
        fields = row.split('\t')
        if int(fields[3]) < actor_threshold:
            movie_idx = indexer.get_index('movie', fields[0])
            actor_idx = indexer.get_index('actor', fields[1])
            if movie_idx is not None and actor_idx is not None:
                movie_actor.append((movie_idx, actor_idx))

    movie_director = movie_pairs(movie_director_ds, 'director')
    movie_genre = movie_pairs(movie_genre_ds, 'genre')
    movie_country = movie_pairs(movie_countries_ds, 'country')

    n_users = indexer.indices['user']
    n_tags = indexer.indices['tag']
    n_movies = indexer.indices['movie']
    n_actors = indexer.indices['actor']
    n_directors = indexer.indices['director']
    n_genres = indexer.indices['genre']
    n_countries = indexer.indices['country']

    return (
        create_sparse(user_movie, n_users, n_movies),
        create_sparse(tag_movie, n_tags, n_movies),
        create_sparse(movie_actor, n_movies, n_actors),
        create_sparse(movie_director, n_movies, n_directors),
        create_sparse(movie_genre, n_movies, n_genres),
        create_sparse(movie_country, n_movies, n_countries),
    )