예제 #1
0
def make_list_of_games_to_scrape():
    '''
    Build the list of games that still need to be scraped.

    Starts from every app reported by GameIndexer.return_list_of_all_apps()
    and drops each entry that get_completed_games() reports as done.  The
    list size before and after the scrub is printed.

    Returns:
        A new list holding the pending games in their original order.
    '''

    # every known game is pending until shown to be completed
    all_games = GameIndexer().return_list_of_all_apps()

    # A set gives constant-time membership tests, and filtering against it
    # drops *every* occurrence of a completed game (the old pop-by-index
    # approach removed only the first one and rescanned the list each time).
    # NOTE(review): assumes the entries are hashable app ids -- confirm.
    completed_games = set(get_completed_games())

    print("")
    print("STARTING SIZE OF PENDING GAME LIST: {}".format(len(all_games)))
    print("")

    # completed ids that are not in the full list are simply ignored
    pending_game_list = [
        game for game in all_games if game not in completed_games
    ]

    print("")
    print("SCRUBBED SIZE OF PENDING GAME LIST: {}".format(
        len(pending_game_list)))
    print("")

    return pending_game_list
예제 #2
0
def scrape_to_db(collection, app_id_list, count):
    '''
    Scrape the reviews of every game in <app_id_list> and insert each
    result into <collection>.

    A failure on one game is printed, appended to
    ERROR_selenium_game_review_scrape.txt and then skipped, so the
    remaining games are still processed.

    collection  -- destination handed to insert()
    app_id_list -- iterable of app ids to scrape
    count       -- NOTE(review): currently unused; get_game_reviews() is
                   called with a hard-coded 1150.  Confirm whether that
                   argument is the review count and should be <count>.
    '''

    # used to resolve each app id to its title
    _gameindexer = GameIndexer()

    for app_id in app_id_list:

        # best effort: one bad game must not abort the whole run
        try:
            title = _gameindexer.return_game_title(app_id)

            # go get the game reviews
            game_results = get_game_reviews(app_id, 1150, title)

            insert(collection, game_results, "app_id")

        except Exception as e:
            error = (
                "############################ Exception {} occurred!\n"
                "############################ Scrape of {} failed"
            ).format(e, app_id)

            print(error)

            # append so that earlier failures are not overwritten
            with open("ERROR_selenium_game_review_scrape.txt", "a") as _file:
                _file.write(error + "\n")
예제 #3
0
    def print_filtered_predictions(self, train, test, num_items=10):
        '''
        Remove the matches that are also in the train set
        Add *** HIT *** to predictions that match test set
        Limit printing to num_items lines
        '''
        # keep track of how many lines have been printed
        printed = 0

        # used for resolving appind to title names
        lookup = GameIndexer()

        train_apps = train.pop("appind")

        train_apps = [int(x) for x in train_apps]

        test_apps = test.pop("appind")
        test_apps = [int(x) for x in test_apps]

        for idx, result in enumerate(self.sorted_predictions):
            title = lookup.game_index_to_title(int(result[1]), 40)
            appind = int(result[1])
            # print "\n\n**********************************************"
            # print "result", result[0], appind
            # print "train", sorted(list(train_apps))
            # print " test", sorted(list(test_apps))
            # print "**************************************************"

            if idx == 0:
                print "Rank:    Prediction:  Appind:      Title:"

            hit = ""

            # import ipdb; ipdb.set_trace()

            # if the appind is not in the list of train apps then print it
            # there's no point in printing things from the train set!
            if appind not in train_apps:

                # if we got a match on the test set then we should make that
                # more obvious
                if appind in test_apps:
                    hit = "<--- HIT *****"

                # make sure that we don't print more than we want
                printed += 1
                print "{:2d}       {:2.2f}           {:5d}      {:<40}         {}". \
                format(idx +1,
                        result[0],
                        int(result[1]),
                        title,
                        hit)

            if printed == num_items:
                break
예제 #4
0
    def print_sorted_predictions(self):
        """
        Dump every entry of self.sorted_predictions as a ranked table
        (rank, prediction, appind, title) -- handy while testing.
        """
        indexer = GameIndexer()
        header = "Rank:    Prediction:  Appind:      Title:"
        row_template = "{:2d}       {:2.2f}           {:5d}      {}"

        for rank, prediction in enumerate(self.sorted_predictions, 1):
            app_index = int(prediction[1])
            name = indexer.game_index_to_title(app_index, 40)

            # the header appears once, right before the first row
            if rank == 1:
                print(header)

            print(row_template.format(rank, prediction[0], app_index, name))
예제 #5
0
def update_completed_games(app_id):
    '''
    Record <app_id> as completed so it is not scraped again.

    Appends the id on its own line to games_we_have.txt, then prints how
    many games are left: the number of apps GameIndexer knows about minus
    the number returned by get_completed_games().

    app_id -- id of the finished game; str or int
    '''

    with open("games_we_have.txt", "a") as outfile:
        # format() rather than "+" so an int app_id does not raise TypeError
        outfile.write("{}\n".format(app_id))

    total_games = len(GameIndexer().return_list_of_all_apps())
    remaining = total_games - len(get_completed_games())

    print("added {} to games_we_have.txt with {} remaining.".format(
        app_id, remaining))
def load_game_reviews_into_table(collection):
    '''
    Spark seems to ingest data via a big list and goes from there
    so make a dataframe that looks like

    user | app_id | rating (positive)
    '''
    start_time = time.time()

    game_avgs = load_pandas_df("app_means_v3.csv")
    user_avgs = load_pandas_df("user_avgs.csv")

    ##############################################################
    ## Build dictionary to try to speed up lookups of weights ####
    ##############################################################

    # make dictionaries for different weights
    w_s1_dict = load_weights_to_dict("weights_s1", game_avgs)

    w_s2_dict = load_weights_to_dict("weights_s2", game_avgs)

    w_s3_dict = load_weights_to_dict("weights_s3", game_avgs)

    game_avg_dict = load_weights_to_dict("avg_log_min", game_avgs)

    # user_id : avg_playtime_log_m
    user_avg_dict = load_user_avgs_to_dict("avg_playtime_log_m", user_avgs)

    user_lookup_table = {}
    user_reverse_lookup_table = {}

    # get a GameIndexer ready for lookups
    indexer = GameIndexer()

    num_users = collection.find().count()

    # list to hold dictionaries before conversion to df
    data = []

    for idx, user in enumerate(collection.find()):

        # keep track of users with reviews because the rest of
        # the users we have to go back and give 0's to
        #temp_user_list = []

        # if idx > 10:
        #     break

        _user = idx

        user_lookup_table[idx] = user["user"]
        user_reverse_lookup_table[user["user"]] = idx

        # try to keep track of time some
        _t = time.time() - start_time
        _ts = "{:2.2f}".format(_t)[:6]

        # completed in 46s with mod to reduce printing
        # even without the mod check it was 46s so no savings
        #if idx % 100 == 0:
        print "{}s ### {}th user of {} ###### \r".format(_ts, idx, num_users),

        for idy, playtime in enumerate(user["data"]):
            # if idy > 1000:
            #     break

            _appid = indexer.app_id_to_game_index(int(playtime["appid"]))

            #get weighting of app from game_avgs dataframe.
            # get weighting of a certain app

            # pull the weight from the game_avgs dataframe
            #result = game_avgs[game_avgs["app_id"] == _appid]["weights_s1"]
            try:
                weight_s1 = w_s1_dict[_appid]
                weight_s2 = w_s2_dict[_appid]
                weight_s3 = w_s3_dict[_appid]

                #import pdb; pdb.set_trace()
            except Exception, e:
                #print "Item not in dictionary   {}                       {}         {} ".format(e, repr(_appid), type(_appid))
                weight_s1 = 0.0
                weight_s1 = 0.0
                weight_s1 = 0.0

            # if len(result) > 0:
            #     weight = result.values[0]
            #     # if weight >= 0:
            #     #     if weight < 1:
            #     #         print "weight seems good", weight
            #     # elif weight < 0:
            #     #     print "############## Error, seems like it didn't match {}  correctly".format(repr(_appid))
            #     # else:
            #     #     print "##############{}  Error, seems like it didn't match {}  correctly".format(weight, repr(_appid))
            # else:
            #     weight = 0.0

            # if the weight is below zero then the game probably doesn't have any plays
            # (ie no data)

            # potentially modify this to log time because then the
            # distribution is normal

            # Goodnight sweet prince, going to log10 time now
            # _playtime_m = int(review["playtime_forever"])
            _log_playtime_m = int(playtime["playtime_forever"])

            if _log_playtime_m > 1:
                _log_playtime_m = np.log10(_log_playtime_m + 0.0001)
            else:
                _log_playtime_m = 0

            _lpm_b0s1 = _log_playtime_m * weight_s1
            _lpm_b0s2 = _log_playtime_m * weight_s2
            _lpm_b0s3 = _log_playtime_m * weight_s3
            # modify _log_playtime_m by the weighting of the app to
            # compensate for different app biases (ie low user count/high playtime)
            # or very high user counts
            _log_playtime_m

            data.append({
                "appind": _appid,
                "user": _user,
                "lpm_b0_s0": _log_playtime_m,
                "lpm_b0_s1": _lpm_b0s1,
                "lpm_b0_s2": _lpm_b0s2,
                "lpm_b0_s3": _lpm_b0s3
            })