示例#1
0
def RunMLCollector(month, day):
    filename = config.GetSetsFileName(month, day, collection_type)
    sets = config.ReadJSON(filename)

    times = []
    for set_n in sets:
        for time in set_n['runtimes']:
            times.append(time)

    sorted_times = sorted(times)

    for sorted_time in sorted_times:
        for set_n in sets:
            if sorted_time in set_n['runtimes']:
                timer = sorted_time
                timex = datetime.datetime.strptime(timer, '%Y-%m-%d %H:%M:%S')
                if timex > datetime.datetime.now():
                    delta = int(
                        (timex - datetime.datetime.now()).total_seconds())
                    print('Waiting', delta, 'seconds. Next Up', set_n['name'],
                          set_n['call'], timex)
                    sleeper.sleep(delta)

                    # Set API calls
                    if set_n['call'] == 1:
                        collection.FilterStatusByLocation(set_n)
                    else:
                        set_n['call'] = collection.GetUpdatedStatuses(set_n)
                    calltime = datetime.datetime.now()
                    set_n['call_times'].append(calltime)
                    config.WriteJSON(sets, filename)
                else:
                    print('skipping old runtime.')
示例#2
0
def PreviewSchedule(month, day, collection_type):

    sets = config.ReadJSON(config.GetSetsFileName(month, day, collection_type))

    print("\n")
    print("Preview Schedule")
    print("____________________________________________")
    print("Interval Name:", sets[0]['interval_name'])
    print("Interval Type:", sets[0]['interval_type'])
    print("Total Intervals:", sets[0]['total_intervals'])
    print("Interval Number:", sets[0]['interval'])

    times = []
    for set_n in sets:
        for time in set_n['runtimes']:
            times.append(time)

    print('\n')
    print("Number Sets:", len(sets))
    print("Number Calls:", len(sets[0]['runtimes']))
    print("Current Call:", sets[0]['call'])
    print('\n')

    sorted_times = sorted(times)

    call_copy = 1
    for sorted_time in sorted_times:
        for set_n in sets:
            if sorted_time in set_n['runtimes']:
                timer = sorted_time
                timex = datetime.datetime.strptime(timer, '%Y-%m-%d %H:%M:%S')
                print('Set:', set_n['set'], ' Call:', call_copy, 'Runtime:',
                      timer)
                call_copy += 1
示例#3
0
def ExtractTweets(month,
                  days,
                  sets,
                  calls,
                  num_tweets,
                  collection_type,
                  clean=True,
                  useWeights=True,
                  wrapEntities=True):
    # Change days to interval days to pull from multiple months.
    # Determine file combos
    file_combos = []
    for i in days:
        for j in sets:
            for k in calls:
                file_combo = (i, j, k)
                file_combos.append(file_combo)

    # Get statuses for all combos
    statusCollection = []
    for file_combo in file_combos:
        day_num = file_combo[0]
        set_name = file_combo[1]
        call_num = file_combo[2]
        filename = config.GetTweetFileName(month, day_num, set_name, call_num,
                                           collection_type)
        if filename != '':
            # Add in sample data
            status = config.ReadJSON(filename)
            filename = config.GetSetsFileName(month, day_num, collection_type)
            sets = config.ReadJSON(filename)
            # Validate numtweets
            if num_tweets <= 0:
                num_tweets = 10
            elif num_tweets > len(status):
                num_tweets = len(status)
            # Add each set attribute
            status = status[:num_tweets]
            for tweet in status:
                tweet['day'] = day_num
                tweet['set'] = set_name
                tweet['call'] = call_num
                offset = datetime.timedelta(hours=5)
                time = sets[set_name - 1]['call_times'][call_num - 1]
                tweet['calltime'] = time
                if useWeights == True:
                    influence_score = (
                        tweet['favorite_count'] *
                        config.GetWeights()['favorite_count']) + (
                            tweet['retweet_count'] *
                            config.GetWeights()['retweet_count']) + (
                                tweet['user']['followers_count'] *
                                config.GetWeights()['followers_count']) + (
                                    tweet['user']['listed_count'] *
                                    config.GetWeights()['listed_count'])
                else:
                    influence_score = tweet['favorite_count'] + tweet[
                        'retweet_count']

                tweet['influence_score'] = influence_score

            statusCollection.append(status)

    # Load each into dict
    count = 0
    status_dict = {}
    for statuses in statusCollection:
        for status in statuses:
            status_dict[count] = status
            count += 1

    # Get sorted df from dicts
    tweets = pd.DataFrame(status_dict).T
    # tweets = tweets.sort_values(by=['influence_score'],ascending=False)

    if clean:
        tweets = CleanTweets(tweets)

    if wrapEntities:
        tweets = WrapEntities(tweets)

    return tweets
示例#4
0
def MakeSets(year, month, day, collection_interval, interval_identifier,
             user_schedule, collection_type, boundaries):

    for interval in range(1, collection_interval + 1):

        # Get start times from schedule
        num_sets = len(user_schedule.columns)
        num_calls = len(user_schedule.index)
        cols = list(user_schedule.columns)
        all_times = []
        for col in cols:
            set_runtimes = user_schedule[col]
            for set_runtime in set_runtimes:
                # Create starting time for day. All times midnight to start time get called on following day but recorded for set start day.
                a = datetime.time(hour=6)
                if set_runtime < a:
                    StartDate = datetime.datetime(year, month, day + 1)
                else:
                    StartDate = datetime.datetime(year, month, day)
                runtime = datetime.datetime.combine(StartDate, set_runtime)
                all_times.append(runtime)

        # Create sets obj
        sets = []
        j = 0
        for i in range(1, num_sets + 1):
            k = j + num_calls
            name = 'set_' + str(i)
            runs = all_times[j:k]
            day = day
            set_x = {
                'interval_name': interval_identifier,
                'total_intervals': collection_interval,
                'collection_type': collection_type,
                'interval': interval,
                'year': year,
                'month': month,
                'day': day,
                'name': name,
                'set': i,
                'call': 1,
                'runtimes': runs,
                'call_times': [],
                'boundaries': boundaries
            }
            sets.append(set_x)
            j += num_calls

        # Add month and day folders if needed
        APP_ROOT = os.path.dirname(os.path.abspath(__file__))
        tweets_folder = config.GetTweetsFolder().replace('../', '')
        MONTH_ROOT = APP_ROOT.replace(
            'scripts',
            tweets_folder + '/' + collection_type + '/M' + str(month))
        if not os.path.isdir(MONTH_ROOT):
            os.mkdir(MONTH_ROOT)

        DAY_ROOT = MONTH_ROOT + "/D" + str(day)
        if not os.path.isdir(DAY_ROOT):
            os.mkdir(DAY_ROOT)

        config.WriteJSON(sets,
                         config.GetSetsFileName(month, day, collection_type))

        # increment day. if day is in next month, increment month, set day to 1
        # Add year by EOY!
        day += 1
        try:
            datetime.datetime(year, month, day)
        except ValueError as e:
            if str(e) == "day is out of range for month":
                month += 1
                day = 1
            try:
                datetime.datetime(year, month, day)
            except ValueError as e:
                if str(e) == "month must be in 1..12":
                    month = 1
示例#5
0
def RunNWCollector(month, day, localPlaces):
    import config

    filename = config.GetSetsFileName(month, day, collection_type)
    sets = config.ReadJSON(filename)

    # localPlaces = ['Erie, PA','Wesleyville, PA','Harborcreek, PA','Lawrence Park, PA']

    times = []
    for set_n in sets:
        for time in set_n['runtimes']:
            times.append(time)

    sorted_times = sorted(times)

    for sorted_time in sorted_times:
        for set_n in sets:
            if sorted_time in set_n['runtimes']:
                timer = sorted_time
                timex = datetime.datetime.strptime(timer, '%Y-%m-%d %H:%M:%S')
                if timex > datetime.datetime.now():
                    delta = int(
                        (timex - datetime.datetime.now()).total_seconds())
                    print('Waiting', delta, 'seconds. Next Up', set_n['name'],
                          set_n['call'], timex)
                    sleeper.sleep(delta)

                    # Get tweets
                    collection.FilterStatusByLocation(set_n)

                    # Update sets
                    calltime = datetime.datetime.now()
                    set_n['call_times'].append(calltime)
                    config.WriteJSON(sets, filename)

                    # Check each user, if in Erie write to users and get friends/followers
                    tweets = config.ReadJSON(
                        config.GetTweetFileName(set_n['month'], set_n['day'],
                                                set_n['set'], set_n['call']))
                    newUsers = []
                    for tweet in tweets:
                        influence_score = 0
                        if tweet['place'] and tweet['place'][
                                'full_name'] in localPlaces:
                            influence_score = (
                                tweet['user']['followers_count'] *
                                config.GetWeights()['followers_count']) + (
                                    tweet['user']['listed_count'] *
                                    config.GetWeights()['listed_count'])
                            tweet['user']['influence_score'] = influence_score
                            newUsers.append(tweet['user'])

                    # Write each user to file, get friends, followers
                    for user in newUsers:
                        filename = config.GetUserFileName(user['id_str'])
                        config = Path(filename)
                        if config.is_file():
                            # Update this - need the most current version but not if user is in this set
                            print('User', user['id_str'],
                                  'already exists. Skipping for now.')
                        else:
                            print('Writing user', user['id_str'])
                            config.WriteJSON(user, filename)
                            collection.GetFriends(user['id_str'])
                            collection.GetFollowers(user['id_str'])
                else:
                    print('skipping old runtime.')