Example #1
0
def chunk_date_range(start_date: DateTime, interval=1) -> Iterable[Mapping[str, any]]:
    """
    Return consecutive time slices covering start_date through now, each
    `interval` days long.

    The return value is a list of dicts {'oldest': float, 'latest': float}
    (Unix timestamps) which can be used directly with the Slack API.
    """
    # NOTE(review): `any` in the annotation is the builtin; `typing.Any` was
    # probably intended — needs a typing import to change.
    intervals = []
    now = pendulum.now()
    # Each stream_slice covers one `interval`-day window.  Advance by the
    # full window (not one day) so consecutive slices do not overlap when
    # interval > 1; behavior is unchanged for the default interval=1.
    while start_date <= now:
        end = start_date.add(days=interval)
        intervals.append({"oldest": start_date.timestamp(), "latest": end.timestamp()})
        start_date = end

    return intervals
Example #2
0
def round_to_closest_hour(time_data: pendulum.DateTime) -> pendulum.DateTime:
    """Round *time_data* to the nearest hour boundary.

    Minutes 1-30 round down, 31-59 round up.  Sub-minute components
    (seconds, microseconds) are zeroed so the result lands exactly on the
    hour — previously an input like 10:45:59 produced 11:00:59.
    """
    if time_data.minute > 30:
        on_hour = time_data.add(minutes=60 - time_data.minute)
    else:
        # minute == 0 falls through here as a no-op subtraction
        on_hour = time_data.subtract(minutes=time_data.minute)
    # Bug fix: strip seconds/microseconds so the promise of the function
    # name holds for inputs that are not minute-aligned.
    return on_hour.subtract(seconds=on_hour.second, microseconds=on_hour.microsecond)
Example #3
0
 async def populate_paths(self, manifest_cache, entry_id: EntryID,
                          early: DateTime, late: DateTime):
     """Spawn three concurrent tasks resolving paths for *entry_id* in the
     [early, late] time window — presumably filling `manifest_cache`;
     confirm against the populate_* coroutine implementations.
     """
     # TODO : Use future manifest source field to follow files and directories
     async with trio.open_service_nursery() as child_nursery:
         # Source path is looked up one microsecond BEFORE `early`
         # (state just prior to the window opening).
         child_nursery.start_soon(self.populate_source_path, manifest_cache,
                                  entry_id, early.add(microseconds=-1))
         # Destination path at the end of the window.
         child_nursery.start_soon(self.populate_destination_path,
                                  manifest_cache, entry_id, late)
         # Current path at the start of the window.
         child_nursery.start_soon(self.populate_current_path,
                                  manifest_cache, entry_id, early)
Example #4
0
    def _getIntradayPage(self, requestedDate: pendulum.DateTime, siteId: str):
        """Fetch the raw intraday HTML page for *requestedDate* at *siteId*.

        Returns the HTTP response object unprocessed.
        """
        # Weirdly, to get a day's data, you request the next day in the API... Yep
        dateString = requestedDate.add(days=1).format('YYYYMMDD')
        url = f'{_host}/intraday.jsp?id=&sid={siteId}&dt={dateString}&gs=0&m=0'
        # self._delay()
        # Reuse the authenticated session when one exists, otherwise fall
        # back to a plain anonymous request.
        http = self.session or requests
        return http.get(url)
Example #5
0
def chunk_date_range(start_date: DateTime) -> Iterable[Mapping[str, any]]:
    """
    Return a list of each weekday between start_date and now.  Weekends are
    skipped since exchanges don't run on weekends.

    The return value is a list of dicts {'date': date_string}.
    """
    days = []
    now = pendulum.now()
    while start_date < now:
        # BUG FIX: the original condition used bitwise '&' between two '!='
        # comparisons, which Python parses as the chained comparison
        # day_of_week != (SATURDAY & day_of_week) != SUNDAY — so weekends
        # were not reliably excluded.  Use a membership test instead.
        if start_date.day_of_week not in (pendulum.SATURDAY, pendulum.SUNDAY):
            days.append({"date": start_date.to_date_string()})
        start_date = start_date.add(days=1)

    return days
Example #6
0
def get_offsets(
    subreddit: str,
    after: pendulum.DateTime,
    before: pendulum.DateTime,
    sample_size: int,
    PUSHSHIFT_LIMIT: int,
) -> list[pendulum.DateTime]:
    """For sampling, return a set of hourly offsets, beginning near
    after, that should not overlap.

    Tries up to SEEDS_TO_TRY seeds for the random offset generator;
    raises RuntimeError if every attempt yields overlapping windows.
    """

    duration = before - after
    info(f"{duration.in_days()=}")
    info(f"{duration.in_hours()=}")
    info(f"{duration.in_weeks()=}")
    results_total = get_pushshift_total(subreddit, after, before)
    results_per_hour = math.ceil(results_total / duration.in_hours())
    info(f"{results_per_hour=} on average")

    info(f"{PUSHSHIFT_LIMIT=}")
    info(f"{sample_size=}")
    # Number of API queries needed to reach the requested sample size.
    queries_total = math.ceil(sample_size / PUSHSHIFT_LIMIT)
    info(f"{queries_total=}")
    info(f"{range(duration.in_hours())=}")

    SEEDS_TO_TRY = 300
    seed = int(after.timestamp())
    for seed_counter in range(SEEDS_TO_TRY):
        # NOTE(review): this advances the seed by 0, 1, 2, ... each attempt
        # (cumulative), not by a fixed step — confirm that was intended.
        seed += seed_counter
        warning(f"attempt {seed_counter} to find non-overlapping offsets")
        offsets = get_cacheable_randos(duration.in_hours(), queries_total, seed)
        if is_overlapping(offsets, PUSHSHIFT_LIMIT, results_per_hour):
            critical(f"  seed attempt {seed_counter} failed")
            continue
        else:
            break
    else:
        # BUG FIX: the two adjacent f-strings previously concatenated with
        # no separator, printing "...SEEDS_TO_TRY=300Quitting...".
        print(
            f"I exhausted random sets of offsets at {SEEDS_TO_TRY=}. "
            f"Quitting because I'm too likely to pull overlapping results"
        )
        raise RuntimeError("exhausted seeds without finding non-overlapping offsets")

    # Convert each hour offset into a concrete datetime relative to `after`.
    offsets_as_datetime = [after.add(hours=offset_as_hour) for offset_as_hour in offsets]
    info(f"{len(offsets)=}")
    return offsets_as_datetime
Example #7
0
from pathlib import Path

siteId = '57775'

# One output directory per site, created on first run.
directory = Path('directory'+siteId)
if not directory.exists():
    directory.mkdir()

# if not logged in then this will only work for the last 14 days
testDate = DateTime(2019, 2, 1)

pvo = PVOutput()
pvo.login('username', 'password')

# Walk forward one day at a time, writing one CSV per day.
for _ in range(1, 140):
    dateString = testDate.to_date_string()
    print('creating file ', dateString)
    try:
        data = pvo.getIntradayData(testDate, siteId)
    except NameError as e:
        # NOTE(review): catching NameError to mean "no data for this day"
        # looks odd — confirm getIntradayData really signals absence this way.
        print(e)
        print("missing data for " + dateString)
    else:
        # pathlib join instead of string concatenation (Path is already imported)
        with (directory / f'{dateString}.csv').open('w', newline='') as csvFile:
            dataFile = csv.writer(csvFile)
            dataFile.writerow(data.headers)
            dataFile.writerows(data.data)

    # Single advancement point; the date was previously incremented in both
    # the error path and the success path.
    testDate = testDate.add(days=1)