def testExtractDates_usingPattern():
    """Dates embedded through strftime codes are recovered from matching URLs."""
    # (pattern, url) pairs; every URL encodes 2014-05-14 20:01:35
    cases = [
        # Standard pattern
        ('data/%Y/%m/%d/fits/swap/swap_00174_fd_%Y%m%d_%H%M%S.fts.gz',
         'data/2014/05/14/fits/swap/swap_00174_fd_20140514_200135.fts.gz'),
        # Not-full repeated pattern
        ('data/%Y/fits/swap/swap_00174_fd_%Y%m%d_%H%M%S.fts.gz',
         'data/2014/fits/swap/swap_00174_fd_20140514_200135.fts.gz'),
    ]
    expected = parse_time((2014, 5, 14, 20, 1, 35))
    for pattern, url in cases:
        assert Scraper(pattern)._extractDateURL(url) == expected
def testExtractDates_usingPattern():
    """Check that _extractDateURL recovers the timestamp encoded in a URL."""
    # Standard pattern: every strftime directive appears exactly where the
    # URL carries the corresponding value.
    s = Scraper('data/%Y/%m/%d/fits/swap/swap_00174_fd_%Y%m%d_%H%M%S.fts.gz')
    testURL = 'data/2014/05/14/fits/swap/swap_00174_fd_20140514_200135.fts.gz'
    timeURL = parse_time((2014, 5, 14, 20, 1, 35))
    assert s._extractDateURL(testURL) == timeURL
    # Not-full repeated pattern: %Y occurs twice but the directory level only
    # repeats the year, not month/day — extraction must still succeed.
    s = Scraper('data/%Y/fits/swap/swap_00174_fd_%Y%m%d_%H%M%S.fts.gz')
    testURL = 'data/2014/fits/swap/swap_00174_fd_20140514_200135.fts.gz'
    timeURL = parse_time((2014, 5, 14, 20, 1, 35))
    assert s._extractDateURL(testURL) == timeURL
def testExtractDates_notSeparators_andSimilar():
    """Literal text that looks like a date field ('Jun') must not confuse extraction."""
    scraper = Scraper('data/%Y/Jun%b%d_%H%M%S')
    # literal 'Jun' immediately followed by the same month abbreviation
    assert scraper._extractDateURL('data/2014/JunJun14_200135') == \
        parse_time((2014, 6, 14, 20, 1, 35))
    # literal 'Jun' followed by a *different* month abbreviation
    assert scraper._extractDateURL('data/2014/JunMay14_200135') == \
        parse_time((2014, 5, 14, 20, 1, 35))
    # and testing with the month afterwards
    scraper = Scraper('data/%Y/%dJun%b_%H%M%S')
    assert scraper._extractDateURL('data/2014/14JunJun_200135') == \
        parse_time((2014, 6, 14, 20, 1, 35))
def testExtractDates_notSeparators_andSimilar():
    """Check extraction when literal URL text mimics a date directive."""
    # Pattern contains the literal 'Jun' directly before %b; the parser must
    # not mistake the literal for the month field.
    s = Scraper('data/%Y/Jun%b%d_%H%M%S')
    testURL = 'data/2014/JunJun14_200135'
    timeURL = parse_time((2014, 6, 14, 20, 1, 35))
    assert s._extractDateURL(testURL) == timeURL
    # Same pattern, but the actual month differs from the literal text.
    testURL = 'data/2014/JunMay14_200135'
    timeURL = parse_time((2014, 5, 14, 20, 1, 35))
    assert s._extractDateURL(testURL) == timeURL
    # and testing with the month afterwards
    s = Scraper('data/%Y/%dJun%b_%H%M%S')
    testURL = 'data/2014/14JunJun_200135'
    timeURL = parse_time((2014, 6, 14, 20, 1, 35))
    assert s._extractDateURL(testURL) == timeURL
def _get_time_for_url(self, urls):
    """Map each URL to the full-day TimeRange implied by its embedded date."""
    eve = Scraper(BASEURL)
    # hard coded full day as that's the normal.
    one_day = datetime.timedelta(days=1)
    starts = (eve._extractDateURL(url) for url in urls)
    return [TimeRange(start, start + one_day) for start in starts]
def _get_time_for_url(self, urls):
    """Return a list of TimeRange objects, one per URL in *urls*.

    Each range starts at the date extracted from the URL and spans one day.
    """
    eve = Scraper(BASEURL)
    times = list()
    for url in urls:
        t0 = eve._extractDateURL(url)
        # hard coded full day as that's the normal.
        times.append(TimeRange(t0, t0 + datetime.timedelta(days=1)))
    return times
def _get_time_for_url(self, urls):
    """Map each URL to a full-day TimeRange; the frequency label comes from the first URL."""
    # extract the frequency label (first three characters of the filename)
    freq = urls[0].split('/')[-1][0:3]
    crawler = Scraper(BASEURL, freq=freq)
    # hard coded full day as that's the normal.
    starts = (crawler._extractDateURL(url) for url in urls)
    return [TimeRange(start, start + TimeDelta(1*u.day)) for start in starts]
def testExtractDates_notSeparators():
    """Adjacent date fields with no separator between them are still parsed."""
    scraper = Scraper('data/%Y/%m/swap%m%d_%H%M%S')
    # %m and %d run together as '0514' in the filename
    assert scraper._extractDateURL('data/2014/05/swap0514_200135') == \
        parse_time((2014, 5, 14, 20, 1, 35))
def testExtractDates_notSeparators():
    """Check extraction when date directives abut with no separator."""
    # %m%d appear back-to-back ('0514') inside the filename.
    s = Scraper('data/%Y/%m/swap%m%d_%H%M%S')
    testURL = 'data/2014/05/swap0514_200135'
    timeURL = parse_time((2014, 5, 14, 20, 1, 35))
    assert s._extractDateURL(testURL) == timeURL
def _get_url_for_timerange(self, timerange):
    """
    Returns a list of urls corresponding to a given time-range.

    Yearly SRS tarballs are preferred; days after the last available
    tarball fall back to individual daily ``*SRS.txt`` files.  Each
    result entry is an ``(extdict, url)`` tuple where ``extdict``
    carries the day's year/month/day.
    """
    result = list()
    # Validate time range srs generated daily since 1996
    cur_year = Time.now().datetime.year
    req_start_year = timerange.start.datetime.year
    req_end_year = timerange.end.datetime.year

    # Return early if both start and end are less than or greater than limits
    if req_start_year <= req_end_year < self.MIN_YEAR \
            or req_end_year >= req_start_year > cur_year:
        return result

    # No point in searching below the min or above max years
    start_year = max(req_start_year, self.MIN_YEAR)
    end_year = min(req_end_year, cur_year)

    # Search for tarballs for all years in the query
    tarball_timerange = TimeRange(f'{start_year}-01-01', f'{end_year}-12-31 23:59:59.999')
    tarball_urls = dict()
    tarball_scraper = Scraper(self.BASE_URL + '%Y/%Y_SRS.tar.gz')
    tarballs = tarball_scraper.filelist(tarball_timerange)
    max_tarball_year = None
    for tb_url in tarballs:
        date = tarball_scraper._extractDateURL(tb_url)
        year = date.to_datetime().year
        # filelist is chronological, so this ends up as the latest tarball year
        max_tarball_year = year
        tarball_urls[year] = tb_url
        log.debug('SRS tarball found for year %d', year)

    # Create a new time range for the times not covered by tarballs, have to assume tarballs
    # cover a year, and look for individual srs file for this time range.
    srs_urls = dict()
    min_file_year = max_tarball_year if max_tarball_year else start_year
    min_file_date = (datetime(max_tarball_year, 12, 31, 23, 59, 59) if max_tarball_year
                     else datetime(start_year, 1, 1, 0, 0, 0))
    max_file_date = min(timerange.end.datetime, Time.now().datetime)
    if min_file_date < max_file_date:
        file_timerange = TimeRange(f'{min_file_year}-01-01', max_file_date)
        srsfile_scraper = Scraper(self.BASE_URL + '%Y/SRS/%Y%m%dSRS.txt')
        srsfiles = srsfile_scraper.filelist(file_timerange)
        for srs_url in srsfiles:
            date = srsfile_scraper._extractDateURL(srs_url)
            srs_urls[(date.datetime.year, date.datetime.month, date.datetime.day)] = srs_url
            # BUG FIX: the original passed the full Time object to a '%d'
            # format ("year %d"), which raises TypeError as soon as debug
            # logging is enabled; log the date with '%s' instead.
            log.debug('SRS file found for %s', date)

    # Now iterate over all days and if the day is in a year we have a tarball for or a day there
    # is a individual srs file add to the result with corresponding extdict
    for day in timerange.get_dates():
        day_ymd = (int(day.strftime('%Y')), int(day.strftime('%m')), int(day.strftime('%d')))
        extdict = {
            'year': day_ymd[0],
            'month': day_ymd[1],
            'day': day_ymd[2]
        }
        if self.MIN_YEAR <= day_ymd[0] <= cur_year:
            if day_ymd[0] in tarball_urls.keys():
                # whole year covered by a tarball
                result.append((extdict, tarball_urls[day_ymd[0]]))
            elif day_ymd in srs_urls.keys():
                # individual daily file
                result.append((extdict, srs_urls[day_ymd]))
    return result