key = iter(self.bucket.list(prefix=prefix)).next() urls = [key.generate_url(seconds_good_for,force_http=True)] return self.input_stream, urls def segment_between(self, start,end): prefix = self._key_for_datetime(start) urls = [] limit = self.rule._params.get('maxinput', float('inf')) for key in self.bucket.list(prefix=prefix): dt = self._datetime_for_key(key) if dt > end: break else: limit -= 1 if limit < 0: break if key.size > 0: urls.append(key.generate_url(seconds_good_for,force_http=True)) return urls datasources.set_source_for_url(CommonCrawlSource, 's3://aws-publicdatasets/common-crawl/crawl-002/') datasources.set_source_for_url(CommonCrawlSource, 'http://aws-publicdatasets.s3.amazonaws.com/common-crawl/crawl-002/')
if dt is None:
                continue
            if dt > end:
                break
            else:
                limit -= 1
                if limit < 0:
                    break
            if key.size > 0:
                urls.append(self.generate_url(key, force_http=True))
        return urls


# Associate CommonCrawlSource with both spellings of the crawl-002 location:
# the s3:// form and the equivalent http:// form.
datasources.set_source_for_url(
    CommonCrawlSource, 's3://aws-publicdatasets/common-crawl/crawl-002/')
datasources.set_source_for_url(
    CommonCrawlSource,
    'http://aws-publicdatasets.s3.amazonaws.com/common-crawl/crawl-002/')

import calendar


class CommonCrawl2012Source(CommonCrawlSource):

    @property
    def segments(self):
        # Q: How long do source objects live?
        if not hasattr(self, '_segments'):
            key = self.bucket.lookup(
                'common-crawl/parse-output/valid_segments.txt')
            self._segments = sorted([
if dt is None: continue if dt > end: break else: limit -= 1 if limit < 0: break if key.size > 0: urls.append(self.generate_url(key,force_http=True)) return urls datasources.set_source_for_url(CommonCrawlSource, 's3://aws-publicdatasets/common-crawl/crawl-002/') datasources.set_source_for_url(CommonCrawlSource, 'http://aws-publicdatasets.s3.amazonaws.com/common-crawl/crawl-002/') import calendar class CommonCrawl2012Source(CommonCrawlSource): @property def segments(self): #Q: How long to source objects live? if not hasattr(self,'_segments'): key = self.bucket.lookup('common-crawl/parse-output/valid_segments.txt') self._segments = sorted([datetime.datetime.utcfromtimestamp(int(k)/1000.) for k in key.read().split('\n') if k]) return self._segments def earliest_record_time(self): # Grab and parse the first key