def test_dedup_new_at_bottom(tmpdir):
    tmpdir.join("LAST").write("A\n"
                              "B\n"
                              "C\n"
                              "D\n"
                              "E\n")
    source = ["B", "C", "D", "E", "F"]
    dedup = Deduper(str(tmpdir))
    out = list(dedup.dedup(source))
    assert out == ["F"]
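# The test above pins down the Deduper interface used by FeedScheduler: the
# constructor takes a data directory containing a LAST file (one previously
# seen item per line), dedup() yields only items not seen in the previous
# run, and step() (called from FeedScheduler.process1) commits the current
# run as the new baseline. The class below is a naive, set-based sketch that
# satisfies this test; the repo's actual Deduper may store and compare its
# state differently.
import os


class Deduper(object):
    def __init__(self, datadir):
        self.datadir = datadir
        self.lastfile = os.path.join(self.datadir, 'LAST')
        # items seen in the previous cycle, loaded from LAST if present
        if os.path.isfile(self.lastfile):
            with open(self.lastfile) as f:
                self.seen = set(line.rstrip('\n') for line in f)
        else:
            self.seen = set()
        self.current = []

    def dedup(self, source):
        """Yield only items that were not present in the previous cycle."""
        for item in source:
            self.current.append(item)
            if item not in self.seen:
                yield item

    def step(self):
        """Commit the items seen in this cycle as the new LAST baseline."""
        with open(self.lastfile, 'w') as w:
            for item in self.current:
                w.write(item)
                w.write('\n')
        self.current = []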
import logging
import os
import re
import socket
import time
import urllib2
from calendar import timegm

# Deduper, HeadquarterSubmitter, FeedReader, crawluri, batchup, httpdate
# and the test_mode flag are defined elsewhere in this codebase.


class FeedScheduler(object):
    def __init__(self, feed_url, hqbase, hqjob, datadir='data',
                 timeout=20, check_interval=-1):
        self.log = logging.getLogger(
            'gdelt.{0.__name__}'.format(FeedScheduler))
        self.feed_url = feed_url
        self.hqbase = hqbase
        self.hqjob = hqjob
        self.datadir = datadir
        self.timeout = int(timeout)
        self.check_interval = int(check_interval)

        assert os.path.isdir(self.datadir)

        self.deduper = Deduper(self.datadir)
        self.hqclient = HeadquarterSubmitter(self.hqbase, self.hqjob)

        rfiles = [fn for fn in os.listdir(self.datadir)
                  if re.match(r'feed-\d{14}$', fn)]
        if rfiles:
            self.log.debug('last=%s', max(rfiles))
            # time.strptime() returns time tuple without timezone. make it
            # UTC with timegm() and gmtime()
            self.last_time = time.gmtime(timegm(
                time.strptime(max(rfiles)[-14:], '%Y%m%d%H%M%S')))
        else:
            self.last_time = None

    def process(self):
        while True:
            t = time.time()
            try:
                self.process1()
            except KeyboardInterrupt as ex:
                raise
            except Exception as ex:
                self.log.error('process1 failed', exc_info=1)
            if self.check_interval < 0:
                self.log.debug('exiting because check_interval < 0')
                break
            if test_mode:
                self.log.debug('exiting because test_mode=True')
                break
            dt = t + self.check_interval - time.time()
            if dt >= 1.0:
                self.log.debug('sleeping %ds until next cycle', int(dt))
                time.sleep(dt)

    def process1(self):
        # file name is in UTC.
        rid = time.strftime('%Y%m%d%H%M%S', time.gmtime())
        rfile = os.path.join(self.datadir, 'feed-{}'.format(rid))
        try:
            req = urllib2.Request(self.feed_url)
            if self.last_time:
                self.log.debug('last_time=%s', httpdate(self.last_time))
                req.add_header('If-Modified-Since', httpdate(self.last_time))
            f = urllib2.urlopen(req, timeout=self.timeout)
            try:
                with open(rfile, 'wb') as w:
                    while True:
                        d = f.read(16 * 1024)
                        if not d:
                            break
                        w.write(d)
                    self.log.info('downloaded %d bytes in %s',
                                  w.tell(), rfile)
            except KeyboardInterrupt as ex:
                if os.path.exists(rfile):
                    os.remove(rfile)
                raise
        except urllib2.HTTPError as ex:
            if ex.code == 304:
                # Not Modified
                self.log.debug('feed %s not modified since %s',
                               self.feed_url, httpdate(self.last_time))
                return
            self.log.warn('%s %s %s', self.feed_url, ex.code, ex.reason)
            return
        except (urllib2.URLError, socket.error) as ex:
            self.log.warn('%s %s', self.feed_url, ex)
            return

        self.last_time = time.gmtime()

        urlcount = 0
        slfile = os.path.join(self.datadir, 'sche-{}'.format(rid))
        with open(slfile, 'wb') as sl:
            with open(rfile, 'rb') as f:
                reader = FeedReader(f)
                for urls in batchup(crawluri(self.deduper.dedup(reader)),
                                    500):
                    self.log.debug('submitting %s URLs...', len(urls))
                    if not test_mode:
                        self.hqclient.put(urls)
                    for curl in urls:
                        sl.write(curl['u'])
                        sl.write('\n')
                    urlcount += len(urls)
        self.log.info('submitted total %s URLs (see %s)', urlcount,
                      os.path.basename(slfile))
        self.deduper.step()
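# process1() leans on a few small helpers whose behavior is fixed by their
# call sites: httpdate() turns a UTC time tuple into an HTTP-date string for
# the If-Modified-Since header, crawluri() wraps plain URL strings into the
# dict form HeadquarterSubmitter.put() expects (at least a 'u' key), and
# batchup() groups an iterable into lists of at most n items. The functions
# below are hedged sketches of those helpers, not the repo's own definitions.
from calendar import timegm
from email.utils import formatdate


def httpdate(timetuple):
    """Format a UTC time tuple as an RFC 1123 HTTP-date."""
    return formatdate(timegm(timetuple), usegmt=True)


def crawluri(urls):
    """Wrap plain URL strings into Headquarter crawl-URI dicts."""
    for url in urls:
        yield {'u': url}


def batchup(iterable, n):
    """Yield lists of at most n items drawn from iterable."""
    batch = []
    for item in iterable:
        batch.append(item)
        if len(batch) >= n:
            yield batch
            batch = []
    if batch:
        yield batch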
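# A minimal driver showing how FeedScheduler is meant to be run: point it at
# the feed URL and a Headquarter endpoint/job, then let process() poll on
# check_interval. The URL, endpoint and job name below are placeholders for
# illustration, not values taken from this repo's configuration.
import logging

if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    scheduler = FeedScheduler(
        'http://example.com/gdelt-feed.txt',  # placeholder feed URL
        'http://localhost:8080/hq',           # placeholder Headquarter base URL
        'gdelt',                              # placeholder HQ job name
        datadir='data',
        check_interval=300)                   # poll the feed every 5 minutes
    scheduler.process()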