def __init__(self, feed_url, hqbase, hqjob, datadir='data', timeout=20,
             check_interval=-1):
    """Store the configuration and recover the newest feed timestamp.

    ``timeout`` and ``check_interval`` are coerced to ``int``.
    ``datadir`` must already exist as a directory; this is checked with
    an ``assert``.  A ``Deduper`` is built on ``datadir`` and a
    ``HeadquarterSubmitter`` on ``hqbase``/``hqjob``.

    ``self.last_time`` is set from the greatest ``feed-<14 digits>``
    file name found in ``datadir`` (parsed as ``YYYYmmddHHMMSS``), or
    to ``None`` when no such file exists.
    """
    self.log = logging.getLogger(
        'gdelt.{0.__name__}'.format(FeedScheduler))

    self.feed_url = feed_url
    self.hqbase = hqbase
    self.hqjob = hqjob
    self.datadir = datadir
    self.timeout = int(timeout)
    self.check_interval = int(check_interval)

    # NOTE(review): this check disappears under ``python -O``.
    assert os.path.isdir(self.datadir)

    self.deduper = Deduper(self.datadir)
    self.hqclient = HeadquarterSubmitter(self.hqbase, self.hqjob)

    # re.match anchors at the start and the pattern ends with ``$``, so
    # only names that are "feed-" plus a 14-digit stamp are kept.
    feed_name = r'feed-\d{14}$'
    feed_files = []
    for entry in os.listdir(self.datadir):
        if re.match(feed_name, entry):
            feed_files.append(entry)

    if not feed_files:
        self.last_time = None
    else:
        # The stamps are fixed-width digits, so the lexicographic
        # maximum is also the most recent one.
        newest = max(feed_files)
        self.log.debug('last=%s', newest)
        # time.strptime() returns a time tuple without timezone; pass it
        # through timegm() and gmtime() to make it UTC.
        parsed = time.strptime(newest[-14:], '%Y%m%d%H%M%S')
        self.last_time = time.gmtime(timegm(parsed))
def test_dedup_new_at_bottom(tmpdir):
    """Only the trailing item missing from the LAST file is emitted.

    The LAST file in the data directory lists A through E and the
    incoming source is B through F, so the expected output is just "F".
    """
    last_file = tmpdir.join('LAST')
    last_file.write("A\nB\nC\nD\nE\n")

    incoming = ["B", "C", "D", "E", "F"]
    deduper = Deduper(str(tmpdir))
    emitted = list(deduper.dedup(incoming))

    assert emitted == ["F"]