Пример #1
0
    def __init__(self,
                 feed_url,
                 hqbase,
                 hqjob,
                 datadir='data',
                 timeout=20,
                 check_interval=-1):
        """Store the configuration and recover the time of the last feed.

        :param feed_url: feed location; stored unchanged on the instance.
        :param hqbase: passed together with ``hqjob`` to
            ``HeadquarterSubmitter``.
        :param hqjob: see ``hqbase``.
        :param datadir: existing directory; handed to ``Deduper`` and scanned
            for files named ``feed-`` followed by a 14-digit timestamp.
        :param timeout: coerced to ``int`` and stored (presumably seconds --
            confirm against the code that uses it).
        :param check_interval: coerced to ``int`` and stored; what the
            default ``-1`` means is decided by code outside this method.
        :raises AssertionError: if ``datadir`` is not an existing directory.

        After construction ``self.last_time`` is a UTC ``struct_time`` parsed
        from the newest matching file name, or ``None`` when no such file
        exists.
        """
        self.log = logging.getLogger(
            'gdelt.{0.__name__}'.format(FeedScheduler))
        self.feed_url = feed_url
        self.hqbase = hqbase
        self.hqjob = hqjob
        self.datadir = datadir
        self.timeout = int(timeout)
        self.check_interval = int(check_interval)

        # Raised explicitly instead of via an ``assert`` statement so the
        # check is not stripped when Python runs with -O. The exception type
        # is kept for backward compatibility with existing callers.
        if not os.path.isdir(self.datadir):
            raise AssertionError(
                'datadir is not a directory: {0!r}'.format(self.datadir))

        self.deduper = Deduper(self.datadir)
        self.hqclient = HeadquarterSubmitter(self.hqbase, self.hqjob)

        # ``\Z`` anchors at the true end of the name; ``$`` would also accept
        # a name ending in a newline, which then breaks the strptime() below.
        rfiles = [
            fn for fn in os.listdir(self.datadir)
            if re.match(r'feed-\d{14}\Z', fn)
        ]
        if rfiles:
            # Names are fixed-width, so the lexicographic maximum is also the
            # most recent timestamp.
            latest = max(rfiles)
            self.log.debug('last=%s', latest)
            # time.strptime() returns time tuple without timezone. make it
            # UTC with timegm() and gmtime()
            self.last_time = time.gmtime(
                timegm(time.strptime(latest[-14:], '%Y%m%d%H%M%S')))
        else:
            self.last_time = None
Пример #2
0
def test_dedup_new_at_bottom(tmpdir):
    """Check that only a trailing, previously unseen entry is emitted.

    The directory is seeded with a ``LAST`` file listing A through E, then
    B through F is run through ``Deduper.dedup``; only ``F`` is expected in
    the output.
    """
    last_file = tmpdir.join('LAST')
    last_file.write("A\nB\nC\nD\nE\n")

    incoming = ["B", "C", "D", "E", "F"]
    deduper = Deduper(str(tmpdir))

    # dedup() may return a lazy iterable, so materialise it before comparing.
    emitted = list(deduper.dedup(incoming))

    assert emitted == ["F"]