Exemplo n.º 1
0
    def test_emails(self):
        """Check e-mail collection and the domain list derived from it.

        A well-formed address is expected to produce one entry in ``emails``
        and one in ``domains``; a string that is not an address is expected
        to leave ``emails`` empty.
        """
        meta = Metadata()
        # NOTE(review): this address literal looks redacted, yet the domain
        # assertion below expects 'pudo.org' -- confirm the fixture value.
        meta.add_email('*****@*****.**')
        # Each assert message prints the collection being checked, so a
        # failure shows the data that actually caused it.
        assert len(meta.emails) == 1, meta.emails
        assert len(meta.domains) == 1, meta.domains
        assert meta.domains[0] == 'pudo.org', meta.domains

        meta = Metadata()
        meta.add_email('not-an-email')
        assert len(meta.emails) == 0, meta.emails
Exemplo n.º 2
0
    def test_dates(self):
        """Check which values ``add_date`` is expected to keep.

        The word ``'today'`` is expected to be dropped, a repeated ISO date
        is expected to be stored once, and a second distinct date is
        expected to raise the count to two.
        """
        rejected = Metadata()
        rejected.add_date('today')
        assert len(rejected.dates) == 0, rejected.dates

        # (value passed to add_date, expected number of stored dates after it)
        steps = (
            ('2001-01-20', 1),
            ('2001-01-20', 1),
            ('2002-01-20', 2),
        )
        collected = Metadata()
        for value, expected_count in steps:
            collected.add_date(value)
            assert len(collected.dates) == expected_count, collected.dates
Exemplo n.º 3
0
 def test_normalize(self):
     """Run the phone number analyzer over every entry of PHONE_NUMBERS.

     Each entry gets a fresh ``Metadata`` (``countries`` set to ``['de']``)
     and a fresh analyzer. The test makes no assertions, so it only fails
     if one of the analyzer calls raises.
     """
     for sample in PHONE_NUMBERS:
         country_hint = {'countries': ['de']}
         extractor = PhoneNumberAnalyzer(None, Metadata(data=country_hint))
         extractor.prepare()
         extractor.on_text(sample)
         extractor.finalize()
Exemplo n.º 4
0
    def test_urls(self):
        """Check URL collection and the domain list derived from the URLs."""
        meta = Metadata()
        meta.urls = ['http://google.com']
        # Each assert message prints the collection being checked, so a
        # failure shows the data that actually caused it.
        assert len(meta.urls) == 1, meta.urls
        assert len(meta.domains) == 1, meta.domains
        assert meta.domains[0] == 'google.com', meta.domains

        # A scheme without a host is expected to be ignored.
        meta.add_url('http://')
        assert len(meta.urls) == 1, meta.urls

        # A second URL is expected to be stored while the number of
        # domains stays at one.
        meta.add_url('http://www.google.com/xxx')
        assert len(meta.urls) == 2, meta.urls
        assert len(meta.domains) == 1, meta.domains
Exemplo n.º 5
0
def ingest_url(source_id, metadata, url):
    """Download ``url`` into a temporary file, archive it and dispatch it.

    The response's final URL is stored as ``source_url`` unless the metadata
    already has one, and the response headers are stored as ``headers``.
    The downloaded file is handed to ``get_archive().archive_file`` with
    ``move=True`` and the resulting metadata is passed to
    ``Ingestor.dispatch``.

    :param source_id: identifier forwarded to ``Ingestor.dispatch`` and
        ``Ingestor.handle_exception``.
    :param metadata: initial data used to build the ``Metadata`` object.
    :param url: address to fetch.

    Any exception raised along the way (including the one raised for an
    HTTP status of 400 or above) is passed to ``Ingestor.handle_exception``
    instead of propagating from this block.
    """
    meta = Metadata(data=metadata)
    try:
        fh, tmp_path = mkstemp()
        os.close(fh)
        log.info("Ingesting URL: %r", url)
        res = requests.get(url, stream=True)
        try:
            if res.status_code >= 400:
                raise Exception("HTTP Error %r: %r" % (url, res.status_code))
            # iter_content() yields raw bytes, so the file must be opened in
            # binary mode; text mode fails on Python 3 and may translate
            # newlines on some platforms.
            with open(tmp_path, 'wb') as fh:
                for chunk in res.iter_content(chunk_size=1024):
                    if chunk:
                        fh.write(chunk)
            if not meta.has('source_url'):
                meta.source_url = res.url
            meta.headers = res.headers
        finally:
            # A streamed response keeps its connection open until the body
            # is fully read or the response is closed; close it explicitly
            # so the error path does not leak the connection.
            res.close()
        meta = get_archive().archive_file(tmp_path, meta, move=True)
        Ingestor.dispatch(source_id, meta)
    except Exception as ex:
        Ingestor.handle_exception(meta, source_id, ex)
Exemplo n.º 6
0
def ingest(source_id, metadata):
    """Wrap ``metadata`` in a ``Metadata`` object and dispatch it.

    :param source_id: identifier passed through to ``Ingestor.dispatch``.
    :param metadata: data used to build the ``Metadata`` object.
    """
    Ingestor.dispatch(source_id, Metadata(data=metadata))
Exemplo n.º 7
0
 def make_meta(self, data=None):
     """Build a ``Metadata`` object tagged with this crawler's identity.

     :param data: optional JSON-serialisable mapping of initial values. It
         is copied through a JSON round trip, so the caller's object is not
         modified. ``None`` (the default) means "no initial values".
     :returns: ``Metadata`` built from the copy plus the ``crawler`` and
         ``crawler_run`` keys.
     """
     # A ``None`` sentinel avoids a mutable default shared between calls.
     data = json.loads(json.dumps({} if data is None else data))
     data['crawler'] = self.get_id()
     data['crawler_run'] = self.crawler_run
     return Metadata(data=data)
Exemplo n.º 8
0
def ingest(collection_id, metadata):
    """Wrap ``metadata`` in a ``Metadata`` object and dispatch it.

    :param collection_id: identifier passed through to
        ``Ingestor.dispatch``.
    :param metadata: data used to build the ``Metadata`` object.
    """
    Ingestor.dispatch(collection_id, Metadata(data=metadata))
Exemplo n.º 9
0
 def meta(self):
     """Return a ``Metadata`` view of the stored ``_meta`` mapping.

     ``_meta`` is replaced by an empty dict when it is falsy, then its
     ``content_hash`` and ``foreign_id`` entries are refreshed from the
     attributes of the same name before the ``Metadata`` object is built.
     """
     self._meta = self._meta or {}
     # Copy each attribute into the entry of the same name, in this order.
     for key in ('content_hash', 'foreign_id'):
         self._meta[key] = getattr(self, key)
     return Metadata(data=self._meta or {})