def extract_data(path):
    """Extract every member of the gzipped tarball at *path* into the
    local ``ted/xml/`` directory, skipping members already on disk.

    :param path: filesystem path to a ``.tar.gz`` archive.
    """
    prefix = ensure_path("ted/xml/")
    log.info("Extracting %s", path)
    # Context manager guarantees the archive handle is closed even if
    # extraction of a member raises.
    with tarfile.open(path, "r:gz") as tf:
        for member in tf.getmembers():
            mpath = os.path.join(prefix, member.name)
            # Skip members already extracted by a previous (possibly
            # interrupted) run; only extract what is missing.
            if not os.path.exists(mpath):
                # Was a Python 2 `print "foo", member` debug statement —
                # a SyntaxError under Python 3; route through the logger.
                log.debug("Extracting member: %s", member.name)
                tf.extract(member, path=prefix)
def make_iso_list():
    """Write ``ted/iso_list.txt`` with one authenticated download URL per
    monthly TED ISO image, covering 2010 through the current year.

    NOTE(review): credentials from ``AUTH`` are embedded into each URL, so
    the output file contains plaintext secrets — confirm this is intended.
    """
    with open(ensure_path('ted/iso_list.txt'), 'w') as fh:
        for year in range(2010, datetime.utcnow().year + 1):
            log.info("Listing monthly TED ISOs for %s", year)
            res = requests.get(URL % year, auth=AUTH)
            doc = html.fromstring(res.content)
            # Anchors without an href attribute make a.get('href') return
            # None, and `'monthly.iso' in None` raises TypeError — coerce
            # to '' before the membership test.
            urls = [a.get('href') for a in doc.findall('.//a')
                    if 'monthly.iso' in (a.get('href') or '')]
            for url in urls:
                # Inject basic-auth credentials after the scheme separator
                # ONLY (count=1): replacing every '//' would corrupt any URL
                # whose path happens to contain a double slash.
                url = url.replace('//', '//' + AUTH[0] + ':' + AUTH[1] + '@', 1)
                fh.write(url + '\r\n')
def download_by_id(session, bulk_id):
    """Download the TED bulk export archive identified by *bulk_id*.

    :param session: an authenticated ``requests.Session``-like object.
    :param bulk_id: TED OJS export identifier, interpolated into the URL.
    :returns: the local archive path on success (or when it already
        exists), or ``False`` when the server answers with an HTML page
        (i.e. no archive is available for this id).
    """
    dest_path = ensure_path("ted/archives/%s.tgz" % bulk_id)
    log.info("Loading: %s", dest_path)
    # Idempotent: a previously downloaded archive is not fetched again.
    if os.path.exists(dest_path):
        log.info("Skip: exists")
        return dest_path
    url = "http://ted.europa.eu/TED/misc/bulkDownloadExport.do?dlTedExportojsId=%s"
    url = url % bulk_id
    data = {"action": "dlTedExport"}
    res = session.post(url, data=data, allow_redirects=True)
    # A missing Content-Type header makes headers.get(...) return None and
    # `"html" in None` raise TypeError — default to '' for the check.
    if "html" in res.headers.get("content-type", ""):
        return False
    with open(dest_path, "wb") as fh:
        fh.write(res.content)
    log.info("Downloaded: %s", dest_path)
    return dest_path