def run(self):
    common.shell_cmd('mkdir -p %s', self.local_dir)
    soup = BeautifulSoup(urlopen(CAERS_DOWNLOAD_PAGE_URL).read(), 'lxml')
    for a in soup.find_all(title=re.compile('CAERS ASCII.*')):
        if 'Download CAERS ASCII' in re.sub(r'\s', ' ', a.text):
            fileURL = urljoin('https://www.fda.gov', a['href'])
            common.download(fileURL, join(self.output().path, a.attrs['title'] + '.csv'))
def run(self):
    logging.basicConfig(level=logging.INFO)
    output_dir = TOBACCO_RAW_DIR
    os.system('mkdir -p %s' % output_dir)

    # Download all CSV source files (current year and archived years).
    soup = BeautifulSoup(urlopen(TOBACCO_PROBLEM_DOWNLOAD_PAGE).read(), 'lxml')
    for a in soup.find_all(title=re.compile(r'\d{4}.*tppr', re.IGNORECASE)):
        file_name = a['title'] if '.csv' in a['title'] else (a['title'] + '.csv')
        common.download(urljoin('https://www.fda.gov', a['href']), join(output_dir, file_name))

    # Combine the CSV files into a single newline-delimited JSON file.
    all_csv_files = glob.glob(join(output_dir, '*.csv'))
    logging.info('Reading csv files: %s', all_csv_files)
    os.system('mkdir -p %s' % dirname(self.output().path))
    df = pd.concat(pd.read_csv(f, encoding='cp1252', skiprows=3) for f in all_csv_files)
    # Write one JSON record per line. A preceding df.to_json(..., orient='records')
    # call was dropped: opening the same path in 'w' mode below overwrote its output.
    with open(self.output().path, 'w') as f:
        for row in df.iterrows():
            row[1].to_json(f)
            f.write('\n')
def run(self):
    fileURL = None
    soup = BeautifulSoup(urlopen(SUBSTANCE_DATA_DOWNLOAD_PAGE_URL).read(), 'lxml')
    for a in soup.find_all(href=re.compile(r'.*\.gsrs')):
        if 'Full Public Data Dump' in a.text:
            fileURL = urljoin(GINAS_ROOT_URL, a['href'])
    common.download(fileURL, self.output().path)
def run(self):
    common.shell_cmd('mkdir -p %s', self.local_dir)
    soup = BeautifulSoup(urllib2.urlopen(DAILY_MED_DOWNLOADS_PAGE).read(), 'lxml')
    for a in soup.find_all(href=re.compile(r'.*\.zip')):
        if '_human_' in a.text:
            try:
                common.download(a['href'], join(self.local_dir, a['href'].split('/')[-1]))
            except ProcessException as e:
                logging.error('Could not download a DailyMed SPL archive: {0}: {1}'.format(a['href'], e))
def run(self):
    soup = BeautifulSoup(urllib2.urlopen(CLEARED_DEVICE_URL).read(), 'lxml')
    for a in soup.find_all(href=re.compile(r'.*\.zip')):
        if a.text.startswith('PMN') and a.text != 'PMNLSTMN.ZIP':
            fileURL = a['href']
            common.download(fileURL, join(self.output().path, a['href'].split('/')[-1]))
def run(self):
    zip_urls = []
    soup = BeautifulSoup(urlopen(DEVICE_REG_PAGE).read(), 'lxml')
    for a in soup.find_all(href=re.compile(r'.*\.zip')):
        zip_urls.append(a['href'])
    if not zip_urls:
        logging.info('No Registration Zip Files Found At %s' % DEVICE_REG_PAGE)
    for zip_url in zip_urls:
        filename = zip_url.split('/')[-1]
        common.download(zip_url, join(self.output().path, filename))
def _run(self):
    zip_urls = []
    soup = BeautifulSoup(urllib2.urlopen(DEVICE_DOWNLOAD_PAGE).read(), 'lxml')
    for a in soup.find_all(href=re.compile(r'.*\.zip')):
        zip_urls.append(a['href'])
    if not zip_urls:
        # Log the page that was actually scraped (was DEVICE_CLASS_DOWNLOAD).
        logging.fatal('No MAUDE Zip Files Found At %s' % DEVICE_DOWNLOAD_PAGE)
    for zip_url in zip_urls:
        filename = zip_url.split('/')[-1]
        common.download(zip_url, join(self.output().path, filename))
def run(self):
    zip_urls = []
    soup = BeautifulSoup(urllib2.urlopen(DEVICE_REG_PAGE).read(), "lxml")
    for a in soup.find_all(href=re.compile(r".*\.zip")):
        zip_urls.append(a["href"])
    if not zip_urls:
        logging.info("No Registration Zip Files Found At %s" % DEVICE_REG_PAGE)
    for zip_url in zip_urls:
        filename = zip_url.split("/")[-1]
        common.download(zip_url, join(self.output().path, filename))
def run(self):
    zip_url = None
    soup = BeautifulSoup(urllib2.urlopen(NDC_DOWNLOAD_PAGE).read(), 'lxml')
    for a in soup.find_all(href=re.compile(r'.*\.zip')):
        if 'NDC Database File' in a.text:
            zip_url = urlparse.urljoin('http://www.fda.gov', a['href'])
            break
    if not zip_url:
        logging.fatal('NDC database file not found!')
    common.download(zip_url, self.output().path)
def run(self):
    # TODO(hansnelsen): copied from the FAERS pipeline, consider refactoring
    # into a generalized approach.
    zip_urls = []
    soup = BeautifulSoup(urllib2.urlopen(DEVICE_DOWNLOAD_PAGE).read(), 'lxml')
    for a in soup.find_all(href=re.compile(r'.*\.zip')):
        zip_urls.append(a['href'])
    if not zip_urls:
        # Log the page that was actually scraped (was DEVICE_CLASS_DOWNLOAD).
        logging.fatal('No MAUDE Zip Files Found At %s' % DEVICE_DOWNLOAD_PAGE)
    for zip_url in zip_urls:
        filename = zip_url.split('/')[-1]
        common.download(zip_url, join(self.output().path, filename))
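# The TODO above recurs: the same scrape-the-page-then-download-zips loop
# appears in several of the tasks in this file. Below is a minimal sketch of
# the generalized helper it suggests, assuming the repo's common.download()
# helper and the module-level imports these tasks already use (BeautifulSoup,
# urlopen, urljoin, re, logging, join). The name download_zip_links and the
# base_url default are hypothetical, not part of the existing codebase.
def download_zip_links(page_url, output_dir, base_url='https://www.fda.gov'):
    """Download every zip file linked from page_url into output_dir."""
    soup = BeautifulSoup(urlopen(page_url).read(), 'lxml')
    zip_urls = [urljoin(base_url, a['href'])
                for a in soup.find_all(href=re.compile(r'.*\.zip'))]
    if not zip_urls:
        logging.warning('No zip files found at %s', page_url)
    for zip_url in zip_urls:
        common.download(zip_url, join(output_dir, zip_url.split('/')[-1]))

# With such a helper, the body of run() above would reduce to a single call:
#   download_zip_links(DEVICE_DOWNLOAD_PAGE, self.output().path)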
def _download_with_retry(self, url, target_name):
    if os.path.exists(target_name):
        return
    for i in range(10):
        try:
            logging.info('Downloading: ' + url)
            common.download(url, target_name)
            # Verify the archive's integrity; a truncated download fails here.
            subprocess.check_call('unzip -t %s' % target_name, shell=True)
            return
        except Exception:
            logging.info(
                'Problem while unzipping [download URL: %s, zip file: %s], retrying...',
                url, target_name)
    logging.fatal(
        'Zip File: %s from URL: %s is not valid, stop all processing',
        target_name, url)
def run(self):
    finished_ndc_url = None
    unfinished_ndc_url = None
    soup = BeautifulSoup(urlopen(NDC_DOWNLOAD_PAGE).read(), 'lxml')
    for a in soup.find_all(href=re.compile(r'.*\.zip')):
        if 'NDC Database File' in a.text and 'text' in a['href']:
            finished_ndc_url = urljoin('https://www.fda.gov', a['href'])
        if 'NDC Unfinished' in a.text and 'unfinished.zip' in a['href']:
            unfinished_ndc_url = urljoin('https://www.fda.gov', a['href'])
    if not finished_ndc_url:
        logging.fatal('NDC finished database file not found!')
    if not unfinished_ndc_url:
        logging.fatal('NDC unfinished drugs database file not found!')
    common.download(finished_ndc_url, join(RAW_DIR, 'finished.zip'))
    common.download(unfinished_ndc_url, join(RAW_DIR, 'unfinished.zip'))
def run(self):
    logging.basicConfig(level=logging.INFO)
    zip_filename = config.data_dir('nsde/raw/nsde.zip')
    output_dir = config.data_dir('nsde/raw')
    os.system('mkdir -p %s' % output_dir)
    common.download(NSDE_DOWNLOAD, zip_filename)
    os.system('unzip -o %(zip_filename)s -d %(output_dir)s' % locals())

    csv_file = join(output_dir, self.csv_file_name)
    logging.info('Reading csv file: %s', csv_file)
    os.system('mkdir -p %s' % dirname(self.output().path))
    df = pd.read_csv(csv_file, encoding='utf-8-sig')
    # Write one JSON record per line. A preceding df.to_json(..., orient='records')
    # call was dropped: opening the same path in 'w' mode below overwrote its output.
    with open(self.output().path, 'w') as f:
        for row in df.iterrows():
            row[1].to_json(f)
            f.write('\n')
def run(self):
    common.download(self.url, os.path.join(self.local_dir, 'registration_listing.txt'))
def run(self):
    output_filename = join(self.output().path, DEVICE_PMA_ZIP.split('/')[-1])
    common.download(DEVICE_PMA_ZIP, output_filename)
def run(self):
    for url in SPL_DOWNLOADS:
        filename = join(self.output().path, url.split('/')[-1])
        common.download(url, filename)
def run(self):
    output_dir = self.output().path
    for zip_url in CLEARED_DEV_ZIPS:
        output_filename = join(output_dir, zip_url.split('/')[-1])
        common.download(zip_url, output_filename)
def run(self):
    output_dir = dirname(self.output().path)
    zip_filename = join(output_dir, 'nsde.zip')
    common.download(NSDE_DOWNLOAD, zip_filename)
    os.system('unzip -o %(zip_filename)s -d %(output_dir)s' % locals())
    os.rename(glob.glob(join(output_dir, '*.csv'))[0], self.output().path)
def run(self):
    common.download(PHARM_CLASS_DOWNLOAD, self.output().path)
def run(self):
    common.download(RXNORM_DOWNLOAD, self.output().path)
def run(self):
    common.download(DOWNLOAD_FILE, RAW_DATA_FILE)