def run(self):
  ndc_file = self.input()[0].path
  pharma_class_dir = self.input()[1].path
  unii_file = self.input()[2].path
  output_file = self.output().path
  common.shell_cmd_quiet('mkdir -p %s' % dirname(output_file))
  unii_harmonization.harmonize_unii(output_file, ndc_file, unii_file,
                                    pharma_class_dir)
def run(self):
  output_dir = self.output().path
  common.shell_cmd_quiet('mkdir -p %s', output_dir)
  input_dir = self.local_dir
  # unzip flags: -o overwrite without prompting, -u only extract new or
  # newer files, -q quiet.
  for zip_filename in glob.glob(input_dir + '/*.zip'):
    common.shell_cmd_quiet('unzip -ouq "%s" -d %s', zip_filename, output_dir)
def run(self):
  zip_filename = self.input().path
  output_filename = self.output().path
  common.shell_cmd_quiet('mkdir -p %s' % dirname(output_filename))
  # Extract just the RxNorm mapping file from the archive to stdout.
  cmd = 'unzip -p %(zip_filename)s rxnorm_mappings.txt > %(output_filename)s' % locals()
  common.shell_cmd_quiet(cmd)
def _run(self):
  sync_path = FILES_DIR
  target_bucket = 's3://%s/%s/' % (self.download_bucket, self.date_str)
  s3_cmd = [
    'aws',
    '--profile', config.aws_profile(),
    's3', 'sync',
    sync_path,
    target_bucket,
    '--exclude "*"',
    '--include "*.zip"',
    '--include "*schema.json"'
  ]
  common.shell_cmd_quiet(' '.join(s3_cmd))
def map(self, zip_file, value, output):
  cmd = 'zipinfo -1 %(zip_file)s' % locals()
  xml_file_name = None
  zip_contents = common.shell_cmd_quiet(cmd)
  # SPL archives contain a single XML file named after the document's
  # 36-character UUID. shell_cmd_quiet() returns bytes, so decode first.
  xml_match = re.search(r'^([0-9a-f-]{36})\.xml$', zip_contents.decode(),
                        re.I | re.M)
  if xml_match:
    xml_file_name = xml_match.group()
    spl_dir_name = os.path.join(self.output().path, xml_match.group(1))
    os.system('mkdir -p "%s"' % spl_dir_name)
    common.shell_cmd_quiet('unzip -oq %(zip_file)s -d %(spl_dir_name)s' % locals())
    output.add(xml_file_name, zip_file)
def run(self):
  sync_path = join(BASE_DIR, self.date_str)
  target_bucket = 's3://%s/%s/' % (self.download_bucket, self.date_str)
  s3_cmd = [
    'aws',
    '--profile', config.aws_profile(),
    's3', 'sync',
    sync_path,
    target_bucket,
    '--exclude "*"',
    '--include "*.zip"',
    '--include "*schema.json"'
  ]
  common.shell_cmd_quiet(' '.join(s3_cmd))
  common.shell_cmd_quiet('touch %s', self.output().path)
def run(self):
  logging.info('Extracting: %s', self.input().path)
  extract_dir = dirname(self.input().path)
  gsrs_file_name = os.path.basename(self.input().path)
  gz_filename = os.path.splitext(gsrs_file_name)[0] + '.gz'
  gsrs_file = join(extract_dir, gsrs_file_name)
  gz_file = join(extract_dir, gz_filename)
  # The GSRS dump is gzip data without a .gz extension; rename it so gunzip
  # will accept it, then give the decompressed file a .json suffix.
  os.rename(gsrs_file, gz_file)
  common.shell_cmd_quiet('gunzip ' + gz_file)
  os.rename(os.path.splitext(gz_file)[0],
            os.path.splitext(gz_file)[0] + '.json')
def run(self):
  src_dir = self.input().path
  os.system('mkdir -p "%s"' % self.output().path)
  pattern = join(src_dir, '*.zip')
  zip_files = glob.glob(pattern)
  if len(zip_files) == 0:
    logging.warning('Expected to find one or more DailyMed SPL files')
  extract_dir = self.output().path
  for zip_file in zip_files:
    common.shell_cmd_quiet(
      'unzip -oq -d %(extract_dir)s %(zip_file)s' % locals())
def run(self):
  common.shell_cmd_quiet('mkdir -p %s', self.local_dir)
  soup = BeautifulSoup(
    urllib.request.urlopen(DAILY_MED_DOWNLOADS_PAGE).read(), 'lxml')
  # Only archives of human drug labels are of interest; the dot in .zip is
  # escaped so the pattern matches only actual zip links.
  for a in soup.find_all(href=re.compile(r'.*\.zip')):
    if '_human_' in a.text:
      try:
        common.download(
          a['href'], join(self.local_dir, a['href'].split('/')[-1]))
      except ProcessException as e:
        logging.error(
          'Could not download a DailyMed SPL archive: {0}: {1}'.format(
            a['href'], e))
def map(self, key, value, output):
  es_client = elasticsearch.Elasticsearch(config.es_host(), timeout=120)
  ep = common.ObjectDict(value)
  schema_file = join(SCHEMA_DIR, ep.index_name + '_schema.json')
  endpoint_dir = join(self.output_dir, ep.endpoint[1:])
  target_dir = join(endpoint_dir, ep.partition)
  common.shell_cmd('mkdir -p %s', target_dir)
  index_util.dump_index(es_client,
                        ep.index_name,
                        ep.endpoint,
                        target_dir,
                        cleaner=omit_internal_keys,
                        query=ep.query,
                        chunks=ep.chunks)
  # Copy the current JSON schema to the zip location so that it is included
  # in the sync to S3.
  common.shell_cmd_quiet('cp %s %s', schema_file, endpoint_dir)
def run(self):
  cmd = 'iconv -f %s -t %s -c %s > %s' % (
    'ISO-8859-1//TRANSLIT', 'UTF-8', self.input().path, self.output().path)
  common.shell_cmd_quiet(cmd)

  # CSV exported by FDA iRes is often malformed because it can contain
  # multiple columns with the same name: "More Code Info". Most likely iRes
  # does this when the code information is deemed too large to fit into a
  # single column; in any case the columns should have been named
  # distinctly, e.g. "More Code Info 01", "More Code Info 02", etc.
  # We handle this case with Pandas and merge the duplicates back into a
  # single column.
  df = pd.read_csv(self.output().path, index_col=False, encoding='utf-8',
                   dtype=str)
  code_info_columns = [col for col in df.columns
                       if col.startswith('More Code Info')]
  if len(code_info_columns) > 1:
    df['Code Info All'] = df[code_info_columns].apply(
      lambda row: ' '.join(v for v in row.values if not pd.isna(v)).strip(),
      axis=1)
    df.drop(code_info_columns, axis=1, inplace=True)
    df.rename(columns={'Code Info All': 'More Code Info'}, inplace=True)
    df.to_csv(self.output().path, encoding='utf-8', index=False,
              quoting=csv.QUOTE_ALL)
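# For illustration (hypothetical rows), a malformed iRes export such as
#
#   "Product","More Code Info","More Code Info"
#   "Lot A","Codes 1-10","Codes 11-20"
#
# comes out of the merge step above as a single, space-joined column:
#
#   "Product","More Code Info"
#   "Lot A","Codes 1-10 Codes 11-20"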
def map(self, key, value, output):
  es_client = elasticsearch.Elasticsearch(config.es_host(), timeout=120)
  ep = common.ObjectDict(value)
  schema_file = join(SCHEMA_DIR, ep.index_name + '_schema.json')
  endpoint_dir = join(self.output_dir, ep.endpoint[1:])
  target_dir = join(endpoint_dir, ep.partition)
  common.shell_cmd('mkdir -p %s', target_dir)
  if self.index_changed_since_last_export(es_client, ep.index_name,
                                          target_dir):
    index_util.dump_index(es_client,
                          ep.index_name,
                          ep.endpoint,
                          target_dir,
                          cleaner=omit_internal_keys,
                          query=ep.query,
                          chunks=ep.chunks)
  # Copy the current JSON schema to the zip location so that it is included
  # in the sync to S3. flock is required to avoid a race condition when
  # multiple mappers copy the same schema file concurrently.
  common.shell_cmd_quiet('flock --verbose %s cp %s %s', schema_file,
                         schema_file, endpoint_dir)
def map(self, xml_file, value, output):
  if os.path.getsize(xml_file) > 0:
    # Oddly enough, some SPL XML files arrive from FDA gzipped, which
    # requires us to take an additional uncompressing step.
    filetype = common.shell_cmd_quiet('file %(xml_file)s' % locals()).decode()
    if 'gzip compressed data' in filetype or 'DOS/MBR boot sector' in filetype:
      # logging.warning('SPL XML is gzipped: ' + xml_file)
      gz_file = xml_file + '.gz'
      os.rename(xml_file, gz_file)
      with gzip.open(gz_file, 'rb') as f_in, open(xml_file, 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)

    p = etree.XMLParser(huge_tree=True)
    try:
      tree = etree.parse(open(xml_file), parser=p)
      code = next(
        iter(
          tree.xpath(
            "//ns:document/ns:code[@codeSystem='2.16.840.1.113883.6.1']/@displayName",
            namespaces=self.NS)), '')
      if code.lower().find('human') != -1:
        spl_id = tree.xpath('//ns:document/ns:id/@root',
                            namespaces=self.NS)[0].lower()
        spl_set_id = tree.xpath('//ns:document/ns:setId/@root',
                                namespaces=self.NS)[0].lower()
        version = tree.xpath('//ns:document/ns:versionNumber/@value',
                             namespaces=self.NS)[0]
        output.add(spl_set_id, {'spl_id': spl_id, 'version': version})
      elif len(code) == 0:
        logging.warning('Not a drug label SPL file: ' + xml_file)
    except XMLSyntaxError as e:
      logging.warning('Invalid SPL file: ' + xml_file)
      logging.warning(e)
    except:
      logging.error('Error processing SPL file: ' + xml_file)
      traceback.print_exc()
      raise
  else:
    logging.warning('Zero length SPL file: ' + xml_file)
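# For reference, a minimal sketch of the namespace mapping the xpath queries
# above assume. SPL documents are HL7 v3 XML, whose default namespace is
# urn:hl7-org:v3; the exact dict lives on the task class, so treat this as
# an assumption rather than the pipeline's actual definition.
NS = {'ns': 'urn:hl7-org:v3'}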
def ExtractXMLFromNestedZip(zip_filename, output_dir, exclude_images=True):
  for child_zip_filename in list_zip_files_in_zip(zip_filename):
    base_zip = basename(child_zip_filename)
    target_dir = base_zip.split('.')[0]
    # First pull the child zip out of the parent archive...
    cmd = 'unzip -j -d %(output_dir)s/%(target_dir)s ' \
          '%(zip_filename)s %(child_zip_filename)s' % locals()
    common.shell_cmd_quiet(cmd)
    # ...then unpack the child zip itself and remove the intermediate file.
    cmd = 'unzip %(output_dir)s/%(target_dir)s/%(base_zip)s -d ' \
          '%(output_dir)s/%(target_dir)s' % locals()
    if exclude_images:
      cmd += ' -x *.jpg'
    common.shell_cmd_quiet(cmd)
    common.shell_cmd_quiet('rm %(output_dir)s/%(target_dir)s/%(base_zip)s' % locals())
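# A minimal usage sketch, assuming a parent archive laid out the way
# list_zip_files_in_zip() expects (child .zip entries inside the parent).
# The filename below is hypothetical:
#
#   ExtractXMLFromNestedZip('dm_spl_release_human_rx_part1.zip',
#                           '/tmp/spl_extract', exclude_images=True)
#
# Each child zip ends up unpacked into its own subdirectory of output_dir,
# named after the child zip's basename.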
def map(self, _, value, output):
  value = value.strip()
  xml_file = join(self.spl_path, value, value + '.xml')
  if not os.path.exists(xml_file):
    logging.info('File does not exist, skipping %s', xml_file)
    return
  spl_js = SPL_JS
  loinc = LOINC
  cmd = 'node %(spl_js)s %(xml_file)s %(loinc)s' % locals()
  json_str = ''
  try:
    json_str = common.shell_cmd_quiet(cmd)
    json_obj = json.loads(json_str)
    if not json_obj.get('set_id'):
      logging.error('SPL file has no set_id: %s', xml_file)
    else:
      output.add(xml_file, json_obj)
  except:
    logging.error('Unable to convert SPL XML to JSON: %s', xml_file)
    logging.error('cmd: %s', cmd)
    logging.error('json: %s', json_str)
    logging.error(sys.exc_info()[0])
    raise
def run(self):
  for filename in glob.glob(SPL_S3_DIR + '/*/*.xml'):
    src_dir = dirname(filename)
    barcode_target = join(src_dir, 'barcodes')
    xml_out = join(barcode_target, 'otc-bars.xml')
    json_out = xml_out.replace('.xml', '.json')
    if not os.path.exists(xml_out):
      common.shell_cmd_quiet('mkdir -p %s', barcode_target)
      # logging.info('Zbarimg on directory %s', src_dir)
      # Run zbarimg over every non-empty JPEG in the SPL directory and
      # collect the barcode scan results as XML.
      cmd = ('find %(src_dir)s -name "*.jpg" -size +0 '
             '-exec zbarimg -q --xml {} \\; > %(xml_out)s') % locals()
      common.shell_cmd_quiet(cmd)
    if common.is_older(json_out, xml_out):
      # logging.info('%s does not exist, producing...', json_out)
      process_barcodes.XML2JSON(xml_out)
  common.shell_cmd_quiet('touch %s', self.output().path)
def run(self):
  cmd = 'iconv -f %s -t %s -c %s > %s' % (
    'ISO-8859-1//TRANSLIT', 'UTF-8', self.input().path, self.output().path)
  common.shell_cmd_quiet(cmd)
LOINC = join(RUN_DIR, 'spl/data/sections.csv')

SPL_S3_BUCKET = 's3://openfda-data-spl/data/'
SPL_STAGING_S3_BUCKET = 's3://openfda-data-spl-staging/'
SPL_S3_LOCAL_DIR = config.data_dir('spl/s3_sync')
SPL_INDEX_DIR = config.data_dir('spl/index.db')
SPL_JSON_DIR = config.data_dir('spl/json.db')
SPL_ANNOTATED_DIR = config.data_dir('spl/annotated.db')

DAILY_MED_DIR = config.data_dir('spl/dailymed')
DAILY_MED_DOWNLOADS_DIR = config.data_dir('spl/dailymed/raw')
DAILY_MED_EXTRACT_DIR = config.data_dir('spl/dailymed/extract')
DAILY_MED_FLATTEN_DIR = config.data_dir('spl/dailymed/flatten')
DAILY_MED_DOWNLOADS_PAGE = \
  'https://dailymed.nlm.nih.gov/dailymed/spl-resources-all-drug-labels.cfm'

common.shell_cmd_quiet('mkdir -p %s', SPL_S3_LOCAL_DIR)
common.shell_cmd_quiet('mkdir -p %s', DAILY_MED_DIR)


class DownloadDailyMedSPL(luigi.Task):
  local_dir = DAILY_MED_DOWNLOADS_DIR

  def output(self):
    return luigi.LocalTarget(self.local_dir)

  def run(self):
    common.shell_cmd_quiet('mkdir -p %s', self.local_dir)
    soup = BeautifulSoup(
      urllib.request.urlopen(DAILY_MED_DOWNLOADS_PAGE).read(), 'lxml')
    for a in soup.find_all(href=re.compile(r'.*\.zip')):
      if '_human_' in a.text:
        try:
          common.download(
            a['href'], join(self.local_dir, a['href'].split('/')[-1]))
        except ProcessException as e:
          logging.error(
            'Could not download a DailyMed SPL archive: {0}: {1}'.format(
              a['href'], e))
def map(self, zip_file, value, output):
  output_dir = self.output().path
  common.shell_cmd_quiet('mkdir -p %s', output_dir)
  # 7z flags: x = extract with full paths, -aoa = overwrite all existing
  # files, -bd = no progress indicator, -y = assume yes on all prompts.
  common.shell_cmd_quiet('7z x "%s" -aoa -bd -y -o%s', zip_file, output_dir)
def test_shell_cmd_quiet(self):
  tmpFile = '/tmp/' + ''.join(
    random.choice(string.ascii_uppercase + string.digits) for _ in range(32))
  common.shell_cmd_quiet('touch %(tmpFile)s' % locals())
  assert common.shell_cmd_quiet(
    'ls %(tmpFile)s' % locals()).startswith(tmpFile.encode())
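# A minimal sketch of what shell_cmd_quiet() itself might look like,
# inferred from the call sites above; this is an assumption, not the actual
# openfda.common implementation. It accepts either a pre-formatted command
# string or a format string plus args, and returns the command's stdout as
# bytes (hence the .decode()/.encode() at the call sites).
import subprocess

def shell_cmd_quiet(fmt, *args):
  # Hypothetical helper: format only when extra args are supplied, so both
  # shell_cmd_quiet('mkdir -p %s' % d) and shell_cmd_quiet('mkdir -p %s', d)
  # calling conventions work.
  cmd = fmt % args if args else fmt
  # check_output captures stdout (keeping the console quiet) and raises
  # CalledProcessError on a non-zero exit status.
  return subprocess.check_output(cmd, shell=True)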
import arrow
import luigi
import simplejson as json
from bs4 import BeautifulSoup
from os.path import join

from openfda import common, config, parallel, spl
from openfda.annotation_table import unii_harmonization
from openfda.spl import process_barcodes, extract
from openfda.tasks import DependencyTriggeredTask

data_dir = config.data_dir('harmonization')
BATCH_DATE = arrow.utcnow().ceil('week').format('YYYYMMDD')
BASE_DIR = config.data_dir('harmonization/batches/%s' % BATCH_DATE)
SPL_S3_DIR = config.data_dir('spl/s3_sync')
TMP_DIR = config.tmp_dir()

common.shell_cmd_quiet('mkdir -p %s', data_dir)
common.shell_cmd_quiet('mkdir -p %s', BASE_DIR)
common.shell_cmd_quiet('mkdir -p %s', TMP_DIR)

SPL_SET_ID_INDEX = join(BASE_DIR, 'spl_index.db')
DAILYMED_PREFIX = 'ftp://public.nlm.nih.gov/nlmdata/.dailymed/'
PHARM_CLASS_DOWNLOAD = \
  DAILYMED_PREFIX + 'pharmacologic_class_indexing_spl_files.zip'
RXNORM_DOWNLOAD = \
  DAILYMED_PREFIX + 'rxnorm_mappings.zip'
UNII_DOWNLOAD = \
  'https://fdasis.nlm.nih.gov/srs/download/srs/UNIIs.zip'
def run(self):
  zip_filename = self.input().path
  output_dir = self.output().path
  common.shell_cmd_quiet('mkdir -p %s' % output_dir)
  ExtractXMLFromNestedZip(zip_filename, output_dir)