Example No. 1
 def run(self):
     output_dir = self.output().path
     common.shell_cmd('mkdir -p %s', output_dir)
     for i in range(len(self.input())):
         input_dir = self.input()[i].path
         download_util.extract_and_clean(input_dir, 'ISO-8859-1//TRANSLIT',
                                         'UTF-8', 'txt')
Example No. 2
    def run(self):
        input_dir = self.input().path
        output_dir = self.output().path
        common.shell_cmd("mkdir -p %s", dirname(output_dir))

        NEEDS_HEADERS = {"estabtypes.txt": ["establishment_type_id", "description"]}

        inputs = []
        for input_file in glob.glob(input_dir + "/*.txt"):
            if basename(input_file) in REMAPPED_FILES:
                continue
            header_key = basename(input_file)
            fieldnames = NEEDS_HEADERS.get(header_key, None)
            inputs.append(
                parallel.Collection.from_glob(
                    input_file,
                    parallel.CSVDictLineInput(
                        delimiter="|", fieldnames=fieldnames, quoting=csv.QUOTE_NONE, escapechar="\\"
                    ),
                )
            )

        parallel.mapreduce(
            inputs=inputs, mapper=TXT2JSONMapper(), reducer=parallel.IdentityReducer(), output_prefix=self.output().path
        )
Example No. 3
    def run(self):
        input_dir = self.input().path
        output_dir = self.output().path

        common.shell_cmd('mkdir -p %s', output_dir)
        # TODO(hansnelsen): change to the openfda.parallel version of multiprocess
        pool = multiprocessing.Pool(processes=3)
        for i in range(PARTITIONS):
            partition_dict = {}
            output_filename = join(output_dir, str(i) + '.maude.json')
            # Get all of the files for the current partition
            for filename in glob.glob(input_dir + '/' + str(i) + '.*.txt'):
                for file_type in CATEGORIES:
                    if file_type in filename:
                        logging.info('Using file %s for joining', filename)
                        partition_dict[file_type] = filename

            logging.info('Starting Partition %d', i)
            master_file = partition_dict['mdrfoi']
            patient_file = partition_dict['patient']
            device_file = partition_dict['foidev']
            text_file = partition_dict['foitext']
            pool.apply_async(join_maude.join_maude,
                             (master_file, patient_file, device_file,
                              text_file, output_filename))

        pool.close()
        pool.join()
Example No. 4
    def _run(self):
        output_dir = self.output().path
        common.shell_cmd('mkdir -p %s', output_dir)
        change_log = csv.reader(open(self.change_log_file, 'r'))
        batches = collections.defaultdict(list)

        for row in change_log:
            spl_id, spl_type, spl_date = row

            # Only grab the human and cellular therapy labels for this index
            valid_types = ['cellular therapy', 'human']
            is_valid = False
            for valid_type in valid_types:
                if spl_type.lower().find(valid_type) != -1:
                    is_valid = True
                    break

            # only process valid document types
            if is_valid:
                # All blank dates to be treated as the week of June 1, 2009
                if not spl_date:
                    spl_date = '20090601120000'
                date = arrow.get(spl_date, 'YYYYMMDDHHmmss')
                batches[date.ceil('week')].append(spl_id)

        for batch_date, ids in batches.items():
            batch_file = '%s.ids' % batch_date.format('YYYYMMDD')
            batch_out = open(join(output_dir, batch_file), 'w')
            unique_ids = list(set(ids))
            batch_out.write('\n'.join(unique_ids))
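
The batching above keys each SPL ID by the end of its week. A minimal illustration of that grouping, using the same arrow calls on two made-up timestamps that fall in the same week:

import arrow
import collections

batches = collections.defaultdict(list)
# Hypothetical (spl_id, spl_date) pairs for illustration only.
for spl_id, spl_date in [('id-1', '20090601120000'), ('id-2', '20090603080000')]:
    date = arrow.get(spl_date, 'YYYYMMDDHHmmss')
    batches[date.ceil('week')].append(spl_id)

# Both IDs share the same batch key, so they land in the same .ids file.
for batch_date, ids in batches.items():
    print('%s.ids' % batch_date.format('YYYYMMDD'), ids)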
Example No. 5
 def run(self):
     output_dir = self.output().path
     common.shell_cmd('mkdir -p %s', output_dir)
     input_dir = self.local_dir
     for zip_filename in glob.glob(input_dir + '/*.zip'):
         common.shell_cmd_quiet('unzip -ou %s -d %s', zip_filename,
                                output_dir)
Example No. 6
  def run(self):
    crawl_dir = dirname(dirname(self.output().path))
    common.shell_cmd('mkdir -p %s', dirname(self.output().path))

    manifests = walk_glob('manifest.json', crawl_dir)

    records = []
    for file_name in manifests:
      records.append(json.load(open(file_name)))

    # Default data structure that creates the appropriate structure on the
    # first put so that we can blindly use `+=` when appropriate.
    combined = collections.defaultdict(
      lambda: collections.defaultdict(
        lambda: {
          'export_date': None,
          'partitions': [],
          'total_records': 0
        }
      )
    )

    # Walk over all of the manifests and create a single dictionary
    for row in records:
      for domain, value in row.items():
        for sub, val in value.items():
          combined[domain][sub]['export_date'] = val.get('export_date', '')
          combined[domain][sub]['partitions'] += val.get('partitions', [])
          combined[domain][sub]['total_records'] += val.get('total_records', 0)

    with open(join(self.output().path), 'w') as json_out:
      json.dump(combined, json_out, indent=2)
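
The nested defaultdict above lets the merge loop append partitions and add record counts without first checking whether a domain or sub-key exists, and json.dump accepts the result directly because defaultdict is a dict subclass. A small sketch of the same accumulation over two made-up manifest records:

import collections
import json

combined = collections.defaultdict(
    lambda: collections.defaultdict(
        lambda: {'export_date': None, 'partitions': [], 'total_records': 0}))

# Hypothetical manifest rows; the domain and sub-key names are invented.
records = [
    {'drug': {'label': {'export_date': '20240101', 'partitions': ['0.zip'], 'total_records': 10}}},
    {'drug': {'label': {'export_date': '20240101', 'partitions': ['1.zip'], 'total_records': 5}}},
]
for row in records:
    for domain, value in row.items():
        for sub, val in value.items():
            combined[domain][sub]['export_date'] = val.get('export_date', '')
            combined[domain][sub]['partitions'] += val.get('partitions', [])
            combined[domain][sub]['total_records'] += val.get('total_records', 0)

# Prints a single merged entry holding both partitions and total_records == 15.
print(json.dumps(combined, indent=2))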
Example No. 7
    def run(self):
        input_dir = self.input().path
        output_dir = self.output().path
        common.shell_cmd('mkdir -p %s', dirname(output_dir))

        NEEDS_HEADERS = {
            'estabtypes.txt': ['establishment_type_id', 'description']
        }

        inputs = []
        for input_file in glob.glob(input_dir + '/*.txt'):
            if basename(input_file) in REMAPPED_FILES:
                continue
            header_key = basename(input_file)
            fieldnames = NEEDS_HEADERS.get(header_key, None)
            inputs.append(
                parallel.Collection.from_glob(
                    input_file,
                    parallel.CSVDictLineInput(delimiter='|',
                                              fieldnames=fieldnames,
                                              quoting=csv.QUOTE_NONE,
                                              escapechar='\\')))

        parallel.mapreduce(inputs=inputs,
                           mapper=TXT2JSONMapper(),
                           reducer=parallel.IdentityReducer(),
                           output_prefix=self.output().path)
Example No. 8
  def run(self):
    input_dir = self.input().path
    output_dir = self.output().path

    common.shell_cmd('mkdir -p %s', output_dir)
    # TODO(hansnelsen): change to the openfda.parallel version of multiprocess
    pool = multiprocessing.Pool(processes=6)
    for i in range(PARTITIONS):
      partition_dict = {}
      output_filename = join(output_dir, str(i) + '.maude.json')
      # Get all of the files for the current partition
      for filename in glob.glob(input_dir + '/' + str(i) + '.*.txt'):
        for file_type in CATEGORIES:
          if file_type in filename:
            logging.info('Using file %s for joining', filename)
            partition_dict[file_type] = filename

      logging.info('Starting Partition %d', i)
      master_file = partition_dict['mdrfoi']
      patient_file = partition_dict['patient']
      device_file = partition_dict['foidev']
      text_file = partition_dict['foitext']
      pool.apply_async(join_maude.join_maude, (master_file,
                                               patient_file,
                                               device_file,
                                               text_file,
                                               output_filename))

    pool.close()
    pool.join()
Example No. 9
 def run(self):
   common.shell_cmd('mkdir -p %s', self.local_dir)
   soup = BeautifulSoup(urlopen(CAERS_DOWNLOAD_PAGE_URL).read(), 'lxml')
   for a in soup.find_all(title=re.compile('CAERS ASCII.*')):
     if 'Download CAERS ASCII' in re.sub(r'\s', ' ', a.text):
       fileURL = urljoin('https://www.fda.gov', a['href'])
       common.download(fileURL, join(self.output().path, a.attrs['title']+'.csv'))
Example No. 10
  def run(self):
    # Since we only iterate over dates in the umbrella process, we need to
    # skip batch files that do not exist
    output_file = self.output().path
    if not os.path.exists(self.batch):
      common.shell_cmd('touch %s', output_file)
      return

    input_file = self.input()[1].path
    es = elasticsearch.Elasticsearch(self.es_host)
    index_util.start_index_transaction(es, 'druglabel', self.epoch)
    parallel.mapreduce(
      input_collection=parallel.Collection.from_sharded(input_file),
      mapper=index_util.LoadJSONMapper(self.es_host,
                                       'druglabel',
                                       'spl',
                                       self.epoch,
                                       docid_key='set_id',
                                       version_key='version'),
      reducer=parallel.NullReducer(),
      output_prefix='/tmp/loadjson.druglabel',
      num_shards=1,
      map_workers=1)
    index_util.commit_index_transaction(es, 'druglabel')
    common.shell_cmd('touch %s', output_file)
Example No. 11
    def run(self):
        crawl_dir = dirname(dirname(self.output().path))
        common.shell_cmd('mkdir -p %s', dirname(self.output().path))

        manifests = walk_glob('manifest.json', crawl_dir)

        records = []
        for file_name in manifests:
            records.append(json.load(open(file_name)))

        # Default data structure that creates the appropriate structure on the
        # first put so that we can blindly use `+=` when appropriate.
        combined = collections.defaultdict(
            lambda: collections.defaultdict(lambda: {
                'export_date': None,
                'partitions': [],
                'total_records': 0
            }))

        # Walk over all of the manifests and create a single dictionary
        for row in records:
            for domain, value in row.items():
                for sub, val in value.items():
                    combined[domain][sub]['export_date'] = val.get(
                        'export_date', '')
                    combined[domain][sub]['partitions'] += val.get(
                        'partitions', [])
                    combined[domain][sub]['total_records'] += val.get(
                        'total_records', 0)

        with open(join(self.output().path), 'w') as json_out:
            json.dump(combined, json_out, indent=2)
Example No. 12
 def run(self):
     output_dir = dirname(self.output().path)
     common.shell_cmd('mkdir -p %s', output_dir)
     dt = arrow.get(self.batch)
     url = DOWNLOAD_URL.replace('--Y--', str(dt.year))
     url = url.replace('--M--', str(dt.month))
     url = url.replace('--D--', str(dt.day))
     download_to_file_with_retry(url, self.output().path)
Example No. 13
 def run(self):
   common.shell_cmd('mkdir -p %s', dirname(self.output().path))
   input_files = glob.glob(self.input().path + '/*.txt')
   parallel.mapreduce(
     parallel.Collection.from_glob(
       input_files, parallel.CSVDictLineInput(delimiter='|', strip_str='\0')),
     mapper=PMAMapper(),
     reducer=parallel.IdentityReducer(),
     output_prefix=self.output().path)
Example No. 14
 def run(self):
   common.shell_cmd('mkdir -p %s', self.local_dir)
   soup = BeautifulSoup(urllib2.urlopen(DAILY_MED_DOWNLOADS_PAGE).read(), 'lxml')
   for a in soup.find_all(href=re.compile('.*.zip')):
     if '_human_' in a.text:
       try:
         common.download(a['href'], join(self.local_dir, a['href'].split('/')[-1]))
       except ProcessException as e:
         logging.error("Could not download a DailyMed SPL archive: {0}: {1}".format(a['href'], e))
Example No. 15
 def run(self):
   output_dir = self.output().path
   common.shell_cmd('mkdir -p %s', output_dir)
   for i in range(len(self.input())):
     input_dir = self.input()[i].path
     download_util.extract_and_clean(input_dir,
                                     'ISO-8859-1//TRANSLIT',
                                     'UTF-8',
                                     'txt')
Example No. 16
 def run(self):
     common.shell_cmd('mkdir -p %s', dirname(self.output().path))
     input_files = glob.glob(self.input().path + '/*.txt')
     parallel.mapreduce(parallel.Collection.from_glob(
         input_files,
         parallel.CSVDictLineInput(delimiter='|', strip_str='\0')),
                        mapper=PMAMapper(),
                        reducer=parallel.IdentityReducer(),
                        output_prefix=self.output().path)
Example No. 17
 def run(self):
   common.shell_cmd('mkdir -p %s', join(BASE_DIR, 'tmp'))
   files = glob.glob(self.input().path + '/*/*.json')
   parallel.mapreduce(
     parallel.Collection.from_glob(files, parallel.JSONLineInput()),
     mapper=ParallelExportMapper(output_dir=self.output().path),
     reducer=parallel.NullReducer(),
     output_prefix=join(BASE_DIR, 'tmp'),
     output_format=parallel.NullOutput(),
     map_workers=10)
Example No. 18
 def run(self):
     common.shell_cmd('mkdir -p %s', join(BASE_DIR, 'tmp'))
     files = glob.glob(self.input().path + '/*/*.json')
     parallel.mapreduce(
         parallel.Collection.from_glob(files, parallel.JSONLineInput()),
         mapper=ParallelExportMapper(output_dir=self.output().path),
         reducer=parallel.NullReducer(),
         output_prefix=join(BASE_DIR, 'tmp'),
         output_format=parallel.NullOutput(),
         map_workers=10)
Example No. 19
 def run(self):
     common.shell_cmd('mkdir -p %s', dirname(self.output().path))
     input_files = glob.glob(self.input().path + '/*.txt')
     parallel.mapreduce(parallel.Collection.from_glob(
         input_files,
         parallel.CSVDictLineInput(delimiter='|',
                                   quoting=csv.QUOTE_NONE,
                                   escapechar='\\')),
                        mapper=ClassificationMapper(),
                        reducer=parallel.IdentityReducer(),
                        output_prefix=self.output().path)
Example No. 20
    def run(self):
        sync_path = join(BASE_DIR, self.date_str)
        target_bucket = 's3://%s/%s/' % (self.download_bucket, self.date_str)
        s3_cmd = [
            'aws', '--profile',
            config.aws_profile(), 's3', 'sync', sync_path, target_bucket,
            '--exclude "*"', '--include "*.zip"', '--include "*schema.json"'
        ]

        common.shell_cmd(' '.join(s3_cmd))
        common.shell_cmd('touch %s', self.output().path)
Example No. 21
 def run(self):
   common.shell_cmd('mkdir -p %s', dirname(self.output().path))
   input_files = glob.glob(self.input().path + '/*.txt')
   parallel.mapreduce(
     parallel.Collection.from_glob(
       input_files, parallel.CSVDictLineInput(delimiter='|',
                                              quoting=csv.QUOTE_NONE,
                                              escapechar='\\')),
     mapper=ClassificationMapper(),
     reducer=parallel.IdentityReducer(),
     output_prefix=self.output().path)
Example No. 22
    def run(self):
        output_dir = self.output().path
        common.shell_cmd("mkdir -p %s", output_dir)
        input_dir = self.input()[0].path
        supplemental_dir = self.input()[1].path
        download_util.extract_and_clean(input_dir, "ISO-8859-1", "UTF-8", "txt")

        # One of the files needs to be remapped from one column (submission_number)
        # to two columns (pma_number and k_number) depending on the prefix.
        file_name = "registration_listing.txt"
        output_file = join(output_dir, "remapped_" + file_name)
        remap_supplemental_files(join(output_dir, file_name), join(supplemental_dir, file_name), output_file)
Example No. 23
 def run(self):
   output_dir = self.output().path
   common.shell_cmd('mkdir -p %s', output_dir)
   date = self.batch
   if date >= CROSSOVER_XML_START_DATE and date <= CROSSOVER_XML_END_DATE:
     url = CROSSOVER_XML_URL
   else:
     url = CURRENT_XML_BASE_URL
   url = url.replace('WEEK', date.strftime('%m%d%Y'))
   file_name = 'enforcementreport.xml'
   xml_file = '%(output_dir)s/%(file_name)s' % locals()
   download_to_file_with_retry(url, xml_file)
Example No. 24
def extract_and_clean(input_dir, source_encoding, target_encoding, file_type):
    ''' A utility function that extracts all of the zip files in a directory and
        converts the files from a source encoding to a target encoding.
    '''
    for zip_filename in glob.glob(input_dir + '/*.zip'):
        txt_name = zip_filename.replace('zip', file_type)
        txt_name = txt_name.replace('raw', 'extracted')
        common.shell_cmd('mkdir -p %s', dirname(txt_name))
        cmd = 'unzip -p %s | iconv -f %s -t %s -c > %s'
        logging.info('Unzipping and converting %s', zip_filename)
        common.shell_cmd(cmd, zip_filename, source_encoding, target_encoding,
                         txt_name)
Example No. 25
 def run(self):
     output_dir = self.output().path
     common.shell_cmd('mkdir -p %s', output_dir)
     date = self.batch
     if date >= CROSSOVER_XML_START_DATE and date <= CROSSOVER_XML_END_DATE:
         url = CROSSOVER_XML_URL
     else:
         url = CURRENT_XML_BASE_URL
     url = url.replace('WEEK', date.strftime('%m%d%Y'))
     file_name = 'enforcementreport.xml'
     xml_file = '%(output_dir)s/%(file_name)s' % locals()
     download_to_file_with_retry(url, xml_file)
Example No. 26
    def run(self):
        sync_path = join(BASE_DIR, self.date_str)
        target_bucket = S3_BASE_BUCKET + '%s/' % self.date_str
        for data_path in self.output():
            s3_cmd = [
                'aws', '--profile',
                config.aws_profile(), 's3', 'sync', sync_path, target_bucket,
                '--exclude "*"', '--include "*.zip"',
                '--include "*schema.json"'
            ]

            common.shell_cmd(' '.join(s3_cmd))
            common.shell_cmd('touch %s', data_path.path)
Example No. 27
    def run(self):
        logging.info('Extracting: %s', (self.input().path))

        extract_dir = dirname(self.input().path)
        gsrs_file_name = os.path.basename(self.input().path)
        gz_filename = os.path.splitext(gsrs_file_name)[0] + ".gz"
        gsrs_file = join(extract_dir, gsrs_file_name)

        gz_file = join(extract_dir, gz_filename)
        os.rename(gsrs_file, gz_file)
        common.shell_cmd('gunzip ' + gz_file)
        os.rename(
            os.path.splitext(gz_file)[0],
            os.path.splitext(gz_file)[0] + ".json")
Example No. 28
  def run(self):
    output_dir = self.output().path
    common.shell_cmd('mkdir -p %s', output_dir)
    input_dir = self.input()[0].path
    supplemental_dir = self.input()[1].path
    download_util.extract_and_clean(input_dir, 'ISO-8859-1', 'UTF-8', 'txt')

    # One of the files needs to be remapped from one column (submission_number)
    # to two columns (pma_number and k_number) depending on the prefix.
    file_name = 'registration_listing.txt'
    output_file = join(output_dir, 'remapped_' + file_name)
    remap_supplemental_files(join(output_dir, file_name),
                             join(supplemental_dir, file_name),
                             output_file)
Example No. 29
  def run(self):
    zip_filename = self.input().path
    output_filename = self.output().path
    output_dir = dirname(output_filename)
    common.shell_cmd('mkdir -p %s' % output_dir)
    cmd = 'unzip -o %(zip_filename)s \
                    -d %(output_dir)s' % locals()
    common.shell_cmd(cmd)

    # UNII filename varies; find and rename to a standardized name.
    # It is now a tab-delimited CSV instead of an XML as before.
    for file in glob.glob(join(output_dir, 'UNII*Names*.txt')):
      logging.info('Renaming %s', file)
      os.rename(file, output_filename)
Example No. 30
    def _run(self):
        shutil.rmtree(self.output().path, ignore_errors=True)
        os.makedirs(self.output().path)
        # Get all of the endpoints served by this index
        # Create an `EndpointExport` object for each endpoint in order to export
        # each endpoint properly.
        #
        # Endpoint exports can be:
        #   date range based (quarterly output)
        #   filter based (index serves many endpoints)
        #   vanilla (endpoint is 1 to 1 with index and it is exported all at once)
        for endpoint, index_name in ENDPOINT_INDEX_MAP.items():
            endpoint_batches = []
            chunks = CUSTOM_CHUNKS.get(endpoint, DEFAULT_CHUNKS)
            if endpoint in RANGE_ENDPOINT_MAP:
                params = RANGE_ENDPOINT_MAP[endpoint]
                params['chunks'] = chunks
                endpoint_batches = _make_date_range_endpoint_batch(
                    endpoint, params)
            elif endpoint in FILTERED_ENPOINT_MAP:
                params = FILTERED_ENPOINT_MAP[endpoint]
                query = EndpointExport.build_term_filter(**params)
                endpoint_batches.append(
                    EndpointExport(endpoint, query=query, chunks=chunks))
            else:
                endpoint_batches.append(EndpointExport(endpoint,
                                                       chunks=chunks))

            # This is a hack to overcome the shortcoming of the parallel library of
            # only having one mapper process for a tiny, single file input. Since we
            # want to execute these endpoint batches in parallel, we write each task
            # to its own file. It will create a mapper for each file.
            for ep in endpoint_batches:
                partition = ep.partition if ep.partition else 'all'

                if 'enforcement' in ep.endpoint:
                    partition = ep.endpoint.replace('enforcement',
                                                    '').replace('/', '')
                elif 'label' in ep.endpoint:
                    partition = ep.endpoint.replace('label',
                                                    '').replace('/', '')

                output_dir = join(self.output().path, index_name)
                common.shell_cmd('mkdir -p %s', output_dir)
                file_name = join(output_dir, partition + '.json')

                with open(file_name, 'w') as json_out:
                    json_dict = json.dumps(ep.__dict__)
                    json_out.write(json_dict + '\n')
Example No. 31
def extract_and_clean(input_dir, source_encoding, target_encoding, file_type):
  ''' A utility function that extracts all of the zip files in a directory and
      converts the files from a source encoding to a target encoding.
  '''
  for zip_filename in glob.glob(input_dir + '/*.zip'):
    txt_name = zip_filename.replace('zip', file_type)
    txt_name = txt_name.replace('raw', 'extracted')
    common.shell_cmd('mkdir -p %s', dirname(txt_name))
    cmd = 'unzip -p %s | iconv -f %s -t %s -c > %s'
    logging.info('Unzipping and converting %s', zip_filename)
    common.shell_cmd(cmd,
                     zip_filename,
                     source_encoding,
                     target_encoding,
                     txt_name)
Example No. 32
    def run(self):
        schema_file = self.get_schemafile()
        assert os.path.exists(
            schema_file
        ), 'No schema file available for index %s' % self.index_name

        es_client = elasticsearch.Elasticsearch(config.es_host())

        endpoints = self.get_endpoints()
        # Get all of the endpoints served by this index
        # Create an `EndpointExport` object for each endpoint in order to export
        # each endpoint properly.
        #
        # Endpoint exports can be:
        #   date range based (quarterly output)
        #   filter based (index serves many endpoints)
        #   vanilla (endpoint is 1 to 1 with index and it is exported all at once)
        endpoint_batches = []
        for endpoint in endpoints:
            chunks = CUSTOM_CHUNKS.get(endpoint, DEFAULT_CHUNKS)
            if endpoint in RANGE_ENDPOINT_MAP:
                params = RANGE_ENDPOINT_MAP[endpoint]
                params['chunks'] = chunks
                endpoint_batches = _make_date_range_endpoint_batch(
                    endpoint, params)
            elif endpoint in FILTERED_ENPOINT_MAP:
                params = FILTERED_ENPOINT_MAP[endpoint]
                query = EndpointExport.build_term_filter(**params)
                endpoint_batches.append(
                    EndpointExport(endpoint, query=query, chunks=chunks))
            else:
                endpoint_batches.append(EndpointExport(endpoint,
                                                       chunks=chunks))

        # Dump each of the `EndpointExport` objects in the list
        for ep in endpoint_batches:
            # The output_dir will be the same for all outputs, once you factor out
            # the endpoint, so we can safely look at the first one only.
            output_dir = dirname(dirname(self.output()[0].path))
            endpoint_dir = join(output_dir, ep.endpoint[1:])
            index_util.dump_index(es_client,
                                  ep.index_name,
                                  ep.endpoint,
                                  join(endpoint_dir, ep.partition),
                                  cleaner=omit_internal_keys,
                                  query=ep.query,
                                  chunks=ep.chunks)
            common.shell_cmd('cp %s %s', schema_file, endpoint_dir)
Example No. 33
  def run(self):
    sync_path = join(BASE_DIR, self.date_str)
    target_bucket =  's3://%s/%s/' % (self.download_bucket, self.date_str)
    s3_cmd = [
      'aws',
      '--profile',
      config.aws_profile(),
      's3',
      'sync',
      sync_path,
      target_bucket,
      '--exclude "*"',
      '--include "*.zip"',
      '--include "*schema.json"']

    common.shell_cmd(' '.join(s3_cmd))
    common.shell_cmd('touch %s', self.output().path)
Example No. 34
    def run(self):
        output_dir = dirname(self.output().path)
        common.shell_cmd('mkdir -p %s', output_dir)

        end = arrow.get(self.batch)
        start = end.shift(days=-6)

        id_list = self._fetch_ids(start, end)
        if len(id_list) >= 500:
            # We try to fetch the entire week's results in one shot, but CDRH recall
            # search caps a single query at 500 results and does not support paging.
            # This is rare, but when it happens we re-retrieve the results day by day.
            for r in arrow.Arrow.range('day', start, end):
                id_list = id_list + self._fetch_ids(r, r)

        df = pd.DataFrame(data=list(set(id_list)), columns=['id'])
        df.to_csv(self.output().path, index=False)
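
The day-by-day fallback above relies on arrow.Arrow.range('day', start, end) being inclusive of both endpoints, so a six-day shift back from the batch date produces seven daily queries. A minimal check with made-up dates:

import arrow

end = arrow.get('2024-01-07')
start = end.shift(days=-6)
# Prints seven dates, 2024-01-01 through 2024-01-07 inclusive.
print([d.format('YYYY-MM-DD') for d in arrow.Arrow.range('day', start, end)])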
Example No. 35
 def map(self, key, value, output):
   es_client = elasticsearch.Elasticsearch(config.es_host(), timeout=120)
   ep = common.ObjectDict(value)
   schema_file = join(SCHEMA_DIR, ep.index_name + '_schema.json')
   endpoint_dir = join(self.output_dir, ep.endpoint[1:])
   target_dir = join(endpoint_dir, ep.partition)
   common.shell_cmd('mkdir -p %s', target_dir)
   index_util.dump_index(es_client,
                         ep.index_name,
                         ep.endpoint,
                         target_dir,
                         cleaner=omit_internal_keys,
                         query=ep.query,
                         chunks=ep.chunks)
   # Copy the current JSON schema to the zip location so that it is included
   # in the sync to s3
   common.shell_cmd('cp %s %s', schema_file, endpoint_dir)
Example No. 36
 def map(self, key, value, output):
     es_client = elasticsearch.Elasticsearch(config.es_host())
     ep = common.ObjectDict(value)
     schema_file = join(SCHEMA_DIR, ep.index_name + '_schema.json')
     endpoint_dir = join(self.output_dir, ep.endpoint[1:])
     target_dir = join(endpoint_dir, ep.partition)
     common.shell_cmd('mkdir -p %s', target_dir)
     index_util.dump_index(es_client,
                           ep.index_name,
                           ep.endpoint,
                           target_dir,
                           cleaner=omit_internal_keys,
                           query=ep.query,
                           chunks=ep.chunks)
     # Copy the current JSON schema to the zip location so that it is included
     # in the sync to s3
     common.shell_cmd('cp %s %s', schema_file, endpoint_dir)
Example No. 37
 def map(self, key, value, output):
     es_client = elasticsearch.Elasticsearch(config.es_host(), timeout=120)
     ep = common.ObjectDict(value)
     schema_file = join(SCHEMA_DIR, ep.index_name + '_schema.json')
     endpoint_dir = join(self.output_dir, ep.endpoint[1:])
     target_dir = join(endpoint_dir, ep.partition)
     common.shell_cmd('mkdir -p %s', target_dir)
     index_util.dump_index(es_client,
                           ep.index_name,
                           ep.endpoint,
                           target_dir,
                           cleaner=omit_internal_keys,
                           query=ep.query,
                           chunks=ep.chunks)
     # Copy the current JSON schema to the zip location so that it is included
     # in the sync to s3. flock is required to avoid a race condition when copying the schema file.
     common.shell_cmd_quiet('flock --verbose %s cp %s %s', schema_file,
                            schema_file, endpoint_dir)
Example No. 38
 def run(self):
     output_file = self.output().path
     input_file = self.input()[1].path
     es = elasticsearch.Elasticsearch(self.es_host)
     index_util.start_index_transaction(es, 'recall', self.epoch)
     parallel.mapreduce(
         input_collection=parallel.Collection.from_sharded(input_file),
         mapper=index_util.LoadJSONMapper(self.es_host,
                                          'recall',
                                          'enforcementreport',
                                          self.epoch,
                                          docid_key='@id',
                                          version_key='@version'),
         reducer=parallel.NullReducer(),
         output_prefix='/tmp/loadjson.recall',
         num_shards=1,
         map_workers=1)
     index_util.commit_index_transaction(es, 'recall')
     common.shell_cmd('touch %s', output_file)
Example No. 39
 def run(self):
   output_file = self.output().path
   input_file = self.input()[1].path
   es = elasticsearch.Elasticsearch(self.es_host)
   index_util.start_index_transaction(es, 'recall', self.epoch)
   parallel.mapreduce(
     input_collection=parallel.Collection.from_sharded(input_file),
     mapper=index_util.LoadJSONMapper(self.es_host,
                                      'recall',
                                      'enforcementreport',
                                      self.epoch,
                                      docid_key='@id',
                                      version_key='@version'),
     reducer=parallel.NullReducer(),
     output_prefix='/tmp/loadjson.recall',
     num_shards=1,
     map_workers=1)
   index_util.commit_index_transaction(es, 'recall')
   common.shell_cmd('touch %s', output_file)
Example No. 40
  def _run(self):
    common.shell_cmd('mkdir -p %s', self.output().path)
    # Get all of the endpoints served by this index
    # Create an `EndpointExport` object for each endpoint in order to export
    # each endpoint properly.
    #
    # Endpoint exports can be:
    #   date range based (quarterly output)
    #   filter based (index serves many endpoints)
    #   vanilla (endpoint is 1 to 1 with index and it is exported all at once)
    for endpoint, index_name in ENDPOINT_INDEX_MAP.items():
      endpoint_batches = []
      chunks = CUSTOM_CHUNKS.get(endpoint, DEFAULT_CHUNKS)
      if endpoint in RANGE_ENDPOINT_MAP:
        params = RANGE_ENDPOINT_MAP[endpoint]
        params['chunks'] = chunks
        endpoint_batches = _make_date_range_endpoint_batch(endpoint, params)
      elif endpoint in FILTERED_ENPOINT_MAP:
        params = FILTERED_ENPOINT_MAP[endpoint]
        query = EndpointExport.build_term_filter(**params)
        endpoint_batches.append(
          EndpointExport(endpoint, query=query, chunks=chunks)
        )
      else:
        endpoint_batches.append(EndpointExport(endpoint, chunks=chunks))

      # This is a hack to overcome the shortcoming of the parallel library of
      # only having one mapper process for a tiny, single file input. Since we
      # want to execute these endpoint batches in parallel, we write each task
      # to its own file. It will create a mapper for each file.
      for ep in endpoint_batches:
        partition = ep.partition if ep.partition else 'all'

        if 'enforcement' in ep.endpoint:
          partition = ep.endpoint.replace('enforcement', '').replace('/', '')

        output_dir = join(self.output().path, index_name)
        common.shell_cmd('mkdir -p %s', output_dir)
        file_name = join(output_dir, partition + '.json')

        with open(file_name, 'w') as json_out:
          json_dict = json.dumps(ep.__dict__)
          json_out.write(json_dict + '\n')
Example No. 41
    def _run(self):
        output_dir = self.output().path
        common.shell_cmd("mkdir -p %s", output_dir)
        change_log = csv.reader(open(self.change_log_file, "r"))
        batches = collections.defaultdict(list)

        for row in change_log:
            spl_id, spl_type, spl_date = row
            # Only grab the human labels for this index
            if spl_type.lower().find("human") != -1:
                # All blank dates to be treated as the week of June 1, 2009
                if not spl_date:
                    spl_date = "20090601120000"
                date = arrow.get(spl_date, "YYYYMMDDHHmmss")
                batches[date.ceil("week")].append(spl_id)

        for batch_date, ids in batches.items():
            batch_file = "%s.ids" % batch_date.format("YYYYMMDD")
            batch_out = open(join(output_dir, batch_file), "w")
            unique_ids = list(set(ids))
            batch_out.write("\n".join(unique_ids))
Example No. 42
    def _run(self):
        output_dir = self.output().path
        common.shell_cmd('mkdir -p %s', output_dir)
        change_log = csv.reader(open(self.change_log_file, 'r'))
        batches = collections.defaultdict(list)

        for row in change_log:
            spl_id, spl_type, spl_date = row
            # Only grab the human labels for this index
            if spl_type.lower().find('human') != -1:
                # All blank dates to be treated as the week of June 1, 2009
                if not spl_date:
                    spl_date = '20090601120000'
                date = arrow.get(spl_date, 'YYYYMMDDHHmmss')
                batches[date.ceil('week')].append(spl_id)

        for batch_date, ids in batches.items():
            batch_file = '%s.ids' % batch_date.format('YYYYMMDD')
            batch_out = open(join(output_dir, batch_file), 'w')
            unique_ids = list(set(ids))
            batch_out.write('\n'.join(unique_ids))
Example No. 43
  def run(self):
    for filename in glob.glob(SPL_S3_DIR + '/*/*.xml'):
      src_dir = dirname(filename)
      barcode_target = join(src_dir, 'barcodes')
      xml_out = join(barcode_target, 'otc-bars.xml')
      json_out = xml_out.replace('.xml', '.json')

      if not os.path.exists(xml_out):
        common.shell_cmd('mkdir -p %s', barcode_target)
        logging.info('Zbarimg on directory %s', src_dir)
        cmd = 'find %(src_dir)s -name "*.jpg" -size +0\
                                -exec zbarimg -q --xml {} \; > \
                    %(xml_out)s' % locals()
        os.system(cmd)

      if common.is_older(json_out, xml_out):
        logging.info('%s does not exist, producing...', json_out)
        process_barcodes.XML2JSON(xml_out)
      else:
        logging.debug('%s already exists, skipping', xml_out)
    common.shell_cmd('touch %s', self.output().path)
Example No. 44
    def run(self):
        for filename in glob.glob(SPL_S3_DIR + '/*/*.xml'):
            src_dir = dirname(filename)
            barcode_target = join(src_dir, 'barcodes')
            xml_out = join(barcode_target, 'otc-bars.xml')
            json_out = xml_out.replace('.xml', '.json')

            if not os.path.exists(xml_out):
                common.shell_cmd('mkdir -p %s', barcode_target)
                logging.info('Zbarimg on directory %s', src_dir)
                cmd = 'find %(src_dir)s -name "*.jpg" -size +0\
                                -exec zbarimg -q --xml {} \; > \
                    %(xml_out)s' % locals()
                os.system(cmd)

            if common.is_older(json_out, xml_out):
                logging.info('%s does not exist, producing...', json_out)
                process_barcodes.XML2JSON(xml_out)
            else:
                logging.debug('%s already exists, skipping', xml_out)
        common.shell_cmd('touch %s', self.output().path)
Example No. 45
  def _run(self):
    output_dir = self.output().path
    common.shell_cmd('mkdir -p %s', output_dir)
    change_log = csv.reader(open(self.change_log_file, 'r'))
    batches = collections.defaultdict(list)

    for row in change_log:
      spl_id, spl_type, spl_date = row
      # Only grab the human labels for this index
      if spl_type.lower().find('human') != -1:
        # All blank dates to be treated as the week of June 1, 2009
        if not spl_date:
          spl_date = '20090601120000'
        date = arrow.get(spl_date, 'YYYYMMDDHHmmss')
        batches[date.ceil('week')].append(spl_id)

    for batch_date, ids in batches.items():
      batch_file = '%s.ids' % batch_date.format('YYYYMMDD')
      batch_out = open(join(output_dir, batch_file), 'w')
      unique_ids = list(set(ids))
      batch_out.write('\n'.join(unique_ids))
Example No. 46
  def run(self):
    output_dir = self.output().path
    common.shell_cmd('mkdir -p %s', output_dir)
    input_dir = self.input()[0].path
    supplemental_dir = self.input()[1].path
    download_util.extract_and_clean(input_dir, 'ISO-8859-1', 'UTF-8', 'txt')

    # One of the files needs to be remapped from one column (submission_number)
    # to two columns (pma_number and k_number) depending on the prefix.
    file_name = 'registration_listing.txt'
    output_file = join(output_dir, 'remapped_' + file_name)
    remap_supplemental_files(join(output_dir, file_name),
                             join(supplemental_dir, file_name),
                             output_file)

    # There are a handful of files with floats for keys
    # This step can be removed once it is fixed on the source system.
    for fix_file in self.problem_files:
      with open(join(output_dir, fix_file), 'r') as needs_fixing:
        lines = needs_fixing.readlines()
      with open(join(output_dir, fix_file), 'w') as gets_fixing:
        for line in lines:
          gets_fixing.write(re.sub(r'\.0', '', line))
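The re.sub above strips a literal '.0' wherever it appears in the line, which is how float-formatted keys such as '2019.0' become '2019'. A one-line illustration with a hypothetical row (note it removes any '.0', not only trailing ones):

import re

print(re.sub(r'\.0', '', '3004521019.0|ACME DEVICES|2019.0'))  # -> 3004521019|ACME DEVICES|2019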
Example No. 47
 def run(self):
   output_dir = self.output().path
   common.shell_cmd('mkdir -p %s', output_dir)
   for i in range(len(self.input())):
     input_dir = self.input()[i].path
     for zip_filename in glob.glob(input_dir + '/*.zip'):
       txt_name = zip_filename.replace('zip', 'txt')
       txt_name = txt_name.replace('raw', 'extracted')
       common.shell_cmd('mkdir -p %s', dirname(txt_name))
       cmd = 'unzip -p %s | iconv -f "ISO-8859-1//TRANSLIT" -t UTF8 -c > %s'
       logging.info('Unzipping and converting %s', zip_filename)
       common.shell_cmd(cmd, zip_filename, txt_name)
Example No. 48
import os
from os.path import dirname, join

import arrow
import simplejson as json

from openfda.tasks import DependencyTriggeredTask
from openfda import common, config, parallel, spl
from openfda.annotation_table import unii_harmonization
from openfda.spl import process_barcodes, extract

RUN_DIR = dirname(dirname(os.path.abspath(__file__)))
data_dir = config.data_dir('harmonization')
BATCH_DATE = arrow.utcnow().ceil('week').format('YYYYMMDD')
BASE_DIR = config.data_dir('harmonization/batches/%s' % BATCH_DATE)
SPL_S3_DIR = config.data_dir('spl/s3_sync')
TMP_DIR = config.tmp_dir()

common.shell_cmd('mkdir -p %s', data_dir)
common.shell_cmd('mkdir -p %s', BASE_DIR)
common.shell_cmd('mkdir -p %s', TMP_DIR)

SPL_SET_ID_INDEX = join(BASE_DIR, 'spl_index.db')
DAILYMED_PREFIX = 'ftp://public.nlm.nih.gov/nlmdata/.dailymed/'

PHARM_CLASS_DOWNLOAD = \
  DAILYMED_PREFIX + 'pharmacologic_class_indexing_spl_files.zip'

RXNORM_DOWNLOAD = \
  DAILYMED_PREFIX + 'rxnorm_mappings.zip'

NDC_DOWNLOAD_PAGE = \
  'http://www.fda.gov/drugs/informationondrugs/ucm142438.htm'
Example No. 49
    def run(self):
        input_dir = join(self.input().path, 'events')
        output_dir = self.output().path
        fh_dict = {}
        common.shell_cmd('mkdir -p %s', output_dir)

        # Headers need to be written to the start of each partition. The patient
        # and foitext headers are manually created. The headers for mdrfoi and
        # foidev are detected from the source and placed into the header dictionary.
        header = {}
        header['patient'] = PATIENT_KEYS
        header['foitext'] = TEXT_KEYS
        header['mdrfoi'] = MDR_KEYS
        header['foidev'] = DEVICE_KEYS

        for i in range(PARTITIONS):
            for category in CATEGORIES:
                filename = str(i) + '.' + category + '.txt'
                filename = join(output_dir, filename)
                logging.info('Creating file handles for writing %s', filename)
                output_handle = open(filename, 'w')
                csv_writer = csv.writer(output_handle, delimiter='|')
                csv_writer.writerow(header[category])
                fh_dict[category + str(i)] = output_handle

        # Because we download all zips from the site, we need to ignore some of the
        # files for the partitioning process. Remove if files are excluded from
        # download.
        for filename in glob.glob(input_dir + '/*.txt'):
            logging.info('Processing: %s', filename)
            skip = False
            for ignore in IGNORE_FILES:
                if ignore in filename:
                    skip = True

            if skip:
                logging.info('Skipping: %s', filename)
                continue

            for category in CATEGORIES:
                if category in filename:
                    file_category = category

            # MAUDE files do not escape quote characters, we just hope that no
            # pipe characters occur in records...
            file_handle = csv.reader(open(filename, 'r'),
                                     quoting=csv.QUOTE_NONE,
                                     delimiter='|')
            partioned = collections.defaultdict(list)
            for i, row in enumerate(file_handle):
                # skip header rows
                if (i == 0) and ('MDR_REPORT_KEY' in row): continue

                # Only work with rows that have data and the first column is a number
                if row and row[0].isdigit():
                    partioned[int(row[0]) % PARTITIONS].append(row)
                else:
                    logging.warn('Skipping row: %s', row)

            for partnum, rows in partioned.iteritems():
                output_handle = fh_dict[file_category + str(partnum)]
                csv_writer = csv.writer(output_handle, delimiter='|')
                logging.info('Writing: %s %s %s %s', partnum,
                             file_category + str(partnum), output_handle,
                             len(rows))
                csv_writer.writerows(rows)
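
The partitioning above assigns every row to int(row[0]) % PARTITIONS, so rows from the different MAUDE files that share an MDR_REPORT_KEY always land in the same partition, which is what lets join_maude.join_maude join them partition by partition. A tiny sketch with a made-up partition count:

PARTITIONS = 4  # hypothetical value for illustration

rows = [['1234567', 'mdrfoi row'], ['1234567', 'patient row'], ['7654321', 'foitext row']]
for row in rows:
    # The two '1234567' rows print the same partition number.
    print(int(row[0]) % PARTITIONS, row)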
Example No. 50
 def run(self):
   output_dir = self.output().path
   common.shell_cmd('mkdir -p %s', output_dir)
   input_dir = self.local_dir
   for zip_filename in glob.glob(input_dir + '/*.zip'):
     common.shell_cmd('unzip -ou %s -d %s', zip_filename, output_dir)
Example No. 51
 def run(self):
   output_dir = self.output().path
   common.shell_cmd('mkdir -p %s', output_dir)
   input_dir = self.input().path
   download_util.extract_and_clean(input_dir, 'ISO-8859-1', 'UTF-8', 'txt')
Example No. 52
import os
from os.path import dirname, join

import elasticsearch
import luigi
import requests
import simplejson as json

from openfda import common, config, download_util, elasticsearch_requests, index_util, parallel

from openfda.device_harmonization.pipeline import (Harmonized2OpenFDA,
  DeviceAnnotateMapper)
from openfda.device_pma import transform
from openfda.tasks import AlwaysRunTask

RUN_DIR = dirname(dirname(os.path.abspath(__file__)))
# A directory for holding files that track Task state
META_DIR = config.data_dir('device_pma/meta')
common.shell_cmd('mkdir -p %s', META_DIR)

DEVICE_PMA_ZIP = 'https://www.accessdata.fda.gov/premarket/ftparea/pma.zip'

class DownloadPMA(luigi.Task):
  def requires(self):
    return []

  def output(self):
    return luigi.LocalTarget(config.data_dir('device_pma/raw'))

  def run(self):
    output_filename = join(self.output().path, DEVICE_PMA_ZIP.split('/')[-1])
    common.download(DEVICE_PMA_ZIP, output_filename)

class ExtractAndCleanDownloadsPMA(luigi.Task):
Example No. 53
import csv
import re
import logging
import os
from os.path import dirname, join

import arrow
import datetime
import luigi

from openfda import common, config, index_util, parallel

RUN_DIR = dirname(dirname(os.path.abspath(__file__)))
BASE_DIR = config.data_dir('caers')
common.shell_cmd('mkdir -p %s', BASE_DIR)

S3_BUCKET = 's3://openfda-data-caers/'
S3_LOCAL_DIR = config.data_dir('caers/s3_sync')
# TODO(hansnelsen): initiate and resolve naming convention for this file and
#                   s3 bucket. Currently, the file is downloaded from
#                   s3://openfda-lonnie/caers/ (the naming of this file is
#                   not consistent). The pipeline engineer downloads it, renames
#                   it and then uploaded manually to the above bucket.
CAERS_FILE = 'caers.csv'
logging.info('%s dir', S3_LOCAL_DIR)
common.shell_cmd('mkdir -p %s', S3_LOCAL_DIR)

RENAME_MAP = {
  'Report #': 'report_number',
  'Created Date': 'date_created',
  'Event Start Date': 'date_started',
Example No. 54
 def test_shell_cmd(self):
   tmpFile = '/tmp/'+(''.join(random.choice(string.ascii_uppercase + string.digits) for _ in range(32)))
   common.shell_cmd('touch %(tmpFile)s' % locals())
   assert len(common.shell_cmd('ls %(tmpFile)s' % locals())) > 0
   assert common.shell_cmd('ls %(tmpFile)s' % locals()).startswith(tmpFile)
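
The test above shows the calling convention of common.shell_cmd used throughout these examples: a printf-style format string, optional arguments, and a string of the command's stdout coming back. The real openfda helper is not reproduced here; a minimal stand-in with the same convention, assuming it shells out via subprocess, could look like:

import subprocess

def shell_cmd(fmt, *args):
    # Hypothetical stand-in: format the command printf-style, run it through the
    # shell, and return its stdout as text. The actual openfda implementation may differ.
    cmd = fmt % args if args else fmt
    return subprocess.check_output(cmd, shell=True).decode('utf-8')

shell_cmd('mkdir -p %s', '/tmp/openfda-example')
print(shell_cmd('ls -d %s', '/tmp/openfda-example'))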
Example No. 55
  def run(self):
    input_dir = join(self.input().path, 'events')
    output_dir = self.output().path
    fh_dict = {}
    common.shell_cmd('mkdir -p %s', output_dir)

    # Headers need to be written to the start of each partition. The patient
    # and foitext headers are manually created. The headers for mdrfoi and
    # foidev are detected from the source and placed into the header dictionary.
    header = {}
    header['patient'] = PATIENT_KEYS
    header['foitext'] = TEXT_KEYS
    header['mdrfoi'] = MDR_KEYS
    header['foidev'] = DEVICE_KEYS

    for i in range(PARTITIONS):
      for category in CATEGORIES:
        filename = str(i) + '.' + category + '.txt'
        filename = join(output_dir, filename)
        logging.info('Creating file handles for writing %s', filename)
        output_handle = open(filename, 'w')
        csv_writer = csv.writer(output_handle, delimiter='|')
        csv_writer.writerow(header[category])
        fh_dict[category + str(i)] = output_handle

    # Because we download all zips from the site, we need to ignore some of the
    # files for the partitioning process. Remove if files are excluded from
    # download.
    for filename in glob.glob(input_dir + '/*.txt'):
      logging.info('Processing: %s', filename)
      skip = False
      for ignore in IGNORE_FILES:
        if ignore in filename:
          skip = True

      if skip:
        logging.info('Skipping: %s', filename)
        continue

      for category in CATEGORIES:
        if category in filename:
          file_category = category

      # MAUDE files do not escape quote characters, we just hope that no
      # pipe characters occur in records...
      file_handle = csv.reader(open(filename, 'r'),
                               quoting=csv.QUOTE_NONE,
                               delimiter='|')
      partioned = collections.defaultdict(list)
      for i, row in enumerate(file_handle):
        # skip header rows
        if (i == 0) and ('MDR_REPORT_KEY' in row): continue

        # Only work with rows that have data and the first column is a number
        if row and row[0].isdigit():
          partioned[int(row[0]) % PARTITIONS].append(row)
        else:
          logging.warn('Skipping row: %s', row)

      for partnum, rows in partioned.iteritems():
        output_handle = fh_dict[file_category + str(partnum)]
        csv_writer = csv.writer(output_handle, delimiter='|')
        logging.info('Writing: %s %s %s %s',
                     partnum,
                     file_category + str(partnum),
                     output_handle,
                     len(rows))
        csv_writer.writerows(rows)
Example No. 56
import os
from os.path import dirname, join

from bs4 import BeautifulSoup
import elasticsearch
import luigi
import pandas
import requests
import simplejson as json
import urllib2

from openfda import common, config, elasticsearch_requests, index_util, parallel
from openfda import download_util
from openfda.index_util import AlwaysRunTask, ResetElasticSearch
from openfda.device_harmonization.pipeline import Harmonized2OpenFDA, DeviceAnnotateMapper

RUN_DIR = dirname(dirname(os.path.abspath(__file__)))
BASE_DIR = "./data/registration"
common.shell_cmd("mkdir -p %s", BASE_DIR)
# A directory for holding files that track Task state
META_DIR = join(BASE_DIR, "meta")
common.shell_cmd("mkdir -p %s", META_DIR)

DEVICE_REG_PAGE = (
    "http://www.fda.gov/MedicalDevices/"
    "DeviceRegulationandGuidance/HowtoMarketYourDevice/"
    "RegistrationandListing/ucm134495.htm"
)

S3_BUCKET = "s3://openfda-data-reglist/"
S3_LOCAL_DIR = join(BASE_DIR, "s3_sync")

common.shell_cmd("mkdir -p %s", S3_LOCAL_DIR)
Example No. 57
import os
from os.path import dirname, join

import arrow
import elasticsearch
import luigi

from openfda import common, elasticsearch_requests, index_util, parallel
from openfda.annotation_table.pipeline import CombineHarmonization
from openfda.index_util import AlwaysRunTask
from openfda.spl import annotate
from openfda.parallel import IdentityReducer


RUN_DIR = dirname(dirname(os.path.abspath(__file__)))
BASE_DIR = './data'
META_DIR = join(BASE_DIR, 'spl/meta')
# Ensure meta directory is available for task tracking
common.shell_cmd('mkdir -p %s', META_DIR)

SPL_JS = join(RUN_DIR, 'spl/spl_to_json.js')
LOINC = join(RUN_DIR, 'spl/data/sections.csv')

SPL_S3_BUCKET = 's3://openfda.spl.data/data/'
SPL_S3_LOCAL_DIR = join(BASE_DIR, 'spl/s3_sync')
SPL_S3_CHANGE_LOG = join(SPL_S3_LOCAL_DIR, 'change_log/SPLDocuments.csv')
SPL_BATCH_DIR = join(META_DIR, 'batch')
SPL_PROCESS_DIR = join(BASE_DIR, 'spl/batches')

common.shell_cmd('mkdir -p %s', SPL_S3_LOCAL_DIR)
common.shell_cmd('mkdir -p %s', SPL_PROCESS_DIR)

ES_HOST = luigi.Parameter('localhost:9200', is_global=True)
SPL_S3_PROFILE = luigi.Parameter(default='openfda', is_global=True)
Example No. 58
import os
from os.path import dirname, join

import arrow
import elasticsearch
import luigi

from openfda import config, common, elasticsearch_requests, index_util, parallel
from openfda.annotation_table.pipeline import CombineHarmonization
from openfda.tasks import AlwaysRunTask
from openfda.spl import annotate
from openfda.parallel import IdentityReducer


RUN_DIR = dirname(dirname(os.path.abspath(__file__)))
META_DIR = config.data_dir("spl/meta")
# Ensure meta directory is available for task tracking
common.shell_cmd("mkdir -p %s", META_DIR)

SPL_JS = join(RUN_DIR, "spl/spl_to_json.js")
LOINC = join(RUN_DIR, "spl/data/sections.csv")

SPL_S3_BUCKET = "s3://openfda-data-spl/data/"
SPL_S3_LOCAL_DIR = config.data_dir("spl/s3_sync")
SPL_S3_CHANGE_LOG = join(SPL_S3_LOCAL_DIR, "change_log/SPLDocuments.csv")
SPL_BATCH_DIR = join(META_DIR, "batch")
SPL_PROCESS_DIR = config.data_dir("spl/batches")

common.shell_cmd("mkdir -p %s", SPL_S3_LOCAL_DIR)
common.shell_cmd("mkdir -p %s", SPL_PROCESS_DIR)


class SyncS3SPL(luigi.Task):