Example #1
    def run(self):
        # Read the entire packaging DB in memory for speed.
        package_db = parallel.ShardedDB.open(self.input()[0].path).as_dict()

        parallel.mapreduce(
            parallel.Collection.from_sharded(self.input()[1].path),
            mapper=ProductAndPackagingMergingMapper(package_db=package_db),
            reducer=parallel.IdentityReducer(),
            output_prefix=self.output().path,
            num_shards=1)
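
Example #1 hands the mapper an in-memory dict built from a ShardedDB. For context, a minimal sketch of what such a mapper could look like, assuming the openFDA parallel.Mapper interface of map(self, key, value, output) emitting via output.add(key, value); the class name and the 'packaging' field are illustrative, not taken from the source:

    class ExamplePackagingMergeMapper(parallel.Mapper):
        def __init__(self, package_db):
            # package_db is a plain dict built via ShardedDB.open(...).as_dict()
            self.package_db = package_db

        def map(self, key, value, output):
            # Attach any packaging records that share this product's key.
            value['packaging'] = self.package_db.get(key, [])
            output.add(key, value)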
Example #2
 def run(self):
     harmonized_db = parallel.ShardedDB.open(self.input()[0].path).as_dict()
     json_glob = glob.glob(self.input()[1].path + '/*.json')
     parallel.mapreduce(
         parallel.Collection.from_glob(json_glob, parallel.JSONLineInput()),
         mapper=MaudeAnnotationMapper(harmonized_db=harmonized_db),
         reducer=parallel.IdentityReducer(),
         output_prefix=self.output().path,
         num_shards=10,
         map_workers=5)
Example #3
 def run(self):
     input_dir = self.input().path
     for xml_filename in glob.glob('%(input_dir)s/*.xml' % locals()):
         parallel.mapreduce(input_collection=parallel.Collection.from_glob(
             xml_filename, parallel.XMLDictInput),
                            mapper=XML2JSONMapper(),
                            reducer=parallel.IdentityReducer(),
                            output_prefix=self.output().path,
                            num_shards=1,
                            map_workers=1)
Example #4
    def run(self):
        input_db = self.input()[0].path
        harmonized_file = self.input()[1].path

        parallel.mapreduce(parallel.Collection.from_sharded(input_db),
                           mapper=annotate.AnnotateMapper(harmonized_file),
                           reducer=parallel.IdentityReducer(),
                           output_prefix=self.output().path,
                           num_shards=4,
                           map_workers=1)
Example #5
  def run(self):
    input_db = self.input()[0].path
    harmonized_file = self.input()[1].path

    parallel.mapreduce(
      parallel.Collection.from_sharded(input_db),
      mapper=AnnotateMapper(harmonized_file),
      reducer=parallel.IdentityReducer(),
      output_prefix=self.output().path,
      num_shards=1) # TODO: improve the code to avoid having to limit number of shards to one
Example #6
 def run(self):
     input_dir = dirname(self.input().path)
     for csv_filename in glob.glob('%(input_dir)s/clean-*.csv' % locals()):
         parallel.mapreduce(parallel.Collection.from_glob(
             csv_filename, parallel.CSVDictLineInput()),
                            mapper=CSV2JSONMapper(),
                            reducer=parallel.IdentityReducer(),
                            output_prefix=self.output().path,
                            num_shards=1,
                            map_workers=8)
Example #7
 def run(self):
     csv2json_db = parallel.ShardedDB.open(self.input()[1].path).as_dict()
     parallel.mapreduce(
         parallel.Collection.from_glob(self.input()[0].path,
                                       parallel.CSVDictLineInput()),
         mapper=RecallDownloaderAndMapper(csv2json_db=csv2json_db),
         reducer=parallel.IdentityReducer(),
         output_prefix=self.output().path,
         map_workers=10,
         num_shards=10)  # Do not hit fda.gov too hard here.
Example #8
    def run(self):
        with open(join(EXTRACTED_DIR, 'MarketingStatus_Lookup.txt')) as fin:
            rows = (line.split('\t') for line in fin)
            doc_lookup = {row[0]: row[1] for row in rows}

        parallel.mapreduce(parallel.Collection.from_glob(
            join(self.input().path, 'TE.txt'),
            parallel.CSVDictLineInput(delimiter='\t')),
                           mapper=TE2JSONMapper(doc_lookup=doc_lookup),
                           reducer=parallel.IdentityReducer(),
                           output_prefix=self.output().path)
Example #9
    def run(self):
        input_shards = []
        input_dir = self.input().path
        for xml_filename in glob.glob(input_dir + '/*.xml'):
            input_shards.append(xml_filename)

        parallel.mapreduce(parallel.Collection.from_list(input_shards),
                           mapper=XML2JSONMapper(),
                           reducer=parallel.IdentityReducer(),
                           output_prefix=self.output().path,
                           num_shards=len(input_shards))
Example #10
 def run(self):
     common.shell_cmd('mkdir -p %s', dirname(self.output().path))
     input_files = glob.glob(self.input().path + '/*.txt')
     parallel.mapreduce(parallel.Collection.from_glob(
         input_files,
         parallel.CSVDictLineInput(delimiter='|',
                                   quoting=csv.QUOTE_NONE,
                                   escapechar='\\')),
                        mapper=ClassificationMapper(),
                        reducer=parallel.IdentityReducer(),
                        output_prefix=self.output().path)
Example #11
 def _run(self):
     json_dir = self.input()['data'].path
     input_glob = glob.glob(json_dir + '/*.json')
     for file_name in input_glob:
         logging.info('Running file %s', file_name)
         parallel.mapreduce(
             parallel.Collection.from_glob(file_name,
                                           parallel.JSONLineInput()),
             mapper=index_util.ReloadJSONMapper(config.es_host(),
                                                self.index_name, 'maude'),
             reducer=parallel.IdentityReducer(),
             output_format=parallel.NullOutput(),
             output_prefix='/tmp/loadjson.' + self.index_name)
Example #12
    def test_identity(self):
        os.system('rm -rf /tmp/test-identity*')
        source_files = ['/tmp/test-identity-%d' % i for i in range(10)]
        for f in source_files:
            os.system('touch "%s"' % f)

        source = parallel.Collection(source_files, parallel.FilenameInput)
        parallel.mapreduce(source, parallel.IdentityMapper(),
                           parallel.IdentityReducer(), '/tmp/test-identity', 2)

        results = sorted(list(parallel.ShardedDB.open('/tmp/test-identity/')))
        for i in range(10):
            key, value = results[i]
            assert key == '/tmp/test-identity-%d' % i, results[i]
            assert value == ''
Example #13
    def run(self):
        input_shards = []
        input_dir = self.input().path
        for subdir, dirs, files in os.walk(input_dir):
            for file in files:
                if file.endswith('.xml'):
                    if file not in NULLIFIED:
                        input_shards.append(os.path.join(subdir, file))
                    else:
                        logging.info('Skipping a nullified case: %s', file)

        parallel.mapreduce(parallel.Collection.from_list(input_shards),
                           mapper=XML2JSONMapper(),
                           reducer=parallel.IdentityReducer(),
                           output_prefix=self.output().path)
Example #14
  def run(self):
    applications_db = self.input()[0].path
    products_db = self.input()[1].path
    applications_docs_db = self.input()[2].path
    submissions_db = self.input()[3].path
    submissions_property_type_db = self.input()[4].path
    marketing_status = self.input()[5].path
    te_db = self.input()[6].path

    parallel.mapreduce(
      parallel.Collection.from_sharded(applications_db),
      mapper=MergeAllMapper(applications_db, products_db, applications_docs_db, submissions_db,
                            submissions_property_type_db, marketing_status, te_db),
      reducer=parallel.IdentityReducer(),
      output_prefix=self.output().path,
      map_workers=1,
      num_shards=1) # TODO: improve the code to avoid having to limit number of shards to one
Example #15
    def _run(self):
        json_dir = self.input()['data'].path

        mapper = LoadJSONMapper(config.es_host(),
                                index_name=self.index_name,
                                type_name=self.type_name,
                                docid_key=self.docid_key,
                                incremental=self.use_checksum)

        parallel.mapreduce(parallel.Collection.from_sharded(json_dir),
                           mapper=mapper,
                           reducer=parallel.IdentityReducer(),
                           output_format=parallel.NullOutput(),
                           map_workers=self.load_json_workers,
                           output_prefix=config.tmp_dir('%s/load-json' %
                                                        self.index_name))

        # update metadata index
        elasticsearch_requests.update_process_datetime(
            config.es_client(), self.index_name,
            arrow.utcnow().format('YYYY-MM-DD'))
Example #16
    def run(self):
        import sys
        import csv
        maxInt = sys.maxsize
        decrement = True

        while decrement:
            # decrease the maxInt value by factor 10
            # as long as the OverflowError occurs.

            decrement = False
            try:
                csv.field_size_limit(maxInt)
            except OverflowError:
                maxInt = int(maxInt / 10)
                decrement = True
        parallel.mapreduce(parallel.Collection.from_glob(
            self.input().path, parallel.CSVDictLineInput(delimiter='\t')),
                           mapper=NDC2JSONMapper(),
                           reducer=parallel.IdentityReducer(),
                           output_prefix=self.output().path,
                           num_shards=1)
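
The while loop in Example #16 walks csv.field_size_limit down from sys.maxsize until the csv module accepts the value (it raises OverflowError for values that do not fit in a C long). The same idea as a small standalone helper, offered only as a sketch and not part of the source:

    import csv
    import sys

    def raise_csv_field_size_limit():
        # Try the largest value first, then divide by 10 until csv accepts it.
        max_int = sys.maxsize
        while True:
            try:
                csv.field_size_limit(max_int)
                return max_int
            except OverflowError:
                max_int = int(max_int / 10)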
Example #17
 def run(self):
     parallel.mapreduce(parallel.Collection.from_glob(
         self.input().path, parallel.CSVDictLineInput()),
                        mapper=NSDE2JSONMapper(),
                        reducer=parallel.IdentityReducer(),
                        output_prefix=self.output().path)
Example #18
 def run(self):
   mapreduce(Collection.from_sharded(self.input().path),
     mapper=Harmonized2OpenFDAMapper(),
     reducer=parallel.IdentityReducer(),
     output_prefix=self.output().path,
     num_shards=1)
Example #19
 def run(self):
   parallel.mapreduce(
     parallel.Collection.from_glob(self.batch, parallel.LineInput),
     mapper=SPL2JSONMapper(),
     reducer=parallel.IdentityReducer(),
     output_prefix=self.output().path)
Example #20
 def run(self):
     parallel.mapreduce(parallel.Collection.from_glob(
         self.input().path, parallel.JSONLineInputUnicode()),
                        mapper=SubstanceData2JSONMapper(),
                        reducer=parallel.IdentityReducer(),
                        output_prefix=self.output().path)
Example #21
 def run(self):
   parallel.mapreduce(
     parallel.Collection.from_sharded(self.input()[1].path),
     mapper=UpcMapper(spl_s3_dir=SPL_S3_DIR),
     reducer=parallel.IdentityReducer(),
     output_prefix=self.output().path)
Example #22
 def run(self):
     parallel.mapreduce(parallel.Collection.from_sharded(self.input().path),
                        mapper=CurrentSPLMapper(),
                        reducer=parallel.IdentityReducer(),
                        output_prefix=self.output().path)
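
Taken together, the examples share one pattern: build a parallel.Collection from the task's input (a glob, a sharded DB, or a list of files), run parallel.mapreduce with a task-specific mapper and parallel.IdentityReducer, and write shards under self.output().path. A distilled sketch of that pattern, using a hypothetical mapper name and assuming the same openfda parallel API shown above:

    def run(self):
        # Hypothetical task body: CSV rows in, one record out per row.
        collection = parallel.Collection.from_glob(
            self.input().path, parallel.CSVDictLineInput())
        parallel.mapreduce(
            collection,
            mapper=MyRecordMapper(),  # illustrative name, not from the source
            reducer=parallel.IdentityReducer(),
            output_prefix=self.output().path,
            num_shards=4,
            map_workers=4)

Downstream tasks can then read the result back as a dict, as Examples #1, #2 and #7 do, with parallel.ShardedDB.open(path).as_dict().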