Example #1
  def mr_schema_gen(self):

    hdfs_data_folder = "%s/%s/data" % (HDFS_PATH, self.collection_name)
    hdfs_mr_output_folder = "%s/%s/schema_gen/output" % (HDFS_PATH, self.collection_name)

    # delete folders
    execute("hadoop fs -rm -r -f %s" % hdfs_data_folder)
    execute("hadoop fs -rm -r -f %s" % hdfs_mr_output_folder)

    # copy extracted files to hdfs data folder
    execute("hadoop fs -mkdir -p %s" % hdfs_data_folder)
    for extract_file_name in self.extract_file_names:
      execute("hadoop fs -copyFromLocal %s %s/" % (extract_file_name, hdfs_data_folder))

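    # build and run the Hadoop Streaming job; the reducer is given the MongoDB
    # uri/db/collection where the generated schema is written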
    hadoop_command = """hadoop jar %s \
                              -D mapred.job.name="onefold-mongo-generate-schema" \
                              %s \
                              -input %s -output %s \
                              -mapper 'json/generate-schema-mapper.py' \
                              -reducer 'json/generate-schema-reducer.py %s/%s/%s' \
                              -file json/generate-schema-mapper.py \
                              -file json/generate-schema-reducer.py
    """ % (HADOOP_MAPREDUCE_STREAMING_LIB, MAPREDUCE_PARAMS_STR, hdfs_data_folder,
           hdfs_mr_output_folder, self.mongo_uri,
           self.schema_db_name, self.schema_collection_name)
    execute(hadoop_command)
Example #2
    def mr_schema_gen(self):

        hdfs_data_folder = "%s/%s/data" % (HDFS_PATH, self.collection_name)
        hdfs_mr_output_folder = "%s/%s/schema_gen/output" % (
            HDFS_PATH, self.collection_name)

        # delete folders
        execute("hadoop fs -rm -r -f %s" % hdfs_data_folder)
        execute("hadoop fs -rm -r -f %s" % hdfs_mr_output_folder)

        # copy extracted files to hdfs data folder
        execute("hadoop fs -mkdir -p %s" % hdfs_data_folder)
        for extract_file_name in self.extract_file_names:
            execute("hadoop fs -copyFromLocal %s %s/" %
                    (extract_file_name, hdfs_data_folder))

        hadoop_command = """hadoop jar %s \
                              -D mapred.job.name="onefold-mongo-generate-schema" \
                              %s \
                              -input %s -output %s \
                              -mapper 'json/generate-schema-mapper.py' \
                              -reducer 'json/generate-schema-reducer.py %s/%s/%s' \
                              -file json/generate-schema-mapper.py \
                              -file json/generate-schema-reducer.py
    """ % (HADOOP_MAPREDUCE_STREAMING_LIB, MAPREDUCE_PARAMS_STR,
           hdfs_data_folder, hdfs_mr_output_folder, self.mongo_uri,
           self.schema_db_name, self.schema_collection_name)
        execute(hadoop_command)
Example #3
  def mr_schema_gen(self):

    hdfs_data_folder = "%s/%s/data" % (CLOUD_STORAGE_PATH, self.collection_name)
    hdfs_mr_output_folder = "%s/%s/schema_gen/output" % (CLOUD_STORAGE_PATH, self.collection_name)

    # delete folders
    self.cs.rmdir(hdfs_data_folder)
    self.cs.rmdir(hdfs_mr_output_folder)
    

    # copy extracted files to hdfs data folder
    self.cs.mkdir(hdfs_data_folder)

    for extract_file_name in self.extract_file_names:
      self.cs.copy_from_local(extract_file_name, hdfs_data_folder)

    hadoop_command = """hadoop jar %s \
                              -D mapred.job.name="onefold-mongo-generate-schema" \
                              %s \
                              -input %s -output %s \
                              -mapper 'json/generate-schema-mapper.py' \
                              -reducer 'json/generate-schema-reducer.py %s/%s/%s' \
                              -file json/generate-schema-mapper.py \
                              -file json/generate-schema-reducer.py
    """ % (HADOOP_MAPREDUCE_STREAMING_LIB, MAPREDUCE_PARAMS_STR, hdfs_data_folder,
           hdfs_mr_output_folder, self.mongo_uri,
           self.schema_db_name, self.schema_collection_name)
    execute(hadoop_command)
Example #4
    def mr_schema_gen(self):

        hdfs_data_folder = "%s/%s/data" % (CLOUD_STORAGE_PATH,
                                           self.collection_name)
        hdfs_mr_output_folder = "%s/%s/schema_gen/output" % (
            CLOUD_STORAGE_PATH, self.collection_name)

        # delete folders
        self.cs.rmdir(hdfs_data_folder)
        self.cs.rmdir(hdfs_mr_output_folder)

        # copy extracted files to hdfs data folder
        self.cs.mkdir(hdfs_data_folder)

        for extract_file_name in self.extract_file_names:
            self.cs.copy_from_local(extract_file_name, hdfs_data_folder)

        hadoop_command = """hadoop jar %s \
                              -D mapred.job.name="onefold-mongo-generate-schema" \
                              %s \
                              -input %s -output %s \
                              -mapper 'json/generate-schema-mapper.py' \
                              -reducer 'json/generate-schema-reducer.py %s/%s/%s' \
                              -file json/generate-schema-mapper.py \
                              -file json/generate-schema-reducer.py
    """ % (HADOOP_MAPREDUCE_STREAMING_LIB, MAPREDUCE_PARAMS_STR,
           hdfs_data_folder, hdfs_mr_output_folder, self.mongo_uri,
           self.schema_db_name, self.schema_collection_name)
        execute(hadoop_command)
Example #5
 def delete_table(self, database_name, table_name):
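   # drop the table itself, then every child table reported by list_tables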
   command = "bq --project_id %s rm -f %s.%s" % (self.project_id, database_name, table_name)
   execute(command, ignore_error=True)
     
   child_table_names = self.list_tables(database_name, table_name)
   for child_table_name in child_table_names:
     command = "bq --project_id %s rm -f %s.%s" % (self.project_id, database_name, child_table_name)
     execute(command, ignore_error=True)
Example #6
    def rmdir(self, path):
        
        print 'rmdir: %s' % (path)
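        # normalize the path to a folder prefix and delete it recursively from the
        # bucket; errors (e.g. the prefix does not exist) are ignored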

        if not path.endswith("/"):
            path = path + "/"
        
        command = "gsutil -m rm -rf gs://%s/%s" % (self.bucket_id, path)
        execute(command, ignore_error=True)
Example #7
    def rmdir(self, path):

        print 'rmdir: %s' % (path)

        if not path.endswith("/"):
            path = path + "/"

        command = "gsutil -m rm -rf gs://%s/%s" % (self.bucket_id, path)
        execute(command, ignore_error=True)
Example #8
  def create_table(self, database_name, table_name, schema_fields, process_array = "child_table"):
    
    table_columns = {}

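    # Group schema fields into per-table column lists: repeated fields become a child
    # table with a single "value" column (or are skipped), dotted keys become either
    # child-table columns or inline string columns depending on process_array, and
    # plain keys stay on the parent table. Child tables also get hash code columns.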
    for field in schema_fields:
      data_type = field['data_type']

      # ignore record
      if field['data_type'] in ('record',):
        continue

      if data_type is not None:
        if field['mode'] == 'repeated':
          if process_array == "child_table":
            child_table_name = table_name + "_" + re.sub("[^0-9a-zA-Z_]", '_', field['key']).lower()
            column_name = "value"
          else:
            continue
        else:
          if "." in field['key']:
            if process_array == "child_table":
              child_table_name = table_name + "_" + re.sub("[^0-9a-zA-Z_]", '_', field['key'].rsplit(".",1)[0]).lower()
              column_name = field['key'].rsplit(".",1)[1]
              print "  Child Table column:" + column_name
            else:
              child_table_name = table_name
              column_name = field['key'].split(".",1)[0]
              data_type = "string"
              print "  Inline column:" + column_name
          else:
            child_table_name = table_name
            column_name = field['key']

        if child_table_name not in table_columns:
          table_columns[child_table_name] = []
          if child_table_name != table_name:
            table_columns[child_table_name].append({"name": "parent_hash_code", "type": "string", "mode": "nullable"})
            table_columns[child_table_name].append({"name": "hash_code", "type": "string", "mode": "nullable"})

        table_columns[child_table_name].append({"name": column_name, "type": data_type, "mode": "nullable"})

    for table_name, columns in table_columns.iteritems():

      # create schema file
      schema_file_name = table_name + "_schema.json"
      schema_json = json.dumps(columns)
      schema_file = open(schema_file_name, "w")
      schema_file.write(schema_json)
      schema_file.close()

      # execute create-table command
      command = "bq --project_id %s mk --schema %s %s.%s" % (self.project_id, schema_file_name,
                                                             database_name, table_name)
      execute(command)

    return table_columns.keys()
Example #9
    def delete_table(self, database_name, table_name):
        command = "bq --project_id %s rm -f %s.%s" % (
            self.project_id, database_name, table_name)
        execute(command, ignore_error=True)

        child_table_names = self.list_tables(database_name, table_name)
        for child_table_name in child_table_names:
            command = "bq --project_id %s rm -f %s.%s" % (
                self.project_id, database_name, child_table_name)
            execute(command, ignore_error=True)
Example #10
    def copy_from_local(self, source_local_file_path, dest_path):

        print 'copy_from_local: %s %s' % (source_local_file_path, dest_path)
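        # append the file's base name to dest_path to form the destination object,
        # then copy it with gsutil; the copy is retried on failure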
        
        if not dest_path.endswith("/"):
            dest_path = dest_path + "/"
            
        dest_path = dest_path + source_local_file_path.split("/")[-1]
        
        command = "gsutil -m cp %s gs://%s/%s" % (source_local_file_path, self.bucket_id, dest_path)
        execute(command, ignore_error=False, retry=True)
Example #11
    def copy_from_local(self, source_local_file_path, dest_path):

        print 'copy_from_local: %s %s' % (source_local_file_path, dest_path)

        if not dest_path.endswith("/"):
            dest_path = dest_path + "/"

        dest_path = dest_path + source_local_file_path.split("/")[-1]

        command = "gsutil -m cp %s gs://%s/%s" % (source_local_file_path,
                                                  self.bucket_id, dest_path)
        execute(command, ignore_error=False, retry=True)
Example #12
  def simple_data_transform(self):

    hdfs_mr_output_folder = "%s/%s/data_transform/output" % (CLOUD_STORAGE_PATH, self.collection_name)
    transform_data_tmp_path = "%s/%s/data_transform/output" % (self.tmp_path, self.collection_name)

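    # run the transform mapper locally over all extracted files; it is passed the
    # MongoDB schema location and a local tmp output path, so stdout is discarded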
    command = "cat %s | json/transform-data-mapper.py %s/%s/%s,%s > /dev/null" \
              % (' '.join(self.extract_file_names), self.mongo_uri, self.schema_db_name,
                 self.schema_collection_name, transform_data_tmp_path)
    execute(command)

    # delete folders
    self.cs.rmdir (hdfs_mr_output_folder)

    # manually copy files into hdfs
    fragment_values = self.get_fragments()
    for fragment_value in fragment_values:
      self.cs.mkdir("%s/%s" % (hdfs_mr_output_folder, fragment_value))
      self.cs.copy_from_local("%s/%s/part-00000" % (transform_data_tmp_path, fragment_value),
                              "%s/%s/" % (hdfs_mr_output_folder, fragment_value))
Example #13
  def mr_data_transform(self):

    hdfs_data_folder = "%s/%s/data" % (CLOUD_STORAGE_PATH, self.collection_name)
    hdfs_mr_output_folder = "%s/%s/data_transform/output" % (CLOUD_STORAGE_PATH, self.collection_name)

    # delete folders
    self.cs.rmdir(hdfs_mr_output_folder)

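    # map-only streaming job (mapred.reduce.tasks=0); the custom output format is
    # expected to split the transformed records across per-fragment output folders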
    hadoop_command = """hadoop jar %s \
                              -libjars %s \
                              -D mapred.job.name="onefold-mongo-transform-data" \
                              -D mapred.reduce.tasks=0 \
                              %s \
                              -input %s -output %s \
                              -mapper 'json/transform-data-mapper.py %s/%s/%s' \
                              -file json/transform-data-mapper.py \
                              -outputformat com.onefold.hadoop.MapReduce.TransformDataMultiOutputFormat
    """ % (HADOOP_MAPREDUCE_STREAMING_LIB, ONEFOLD_MAPREDUCE_JAR, MAPREDUCE_PARAMS_STR, hdfs_data_folder, hdfs_mr_output_folder, self.mongo_uri,
           self.schema_db_name, self.schema_collection_name)
    execute(hadoop_command)
Example #14
  def mr_data_transform(self):

    hdfs_data_folder = "%s/%s/data" % (HDFS_PATH, self.collection_name)
    hdfs_mr_output_folder = "%s/%s/data_transform/output" % (HDFS_PATH, self.collection_name)

    # delete folders
    execute("hadoop fs -rm -r -f %s" % hdfs_mr_output_folder)


    hadoop_command = """hadoop jar /usr/hdp/2.2.0.0-2041/hadoop-mapreduce/hadoop-streaming.jar \
                              -libjars java/MapReduce/target/MapReduce-0.0.1-SNAPSHOT.jar \
                              -D mapred.job.name="onefold-mongo-transform-data" \
                              -D mapred.reduce.tasks=0 \
                              %s \
                              -input %s -output %s \
                              -mapper 'json/transform-data-mapper.py %s/%s/%s' \
                              -file json/transform-data-mapper.py \
                              -outputformat com.onefold.hadoop.MapReduce.TransformDataMultiOutputFormat
    """ % (MAPREDUCE_PARAMS_STR, hdfs_data_folder, hdfs_mr_output_folder, self.mongo_uri,
           self.schema_db_name, self.schema_collection_name)
    execute(hadoop_command)
Example #15
    def simple_data_transform(self):

        hdfs_mr_output_folder = "%s/%s/data_transform/output" % (
            CLOUD_STORAGE_PATH, self.collection_name)
        transform_data_tmp_path = "%s/%s/data_transform/output" % (
            self.tmp_path, self.collection_name)

        command = "cat %s | json/transform-data-mapper.py %s/%s/%s,%s > /dev/null" \
                  % (' '.join(self.extract_file_names), self.mongo_uri, self.schema_db_name,
                     self.schema_collection_name, transform_data_tmp_path)
        execute(command)

        # delete folders
        self.cs.rmdir(hdfs_mr_output_folder)

        # manually copy files into hdfs
        fragment_values = self.get_fragments()
        for fragment_value in fragment_values:
            self.cs.mkdir("%s/%s" % (hdfs_mr_output_folder, fragment_value))
            self.cs.copy_from_local(
                "%s/%s/part-00000" % (transform_data_tmp_path, fragment_value),
                "%s/%s/" % (hdfs_mr_output_folder, fragment_value))
Example #16
    def mr_data_transform(self):

        hdfs_data_folder = "%s/%s/data" % (HDFS_PATH, self.collection_name)
        hdfs_mr_output_folder = "%s/%s/data_transform/output" % (
            HDFS_PATH, self.collection_name)

        # delete folders
        execute("hadoop fs -rm -r -f %s" % hdfs_mr_output_folder)

        hadoop_command = """hadoop jar %s \
                              -libjars %s \
                              -D mapred.job.name="onefold-mongo-transform-data" \
                              -D mapred.reduce.tasks=0 \
                              %s \
                              -input %s -output %s \
                              -mapper 'json/transform-data-mapper.py %s/%s/%s' \
                              -file json/transform-data-mapper.py \
                              -outputformat com.onefold.hadoop.MapReduce.TransformDataMultiOutputFormat
    """ % (HADOOP_MAPREDUCE_STREAMING_LIB, ONEFOLD_MAPREDUCE_JAR,
           MAPREDUCE_PARAMS_STR, hdfs_data_folder, hdfs_mr_output_folder,
           self.mongo_uri, self.schema_db_name, self.schema_collection_name)
        execute(hadoop_command)
Example #17
  def simple_data_transform(self):

    hdfs_mr_output_folder = "%s/%s/data_transform/output" % (HDFS_PATH, self.collection_name)
    transform_data_tmp_path = "%s/%s/data_transform/output" % (self.tmp_path, self.collection_name)

    command = "cat %s | json/transform-data-mapper.py %s/%s/%s,%s > /dev/null" \
              % (' '.join(self.extract_file_names), self.mongo_uri, self.schema_db_name,
                 self.schema_collection_name, transform_data_tmp_path)
    execute(command)

    # delete folders
    execute("hadoop fs -rm -r -f %s" % hdfs_mr_output_folder)

    # manually copy files into hdfs
    fragment_values = self.get_fragments()
    for fragment_value in fragment_values:
      execute("hadoop fs -mkdir -p %s/%s" % (hdfs_mr_output_folder, fragment_value), ignore_error=True)
      execute("hadoop fs -copyFromLocal %s/%s/part-00000 %s/%s/" % (transform_data_tmp_path, fragment_value,
                                                                    hdfs_mr_output_folder, fragment_value))
Example #18
    def simple_data_transform(self):

        hdfs_mr_output_folder = "%s/%s/data_transform/output" % (
            HDFS_PATH, self.collection_name)
        transform_data_tmp_path = "%s/%s/data_transform/output" % (
            self.tmp_path, self.collection_name)

        command = "cat %s | json/transform-data-mapper.py %s/%s/%s,%s > /dev/null" \
                  % (' '.join(self.extract_file_names), self.mongo_uri, self.schema_db_name,
                     self.schema_collection_name, transform_data_tmp_path)
        execute(command)

        # delete folders
        execute("hadoop fs -rm -r -f %s" % hdfs_mr_output_folder)

        # manually copy files into hdfs
        fragment_values = self.get_fragments()
        for fragment_value in fragment_values:
            execute("hadoop fs -mkdir -p %s/%s" %
                    (hdfs_mr_output_folder, fragment_value),
                    ignore_error=True)
            execute("hadoop fs -copyFromLocal %s/%s/part-00000 %s/%s/" %
                    (transform_data_tmp_path, fragment_value,
                     hdfs_mr_output_folder, fragment_value))
Example #19
 def simple_schema_gen(self):
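     # local (non-MapReduce) schema generation: pipe the extracted JSON through the
     # mapper, sort, and reducer; the reducer writes the merged schema to MongoDB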
     command = "cat %s | json/generate-schema-mapper.py | sort | json/generate-schema-reducer.py %s/%s/%s > /dev/null" \
               % (' '.join(self.extract_file_names), self.mongo_uri, self.schema_db_name, self.schema_collection_name)
     execute(command)
Example #20
 def create_dataset(self, database_name):
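   # create the BigQuery dataset; errors (e.g. it already exists) are ignored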
   command = "bq --project_id %s mk %s" % (self.project_id, database_name)
   execute(command, ignore_error=True)
Example #21
 def copy_from_local(self, source_local_file_path, dest_path):
     execute("hadoop fs -copyFromLocal %s %s/" %
             (source_local_file_path, dest_path))
Example #22
 def mkdir(self, path):
     execute("hadoop fs -mkdir -p %s" % path, ignore_error=True)
Example #23
 def rmdir(self, path):
     execute("hadoop fs -rm -r -f %s" % path, ignore_error=True)
Example #24
 def load_table(self, database_name, table_name, file_path):
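     # asynchronous (--nosync) BigQuery load of newline-delimited JSON from Cloud
     # Storage; the trailing * matches every part file under file_path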
     command = "bq --project_id %s --nosync load --source_format NEWLINE_DELIMITED_JSON %s.%s gs://%s/%s*" % \
                   (self.project_id, database_name, table_name, self.bucket_id, file_path)
     execute(command)
Example #25
 def load_table(self, database_name, table_name, file_path):
   command = "bq --project_id %s --nosync load --source_format NEWLINE_DELIMITED_JSON %s.%s gs://%s/%s*" % \
                 (self.project_id, database_name, table_name, self.bucket_id, file_path)
   execute(command)
Example #26
    def create_table(self,
                     database_name,
                     table_name,
                     schema_fields,
                     process_array="child_table"):

        table_columns = {}

        for field in schema_fields:
            data_type = field['data_type']

            # ignore record
            if field['data_type'] in ('record',):
                continue

            if data_type is not None:
                if field['mode'] == 'repeated':
                    if process_array == "child_table":
                        child_table_name = table_name + "_" + re.sub(
                            "[^0-9a-zA-Z_]", '_', field['key']).lower()
                        column_name = "value"
                    else:
                        continue
                else:
                    if "." in field['key']:
                        if process_array == "child_table":
                            child_table_name = table_name + "_" + re.sub(
                                "[^0-9a-zA-Z_]", '_', field['key'].rsplit(
                                    ".", 1)[0]).lower()
                            column_name = field['key'].rsplit(".", 1)[1]
                            print "  Child Table column:" + column_name
                        else:
                            child_table_name = table_name
                            column_name = field['key'].split(".", 1)[0]
                            data_type = "string"
                            print "  Inline column:" + column_name
                    else:
                        child_table_name = table_name
                        column_name = field['key']

                if child_table_name not in table_columns:
                    table_columns[child_table_name] = []
                    if child_table_name != table_name:
                        table_columns[child_table_name].append({
                            "name": "parent_hash_code",
                            "type": "string",
                            "mode": "nullable"
                        })
                        table_columns[child_table_name].append({
                            "name": "hash_code",
                            "type": "string",
                            "mode": "nullable"
                        })

                table_columns[child_table_name].append({
                    "name": column_name,
                    "type": data_type,
                    "mode": "nullable"
                })

        for table_name, columns in table_columns.iteritems():

            # create schema file
            schema_file_name = table_name + "_schema.json"
            schema_json = json.dumps(columns)
            schema_file = open(schema_file_name, "w")
            schema_file.write(schema_json)
            schema_file.close()

            # execute create-table command
            command = "bq --project_id %s mk --schema %s %s.%s" % (
                self.project_id, schema_file_name, database_name, table_name)
            execute(command)

        return table_columns.keys()
Example #27
 def create_dataset(self, database_name):
     command = "bq --project_id %s mk %s" % (self.project_id, database_name)
     execute(command, ignore_error=True)
Example #28
 def simple_schema_gen(self):
   command = "cat %s | json/generate-schema-mapper.py | sort | json/generate-schema-reducer.py %s/%s/%s > /dev/null" \
             % (' '.join(self.extract_file_names), self.mongo_uri, self.schema_db_name, self.schema_collection_name)
   execute(command)
Example #29
 def copy_from_local(self, source_local_file_path, dest_path):
     execute("hadoop fs -copyFromLocal %s %s/" % (source_local_file_path, dest_path))
Example #30
 def mkdir(self, path):
     execute("hadoop fs -mkdir -p %s" % path, ignore_error=True)
Example #31
 def rmdir(self, path):
     execute("hadoop fs -rm -r -f %s" % path, ignore_error=True)