def mr_schema_gen(self):
    hdfs_data_folder = "%s/%s/data" % (HDFS_PATH, self.collection_name)
    hdfs_mr_output_folder = "%s/%s/schema_gen/output" % (HDFS_PATH, self.collection_name)

    # delete folders
    execute("hadoop fs -rm -r -f %s" % hdfs_data_folder)
    execute("hadoop fs -rm -r -f %s" % hdfs_mr_output_folder)

    # copy extracted files to hdfs data folder
    execute("hadoop fs -mkdir -p %s" % hdfs_data_folder)
    for extract_file_name in self.extract_file_names:
        execute("hadoop fs -copyFromLocal %s %s/" % (extract_file_name, hdfs_data_folder))

    hadoop_command = """hadoop jar %s \
                        -D mapred.job.name="onefold-mongo-generate-schema" \
                        %s \
                        -input %s -output %s \
                        -mapper 'json/generate-schema-mapper.py' \
                        -reducer 'json/generate-schema-reducer.py %s/%s/%s' \
                        -file json/generate-schema-mapper.py \
                        -file json/generate-schema-reducer.py
                     """ % (HADOOP_MAPREDUCE_STREAMING_LIB, MAPREDUCE_PARAMS_STR,
                            hdfs_data_folder, hdfs_mr_output_folder,
                            self.mongo_uri, self.schema_db_name, self.schema_collection_name)
    execute(hadoop_command)

def mr_schema_gen(self):
    hdfs_data_folder = "%s/%s/data" % (CLOUD_STORAGE_PATH, self.collection_name)
    hdfs_mr_output_folder = "%s/%s/schema_gen/output" % (CLOUD_STORAGE_PATH, self.collection_name)

    # delete folders
    self.cs.rmdir(hdfs_data_folder)
    self.cs.rmdir(hdfs_mr_output_folder)

    # copy extracted files to hdfs data folder
    self.cs.mkdir(hdfs_data_folder)
    for extract_file_name in self.extract_file_names:
        self.cs.copy_from_local(extract_file_name, hdfs_data_folder)

    hadoop_command = """hadoop jar %s \
                        -D mapred.job.name="onefold-mongo-generate-schema" \
                        %s \
                        -input %s -output %s \
                        -mapper 'json/generate-schema-mapper.py' \
                        -reducer 'json/generate-schema-reducer.py %s/%s/%s' \
                        -file json/generate-schema-mapper.py \
                        -file json/generate-schema-reducer.py
                     """ % (HADOOP_MAPREDUCE_STREAMING_LIB, MAPREDUCE_PARAMS_STR,
                            hdfs_data_folder, hdfs_mr_output_folder,
                            self.mongo_uri, self.schema_db_name, self.schema_collection_name)
    execute(hadoop_command)

def delete_table(self, database_name, table_name):
    command = "bq --project_id %s rm -f %s.%s" % (self.project_id, database_name, table_name)
    execute(command, ignore_error=True)

    # also drop the child tables that create_table spawned for nested/repeated fields
    child_table_names = self.list_tables(database_name, table_name)
    for child_table_name in child_table_names:
        command = "bq --project_id %s rm -f %s.%s" % (self.project_id, database_name, child_table_name)
        execute(command, ignore_error=True)

def rmdir(self, path):
    print 'rmdir: %s' % path
    if not path.endswith("/"):
        path = path + "/"
    command = "gsutil -m rm -rf gs://%s/%s" % (self.bucket_id, path)
    execute(command, ignore_error=True)

def create_table(self, database_name, table_name, schema_fields, process_array="child_table"):
    table_columns = {}

    for field in schema_fields:
        data_type = field['data_type']

        # "record" fields are containers; only their leaf fields become columns
        if field['data_type'] == 'record':
            continue

        if data_type is not None:
            if field['mode'] == 'repeated':
                # repeated fields go into a child table with a single "value" column
                if process_array == "child_table":
                    child_table_name = table_name + "_" + re.sub("[^0-9a-zA-Z_]", '_', field['key']).lower()
                    column_name = "value"
                else:
                    continue
            else:
                if "." in field['key']:
                    if process_array == "child_table":
                        # nested field: table named after the key prefix, column after the leaf
                        child_table_name = table_name + "_" + re.sub("[^0-9a-zA-Z_]", '_', field['key'].rsplit(".", 1)[0]).lower()
                        column_name = field['key'].rsplit(".", 1)[1]
                        print " Child Table column:" + column_name
                    else:
                        child_table_name = table_name
                        column_name = field['key'].split(".", 1)[0]
                        data_type = "string"
                        print " Inline column:" + column_name
                else:
                    child_table_name = table_name
                    column_name = field['key']

            if child_table_name not in table_columns:
                table_columns[child_table_name] = []
                # child tables link back to the parent row via parent_hash_code
                if child_table_name != table_name:
                    table_columns[child_table_name].append({"name": "parent_hash_code", "type": "string", "mode": "nullable"})
                table_columns[child_table_name].append({"name": "hash_code", "type": "string", "mode": "nullable"})

            table_columns[child_table_name].append({"name": column_name, "type": data_type, "mode": "nullable"})

    for output_table_name, columns in table_columns.iteritems():
        # create schema file
        schema_file_name = output_table_name + "_schema.json"
        schema_json = json.dumps(columns)
        schema_file = open(schema_file_name, "w")
        schema_file.write(schema_json)
        schema_file.close()

        # execute create-table command
        command = "bq --project_id %s mk --schema %s %s.%s" % (self.project_id, schema_file_name, database_name, output_table_name)
        execute(command)

    return table_columns.keys()

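# A minimal usage sketch of create_table (names and field values are hypothetical;
# `bq_helper` stands in for an instance of the class above, and the dicts follow the
# 'key'/'data_type'/'mode' shape the method reads):
#
#   schema_fields = [
#       {"key": "name",         "data_type": "string", "mode": "nullable"},
#       {"key": "address.city", "data_type": "string", "mode": "nullable"},
#       {"key": "tags",         "data_type": "string", "mode": "repeated"},
#   ]
#   created = bq_helper.create_table("mydb", "users", schema_fields)
#
# "tags" (repeated) yields child table "users_tags" with a single "value" column,
# "address.city" yields child table "users_address" with column "city", and "name"
# stays on the parent; expected names (dict order may vary): users, users_address,
# users_tags.
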
def copy_from_local(self, source_local_file_path, dest_path):
    print 'copy_from_local: %s %s' % (source_local_file_path, dest_path)
    if not dest_path.endswith("/"):
        dest_path = dest_path + "/"
    dest_path = dest_path + source_local_file_path.split("/")[-1]
    command = "gsutil -m cp %s gs://%s/%s" % (source_local_file_path, self.bucket_id, dest_path)
    execute(command, ignore_error=False, retry=True)

def simple_data_transform(self):
    hdfs_mr_output_folder = "%s/%s/data_transform/output" % (CLOUD_STORAGE_PATH, self.collection_name)
    transform_data_tmp_path = "%s/%s/data_transform/output" % (self.tmp_path, self.collection_name)

    command = "cat %s | json/transform-data-mapper.py %s/%s/%s,%s > /dev/null" \
              % (' '.join(self.extract_file_names), self.mongo_uri,
                 self.schema_db_name, self.schema_collection_name, transform_data_tmp_path)
    execute(command)

    # delete folders
    self.cs.rmdir(hdfs_mr_output_folder)

    # manually copy files into the output folder, one subfolder per fragment
    fragment_values = self.get_fragments()
    for fragment_value in fragment_values:
        self.cs.mkdir("%s/%s" % (hdfs_mr_output_folder, fragment_value))
        self.cs.copy_from_local("%s/%s/part-00000" % (transform_data_tmp_path, fragment_value),
                                "%s/%s/" % (hdfs_mr_output_folder, fragment_value))

def mr_data_transform(self):
    hdfs_data_folder = "%s/%s/data" % (CLOUD_STORAGE_PATH, self.collection_name)
    hdfs_mr_output_folder = "%s/%s/data_transform/output" % (CLOUD_STORAGE_PATH, self.collection_name)

    # delete folders
    self.cs.rmdir(hdfs_mr_output_folder)

    hadoop_command = """hadoop jar %s \
                        -libjars %s \
                        -D mapred.job.name="onefold-mongo-transform-data" \
                        -D mapred.reduce.tasks=0 \
                        %s \
                        -input %s -output %s \
                        -mapper 'json/transform-data-mapper.py %s/%s/%s' \
                        -file json/transform-data-mapper.py \
                        -outputformat com.onefold.hadoop.MapReduce.TransformDataMultiOutputFormat
                     """ % (HADOOP_MAPREDUCE_STREAMING_LIB, ONEFOLD_MAPREDUCE_JAR, MAPREDUCE_PARAMS_STR,
                            hdfs_data_folder, hdfs_mr_output_folder,
                            self.mongo_uri, self.schema_db_name, self.schema_collection_name)
    execute(hadoop_command)

def mr_data_transform(self):
    hdfs_data_folder = "%s/%s/data" % (HDFS_PATH, self.collection_name)
    hdfs_mr_output_folder = "%s/%s/data_transform/output" % (HDFS_PATH, self.collection_name)

    # delete folders
    execute("hadoop fs -rm -r -f %s" % hdfs_mr_output_folder)

    hadoop_command = """hadoop jar %s \
                        -libjars %s \
                        -D mapred.job.name="onefold-mongo-transform-data" \
                        -D mapred.reduce.tasks=0 \
                        %s \
                        -input %s -output %s \
                        -mapper 'json/transform-data-mapper.py %s/%s/%s' \
                        -file json/transform-data-mapper.py \
                        -outputformat com.onefold.hadoop.MapReduce.TransformDataMultiOutputFormat
                     """ % (HADOOP_MAPREDUCE_STREAMING_LIB, ONEFOLD_MAPREDUCE_JAR, MAPREDUCE_PARAMS_STR,
                            hdfs_data_folder, hdfs_mr_output_folder,
                            self.mongo_uri, self.schema_db_name, self.schema_collection_name)
    execute(hadoop_command)

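# For reference, on an HDP 2.2 install the constants above resolve to concrete
# paths such as:
#
#   HADOOP_MAPREDUCE_STREAMING_LIB = "/usr/hdp/2.2.0.0-2041/hadoop-mapreduce/hadoop-streaming.jar"
#   ONEFOLD_MAPREDUCE_JAR = "java/MapReduce/target/MapReduce-0.0.1-SNAPSHOT.jar"
#
# so the rendered job line begins:
#
#   hadoop jar /usr/hdp/2.2.0.0-2041/hadoop-mapreduce/hadoop-streaming.jar \
#       -libjars java/MapReduce/target/MapReduce-0.0.1-SNAPSHOT.jar \
#       -D mapred.job.name="onefold-mongo-transform-data" \
#       -D mapred.reduce.tasks=0 ...
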
def simple_data_transform(self):
    hdfs_mr_output_folder = "%s/%s/data_transform/output" % (HDFS_PATH, self.collection_name)
    transform_data_tmp_path = "%s/%s/data_transform/output" % (self.tmp_path, self.collection_name)

    command = "cat %s | json/transform-data-mapper.py %s/%s/%s,%s > /dev/null" \
              % (' '.join(self.extract_file_names), self.mongo_uri,
                 self.schema_db_name, self.schema_collection_name, transform_data_tmp_path)
    execute(command)

    # delete folders
    execute("hadoop fs -rm -r -f %s" % hdfs_mr_output_folder)

    # manually copy files into hdfs, one subfolder per fragment
    fragment_values = self.get_fragments()
    for fragment_value in fragment_values:
        execute("hadoop fs -mkdir -p %s/%s" % (hdfs_mr_output_folder, fragment_value), ignore_error=True)
        execute("hadoop fs -copyFromLocal %s/%s/part-00000 %s/%s/"
                % (transform_data_tmp_path, fragment_value, hdfs_mr_output_folder, fragment_value))

def simple_schema_gen(self):
    # same mapper/reducer scripts as the MapReduce path, run as a local shell
    # pipeline; "sort" stands in for the Hadoop streaming shuffle
    command = "cat %s | json/generate-schema-mapper.py | sort | json/generate-schema-reducer.py %s/%s/%s > /dev/null" \
              % (' '.join(self.extract_file_names), self.mongo_uri,
                 self.schema_db_name, self.schema_collection_name)
    execute(command)

def create_dataset(self, database_name):
    command = "bq --project_id %s mk %s" % (self.project_id, database_name)
    execute(command, ignore_error=True)

def copy_from_local(self, source_local_file_path, dest_path):
    execute("hadoop fs -copyFromLocal %s %s/" % (source_local_file_path, dest_path))

def mkdir(self, path):
    execute("hadoop fs -mkdir -p %s" % path, ignore_error=True)

def rmdir(self, path):
    execute("hadoop fs -rm -r -f %s" % path, ignore_error=True)

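# These hadoop fs helpers mirror the gsutil-backed rmdir/copy_from_local shown
# earlier, which is what lets the transform steps talk to one storage object
# (self.cs) without caring about the backend. A sketch of that wiring, with
# hypothetical class names (HdfsStorage / GcsStorage do not appear in the
# original source):
#
#   cs = GcsStorage(bucket_id) if use_cloud_storage else HdfsStorage()
#   cs.rmdir(output_folder)            # gsutil rm -rf ... or hadoop fs -rm -r -f ...
#   cs.mkdir(output_folder)
#   cs.copy_from_local(local_file, output_folder)
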
def load_table(self, database_name, table_name, file_path):
    command = "bq --project_id %s --nosync load --source_format NEWLINE_DELIMITED_JSON %s.%s gs://%s/%s*" \
              % (self.project_id, database_name, table_name, self.bucket_id, file_path)
    execute(command)

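# A minimal end-to-end sketch of the BigQuery load path, assuming `bq_helper` is
# an instance of the helper class above (dataset/table names and the per-table
# fragment folder layout are hypothetical):
#
#   bq_helper.create_dataset("mydb")
#   bq_helper.delete_table("mydb", "users")        # drops parent and child tables
#   for t in bq_helper.create_table("mydb", "users", schema_fields):
#       bq_helper.load_table("mydb", t, "users/data_transform/output/%s/" % t)
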