def test_data_hasher_from_dict(self):
    sha1 = FileStatter.sha1({
        'type': 'array',
        'items': {
            'type': 'object',
            'properties': {
                'column1': {'type': 'string'},
                'column2': {'type': 'string'}
            },
            'required': ['column1', 'column2']
        }
    })
    self.assertEqual(sha1, 'a471c364d74034ddc779d3498301a3c6adf871ed')

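# For this test to pass repeatably, FileStatter.sha1 must serialize dicts
# deterministically before hashing. A minimal sketch of such a hasher,
# assuming JSON serialization with sorted keys (the real FileStatter may
# canonicalize differently; _sha1_of_data is an illustrative name only):
import hashlib
import json

def _sha1_of_data(data):
    """Return a deterministic sha1 hex digest for dicts, lists, strings or file objects."""
    if hasattr(data, 'read'):
        # File-like object opened in binary mode: hash the raw bytes
        payload = data.read()
    elif isinstance(data, (dict, list)):
        # Sorted keys make the serialization, and thus the digest, stable
        payload = json.dumps(data, sort_keys=True).encode('utf-8')
    else:
        payload = str(data).encode('utf-8')
    return hashlib.sha1(payload).hexdigest()
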
def run(file_path):
    # Init logging and database
    init_logging()
    client, file_col, schema_col, source_data_col = init_mongodb(config)

    # Set up counters and file index
    successfully_ingested_files = 0
    file_counter = 0
    file_list = DirLister.get_file_list_recursive(file_path)
    logging.info('Processing %d files from %s' % (len(file_list), file_path))

    for file in file_list:
        file_counter += 1
        ProgressBar.update_progress(file_counter / len(file_list), 'Processing file %s' % file)

        # Get the file stats
        document = {
            'stats': FileStatter.stats(file),
            'filePath': file,
            '_id': file,
            'hash': FileStatter.sha1_from_file(file)
        }

        # Load the data, or skip the file if unable
        if file.lower().endswith('.mif'):
            try:
                data = MIFparser.to_dict(file)
            except ValueError as e:
                # If the data cannot be loaded, just log the error and skip the file
                logging.error(e)
                continue
        elif file.lower().endswith('.mid'):
            # .mid files are processed along with their parent .mif file
            logging.debug('Skipping .mid file.')
            continue
        else:
            try:
                data = CSVparser.to_dict(file)
            except ValueError as e:
                # If the data cannot be loaded, just log the error and skip the file
                logging.error('CSV parsing error on file %s: %s' % (file, e))
                continue

        # Generate the schema and try to ingest it
        try:
            schema_data = SchemaGenerator.generate_schema(data)
        except Exception as e:
            logging.error('Schema error on file %s: %s' % (file, e))
            continue
        schema_hash = FileStatter.sha1(schema_data)
        schema = {
            '_id': schema_hash,
            'schema': schema_data,
        }
        try:
            schema_col.insert_one(schema)
        except DuplicateKeyError:
            logging.debug('Schema %s was previously processed' % schema_hash)
        except Exception as e:
            # If the schema cannot be ingested, just log the error and skip the file
            logging.error('Ingest schema error on file %s: %s' % (file, e))
            continue

        # Store the source data
        source_data_doc_sha1 = FileStatter.sha1(data)
        source_data_doc = {'_id': source_data_doc_sha1, 'data': data}
        try:
            source_data_col.insert_one(document=source_data_doc)
        except DuplicateKeyError:
            logging.debug('Source data with sha1 %s was previously processed' % source_data_doc_sha1)
        except Exception as e:
            logging.error('Ingest source data error on file %s: %s' % (file, e))
            continue

        # Finalize the file document with the data reference and the schema reference
        document['data'] = source_data_doc_sha1
        document['schema'] = schema['_id']
        try:
            file_col.insert_one(document=document)
        except DuplicateKeyError:
            # The file was already ingested; skip to the next one
            logging.warning('File %s was previously processed, skipping' % file)
            continue
        except Exception as e:
            logging.error('Ingest file metadata error on file %s: %s' % (file, e))
            continue

        logging.debug('File %s was successfully ingested' % file)
        successfully_ingested_files += 1

    logging.info('Finished!')
    logging.info('Successfully ingested %d files of %d' % (successfully_ingested_files, len(file_list)))
    client.close()

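# run() expects init_mongodb to hand back an open client plus the three
# collections used above. A minimal sketch assuming pymongo, a config dict
# with 'mongo_uri' and 'database' keys, and collection names that are all
# illustrative rather than taken from the real config:
from pymongo import MongoClient
from pymongo.errors import DuplicateKeyError

def init_mongodb_sketch(config):
    """Open a MongoDB connection and return the client plus the three collections."""
    client = MongoClient(config['mongo_uri'])
    db = client[config['database']]
    # MongoDB enforces a unique index on _id, which is what makes the
    # DuplicateKeyError-based deduplication in run() work: re-ingesting
    # an identical file, schema, or data document raises instead of duplicating.
    return client, db['files'], db['schemas'], db['source_data']
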
def test_data_hasher_from_list(self):
    sha1 = FileStatter.sha1([{'data': 'some data'}])
    self.assertEqual(sha1, '297c88ed1e2052e7fac31426fe6b85502ad4c717')

def test_data_hasher_from_schema_dict(self):
    dictionary = CSVparser.to_dict(current_dir + '/mockups/schema/caseInsensitiveTest/test.csv')
    sha1 = FileStatter.sha1(SchemaGenerator.generate_schema(dictionary))
    self.assertEqual(sha1, 'a59a9b5c48657c3828c4c308cd057997aa7927fb')

def test_data_hasher_from_string(self):
    sha1 = FileStatter.sha1('this is a string')
    self.assertEqual(sha1, '517592df8fec3ad146a79a9af153db2a4d784ec5')

def test_data_hasher_from_read_file(self):
    with open(testfile, mode='rb') as test_data:
        sha1 = FileStatter.sha1(test_data)
        self.assertEqual(sha1, 'df4b5e0bf4df6bb62fcb659015885859cf3f1b63')

def test_file_hasher(self):
    sha1 = FileStatter.sha1_from_file(testfile)
    self.assertEqual(sha1, 'df4b5e0bf4df6bb62fcb659015885859cf3f1b63')
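
# test_file_hasher expects the same digest as test_data_hasher_from_read_file,
# so sha1_from_file presumably hashes the file's raw bytes. A minimal sketch;
# the chunked reading is an implementation choice for large files, not
# something the tests confirm (sha1_from_file_sketch is an illustrative name):
import hashlib

def sha1_from_file_sketch(path, chunk_size=65536):
    """Hash a file's bytes in chunks to avoid loading large files into memory."""
    hasher = hashlib.sha1()
    with open(path, mode='rb') as f:
        # iter() with a sentinel yields chunks until read() returns b''
        for chunk in iter(lambda: f.read(chunk_size), b''):
            hasher.update(chunk)
    return hasher.hexdigest()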