Example #1
def test_data_hasher_from_dict(self):
    sha1 = FileStatter.sha1({
        'type': 'array',
        'items': {
            'type': 'object',
            'properties': {
                'column1': {'type': 'string'},
                'column2': {'type': 'string'}
            },
            'required': ['column1', 'column2']
        }
    })
    self.assertEqual(sha1, 'a471c364d74034ddc779d3498301a3c6adf871ed')
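Examples #1, #3, #5, and #6 exercise FileStatter.sha1 on a dict, a list, a string, and an open file handle, so the helper evidently serializes structured data deterministically before hashing. A minimal sketch of such a hasher, assuming JSON serialization with sorted keys; the real implementation may serialize differently, so this sketch is not guaranteed to reproduce the exact digests asserted in these tests:

import hashlib
import json

class FileStatter:
    @staticmethod
    def sha1(data):
        if hasattr(data, 'read'):
            # File-like object: hash the raw bytes from the handle
            payload = data.read()
        elif isinstance(data, (dict, list)):
            # Structured data: serialize deterministically so equal
            # data always produces the same digest (an assumption)
            payload = json.dumps(data, sort_keys=True).encode('utf-8')
        else:
            # Strings and anything else: hash the UTF-8 encoding
            payload = str(data).encode('utf-8')
        return hashlib.sha1(payload).hexdigest()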
Example #2
def run(file_path):
    # Init logging and database
    init_logging()
    client, file_col, schema_col, source_data_col = init_mongodb(config)

    # Set up counters and file index
    successfully_ingested_files = 0
    file_counter = 0
    file_list = DirLister.get_file_list_recursive(file_path)

    logging.info('Processing %d files from %s' % (len(file_list), file_path))

    for file in file_list:
        file_counter += 1
        ProgressBar.update_progress(file_counter / len(file_list),
                                    'Processing file %s' % file)

        # get the file stats
        document = {
            'stats': FileStatter.stats(file),
            'filePath': file,
            '_id': file,
            'hash': FileStatter.sha1_from_file(file)
        }

        # Load the data or skip if unable
        if file.lower().endswith('.mif'):
            try:
                data = MIFparser.to_dict(file)
            except ValueError as e:
                logging.error(e)
                # if the data loading doesn't work out, just log the error and skip the file
                continue
        elif file.lower().endswith('.mid'):
            logging.debug('Skipping .mid file.')
            continue  # .mid files are processed along with their parent .mif file
        else:
            try:
                data = CSVparser.to_dict(file)
            except ValueError as e:
                logging.error('CSV parsing error on file %s: %s' % (file, e))
                # if the data loading doesn't work out, just log the error and skip the file
                continue

        # Generate the schema and try to ingest it
        try:
            schema_data = SchemaGenerator.generate_schema(data)
        except Exception as e:
            logging.error('Schema error on file %s: %s' % (file, e))
            continue

        schema_hash = FileStatter.sha1(schema_data)
        schema = {
            '_id': schema_hash,
            'schema': schema_data,
        }

        try:
            schema_col.insert_one(schema)
        except DuplicateKeyError:
            logging.debug('Schema %s was previously processed' % schema_hash)
        except Exception as e:
            logging.error('Ingest schema error on file %s: %s' % (file, e))
            # if the schema loading doesn't work out, just log the error and skip the file
            continue

        # Store the source data
        source_data_doc_sha1 = FileStatter.sha1(data)
        source_data_doc = {'_id': source_data_doc_sha1, 'data': data}

        try:
            source_data_col.insert_one(document=source_data_doc)
        except DuplicateKeyError:
            logging.debug('Source data with sha1 %s was previously processed' %
                          source_data_doc_sha1)
        except Exception as e:
            logging.error('Ingest source data error on file %s: %s' %
                          (file, e))
            continue

        # Finalize the file document with the data reference and the schema reference
        document['data'] = source_data_doc_sha1
        document['schema'] = schema['_id']

        try:
            file_col.insert_one(document=document)
        except DuplicateKeyError:
            logging.warning('File %s was previously processed, skipping' %
                            file)
            # Skip to next file
            continue
        except Exception as e:
            logging.error('Ingest file metadata error on file %s: %s' %
                          (file, e))
            continue

        logging.debug('File %s was successfully ingested' % file)
        successfully_ingested_files += 1

    logging.info('Finished!')
    logging.info('Successfully ingested %d files of %d' %
                 (successfully_ingested_files, len(file_list)))
    client.close()
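run() depends on two helpers that are not shown here: init_logging() and init_mongodb(config) (DuplicateKeyError, which run() catches, is importable from pymongo.errors). A minimal sketch of both under pymongo; the config keys 'mongo_uri' and 'database' and the collection names are hypothetical, since the example does not show the real configuration layout:

import logging
from pymongo import MongoClient

def init_logging():
    # Sketch: console logging at INFO level
    logging.basicConfig(level=logging.INFO,
                        format='%(asctime)s %(levelname)s %(message)s')

def init_mongodb(config):
    # Sketch: connect and return the client plus the three collections
    # that run() unpacks; the key and collection names are assumptions
    client = MongoClient(config['mongo_uri'])
    db = client[config['database']]
    return client, db['files'], db['schemas'], db['sourceData']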
Example #3
def test_data_hasher_from_list(self):
    sha1 = FileStatter.sha1([{'data': 'some data'}])
    self.assertEqual(sha1, '297c88ed1e2052e7fac31426fe6b85502ad4c717')
Example #4
def test_data_hasher_from_schema_dict(self):
    dictionary = CSVparser.to_dict(current_dir + '/mockups/schema/caseInsensitiveTest/test.csv')
    sha1 = FileStatter.sha1(SchemaGenerator.generate_schema(dictionary))
    self.assertEqual(sha1, 'a59a9b5c48657c3828c4c308cd057997aa7927fb')
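Example #4 feeds CSVparser.to_dict output through SchemaGenerator.generate_schema before hashing. A sketch of a generator that yields the array-of-string-objects shape asserted in Example #1, assuming the parsed CSV arrives as a list of row dicts and that every column is typed as a string; the real generator may infer richer types:

class SchemaGenerator:
    @staticmethod
    def generate_schema(rows):
        # Collect every column name seen across the rows
        columns = sorted({key for row in rows for key in row})
        # Describe the data as an array of objects with string-typed
        # properties, matching the shape hashed in Example #1
        return {
            'type': 'array',
            'items': {
                'type': 'object',
                'properties': {col: {'type': 'string'} for col in columns},
                'required': columns,
            },
        }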
Example #5
def test_data_hasher_from_string(self):
    sha1 = FileStatter.sha1('this is a string')
    self.assertEqual(sha1, '517592df8fec3ad146a79a9af153db2a4d784ec5')
Example #6
def test_data_hasher_from_read_file(self):
    with open(testfile, mode='rb') as test_data:
        sha1 = FileStatter.sha1(test_data)
        self.assertEqual(sha1, 'df4b5e0bf4df6bb62fcb659015885859cf3f1b63')
Example #7
def test_file_hasher(self):
    sha1 = FileStatter.sha1_from_file(testfile)
    self.assertEqual(sha1, 'df4b5e0bf4df6bb62fcb659015885859cf3f1b63')
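Examples #6 and #7 assert that hashing an open binary handle and hashing by path yield the same digest. A chunked sha1_from_file sketch consistent with that behavior; the chunked loop is an assumption, since the real helper may simply hash the whole read() at once:

import hashlib

class FileStatter:
    @staticmethod
    def sha1_from_file(file_path, chunk_size=65536):
        # Hash the file in fixed-size chunks so large files never have
        # to fit in memory; the digest equals hashing the full contents
        digest = hashlib.sha1()
        with open(file_path, 'rb') as handle:
            for chunk in iter(lambda: handle.read(chunk_size), b''):
                digest.update(chunk)
        return digest.hexdigest()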