예제 #1
0
 def setUp(self) -> None:
     """Load the test submission config and create the validation object for ELOAD 2."""
     load_config(os.path.join(self.resources_folder, 'submission_config.yml'))
     # The config file contains relative paths that only resolve from the top directory,
     # so move there before creating the validation object.
     os.chdir(ROOT_DIR)
     self.validation = EloadValidation(2)
예제 #2
0
def main():
    """Command line entry point: prepare a backlog ELOAD, then validate its VCFs.

    Parses the command line, configures logging, loads the default configuration,
    fills in and reports the backlog preparation, and finally runs the
    ``assembly_check`` and ``vcf_check`` validation tasks.
    """
    parser = ArgumentParser(description='Prepare to process backlog study and validate VCFs.')
    parser.add_argument('--eload', required=True, type=int, help='The ELOAD number for this submission')
    parser.add_argument(
        '--force_config', action='store_true', default=False,
        help='Overwrite the configuration file after backing it up.'
    )
    parser.add_argument(
        '--debug', action='store_true', default=False,
        help='Set the script to output logging information at debug level'
    )
    options = parser.parse_args()

    log_cfg.add_stdout_handler()
    if options.debug:
        log_cfg.set_log_level(logging.DEBUG)

    # No path is given, so the configuration is read from its default location
    load_config()

    backlog = EloadBacklog(options.eload)
    backlog.fill_in_config(options.force_config)
    backlog.report()

    # Only these two checks are run when preparing a backlog study
    checker = EloadValidation(options.eload)
    checker.validate(['assembly_check', 'vcf_check'])

    logger.info('Preparation complete, if files are valid please run ingestion as normal.')
 def setUp(self) -> None:
     """Load the test submission config, create the validation object and snapshot its config."""
     load_config(os.path.join(self.resources_folder, 'submission_config.yml'))
     # The config file contains relative paths that only resolve from the top directory,
     # so move there before creating the validation object.
     os.chdir(ROOT_DIR)
     self.validation = EloadValidation(2)
     # Independent copy of the config content, kept to restore the test config after each test
     self.original_cfg = deepcopy(self.validation.eload_cfg.content)
예제 #4
0
def main():
    """Command line entry point: validate an ELOAD and print the validation report.

    Unless ``--report`` is given, the selected validation tasks are run first;
    the report is printed in every case.
    """
    parser = ArgumentParser(description='Validate an ELOAD by checking the data and metadata format and semantics.')
    parser.add_argument('--eload', required=True, type=int, help='The ELOAD number for this submission')
    parser.add_argument(
        '--validation_tasks', required=False, type=str, nargs='+',
        default=EloadValidation.all_validation_tasks,
        choices=EloadValidation.all_validation_tasks,
        help='task or set of tasks to perform during validation'
    )
    parser.add_argument(
        '--set_as_valid', action='store_true', default=False,
        help='Set the script to consider all validation tasks performed as valid in the final '
             'evaluation. This does not affect the actual report but only change the final '
             'evaluation'
    )
    parser.add_argument(
        '--merge_per_analysis', action='store_true', default=False,
        help='Whether to merge vcf files per analysis if possible.'
    )
    parser.add_argument(
        '--report', action='store_true', default=False,
        help='Set the script to only report the results based on previously run validation.'
    )
    parser.add_argument(
        '--debug', action='store_true', default=False,
        help='Set the script to output logging information at debug level'
    )
    options = parser.parse_args()

    log_cfg.add_stdout_handler()
    if options.debug:
        log_cfg.set_log_level(logging.DEBUG)

    # No path is given, so the configuration is read from its default location
    load_config()

    validation = EloadValidation(options.eload)
    validation.upgrade_config_if_needed()
    run_validation = not options.report
    if run_validation:
        validation.validate(options.validation_tasks, options.set_as_valid, options.merge_per_analysis)
    validation.report()
예제 #5
0
class TestEloadValidation(TestCase):
    """Tests for the log/report parsers and the printed report of EloadValidation."""

    resources_folder = os.path.join(ROOT_DIR, 'tests', 'resources')

    def setUp(self) -> None:
        """Load the test submission config and create the validation object under test."""
        config_file = os.path.join(self.resources_folder,
                                   'submission_config.yml')
        load_config(config_file)
        # Register the restore before changing directory so the chdir below does not leak
        # into tests that run after this one.
        self.addCleanup(os.chdir, os.getcwd())
        # Need to set the directory so that the relative path set in the config file works from the top directory
        os.chdir(ROOT_DIR)
        self.validation = EloadValidation(2)

    def test_parse_assembly_check_log_failed(self):
        """Parsing the failed assembly check log yields the expected message list and counters."""
        assembly_check_log = os.path.join(self.resources_folder, 'validations',
                                          'failed_assembly_check.log')
        expected = ([
            " The assembly checking could not be completed: Contig '8' not found in assembly report"
        ], 1, 0, 0)
        assert self.validation.parse_assembly_check_log(
            assembly_check_log) == expected

    def test_parse_assembly_check_report_mismatch(self):
        """Parsing the mismatch report yields the listed mismatch messages and the value 14."""
        mismatch_assembly_report = os.path.join(
            self.resources_folder, 'validations',
            'mismatch_text_assembly_report.txt')
        # Ten messages are expected alongside 14 - presumably the total number of mismatches
        expected = ([
            "Line 15: Chromosome Chr14, position 7387, reference allele 'T' does not match the reference sequence, expected 'C'",
            "Line 18: Chromosome Chr14, position 8795, reference allele 'A' does not match the reference sequence, expected 'G'",
            "Line 19: Chromosome Chr14, position 8796, reference allele 'C' does not match the reference sequence, expected 'T'",
            "Line 20: Chromosome Chr14, position 9033, reference allele 'G' does not match the reference sequence, expected 'A'",
            "Line 22: Chromosome Chr14, position 9539, reference allele 'C' does not match the reference sequence, expected 'T'",
            "Line 24: Chromosome Chr14, position 9558, reference allele 'C' does not match the reference sequence, expected 'T'",
            "Line 38: Chromosome Chr14, position 10200, reference allele 'A' does not match the reference sequence, expected 'c'",
            "Line 49: Chromosome Chr14, position 10875, reference allele 'G' does not match the reference sequence, expected 'C'",
            "Line 54: Chromosome Chr14, position 11665, reference allele 'A' does not match the reference sequence, expected 'T'",
            "Line 55: Chromosome Chr14, position 11839, reference allele 'G' does not match the reference sequence, expected 'a'"
        ], 14)
        assert self.validation.parse_assembly_check_report(
            mismatch_assembly_report) == expected

    def test_parse_vcf_check_report(self):
        """Parsing the failed VCF check report flags it invalid with 8 errors and 1 warning."""
        vcf_check_report = os.path.join(self.resources_folder, 'validations',
                                        'failed_file.vcf.errors.txt')

        valid, error_list, nb_error, nb_warning = self.validation.parse_vcf_check_report(
            vcf_check_report)
        assert valid is False
        assert len(error_list) == 8
        assert nb_error == 8
        assert nb_warning == 1

    def test_report(self):
        """report() prints the whole expected report with a single print call."""
        expected_report = '''Validation performed on 2020-11-01 10:37:54.755607
Metadata check: PASS
VCF check: PASS
Assembly check: PASS
Sample names check: PASS
----------------------------------

Metadata check:
  * /path/to/spreadsheet: PASS
    - number of error: 0
    - error messages: 

----------------------------------

VCF check:
  * test.vcf: PASS
    - number of error: 0
    - number of warning: 2
    - first 10 errors: 
    - see report for detail: /path/to/report

----------------------------------

Assembly check:
  * test.vcf: PASS
    - number of error: 0
    - match results: 20/20 (100.0%)
    - first 10 errors: 
    - first 10 mismatches: 
    - see report for detail: /path/to/report

----------------------------------

Sample names check:
  * a1: PASS
    - Samples that appear in the VCF but not in the Metadata sheet: 
    - Samples that appear in the Metadata sheet but not in the VCF file(s): 

----------------------------------
'''
        with patch('builtins.print') as mprint:
            self.validation.report()
        mprint.assert_called_once_with(expected_report)
class TestEloadValidation(TestCase):
    """Tests for the parsers, the printed report and the VCF merge handling of EloadValidation."""

    resources_folder = os.path.join(ROOT_DIR, 'tests', 'resources')

    def setUp(self) -> None:
        """Load the test submission config, create the validation object and snapshot its config."""
        config_file = os.path.join(self.resources_folder, 'submission_config.yml')
        load_config(config_file)
        # Register the restore before changing directory so the chdir below does not leak
        # into tests that run after this one.
        self.addCleanup(os.chdir, os.getcwd())
        # Need to set the directory so that the relative path set in the config file works from the top directory
        os.chdir(ROOT_DIR)
        self.validation = EloadValidation(2)
        # Used to restore test config after each test
        self.original_cfg = deepcopy(self.validation.eload_cfg.content)

    def tearDown(self):
        """Put back the config content captured in setUp so tests cannot affect each other."""
        self.validation.eload_cfg.content = self.original_cfg

    def test_parse_assembly_check_log_failed(self):
        """Parsing the failed assembly check log yields the expected message list and counters."""
        assembly_check_log = os.path.join(self.resources_folder, 'validations', 'failed_assembly_check.log')
        expected = (
            [" The assembly checking could not be completed: Contig '8' not found in assembly report"],
            1,
            0,
            0
        )
        assert self.validation.parse_assembly_check_log(assembly_check_log) == expected

    def test_parse_assembly_check_report_mismatch(self):
        """Parsing the mismatch report yields the mismatch messages, 14, an empty list and 0."""
        mismatch_assembly_report = os.path.join(self.resources_folder, 'validations', 'mismatch_text_assembly_report.txt')
        expected = (
            [
                "Line 15: Chromosome Chr14, position 7387, reference allele 'T' does not match the reference sequence, expected 'C'",
                "Line 18: Chromosome Chr14, position 8795, reference allele 'A' does not match the reference sequence, expected 'G'",
                "Line 19: Chromosome Chr14, position 8796, reference allele 'C' does not match the reference sequence, expected 'T'",
                "Line 20: Chromosome Chr14, position 9033, reference allele 'G' does not match the reference sequence, expected 'A'",
                "Line 22: Chromosome Chr14, position 9539, reference allele 'C' does not match the reference sequence, expected 'T'",
                "Line 24: Chromosome Chr14, position 9558, reference allele 'C' does not match the reference sequence, expected 'T'",
                "Line 38: Chromosome Chr14, position 10200, reference allele 'A' does not match the reference sequence, expected 'c'",
                "Line 49: Chromosome Chr14, position 10875, reference allele 'G' does not match the reference sequence, expected 'C'",
                "Line 54: Chromosome Chr14, position 11665, reference allele 'A' does not match the reference sequence, expected 'T'",
                "Line 55: Chromosome Chr14, position 11839, reference allele 'G' does not match the reference sequence, expected 'a'"
            ],
            14, [], 0
        )
        assert self.validation.parse_assembly_check_report(mismatch_assembly_report) == expected

    def test_parse_assembly_check_report_duplicate_synonym(self):
        """Parsing the multiple-synonyms report yields no mismatch and the two synonym messages."""
        mismatch_assembly_report = os.path.join(self.resources_folder, 'validations', 'multiple_synonyms_text_assembly_report.txt')
        expected = (
            [], 0,
            [
                "Line 3: Multiple synonyms  found for contig '1' in FASTA index file: CM000663.1 NC_000001.10",
                "Line 4: Multiple synonyms  found for contig 'X' in FASTA index file: CM000685.1 NC_000023.10"
            ],
            2
        )
        assert self.validation.parse_assembly_check_report(mismatch_assembly_report) == expected

    def test_parse_vcf_check_report(self):
        """Parsing the failed VCF check report flags it invalid with 8 errors and 1 warning."""
        vcf_check_report = os.path.join(self.resources_folder, 'validations', 'failed_file.vcf.errors.txt')

        valid, error_list, nb_error, nb_warning = self.validation.parse_vcf_check_report(vcf_check_report)
        assert valid is False
        assert len(error_list) == 8
        assert nb_error == 8
        assert nb_warning == 1

    def test_report(self):
        """report() prints the whole expected report with a single print call."""
        expected_report = '''Validation performed on 2020-11-01 10:37:54.755607
Metadata check: PASS
VCF check: PASS
Assembly check: PASS
Sample names check: PASS
Aggregation check: PASS
----------------------------------

Metadata check:
  * /path/to/spreadsheet: PASS
    - number of error: 0
    - error messages: 

----------------------------------

VCF check:
  * test.vcf: PASS
    - number of error: 0
    - number of warning: 2
    - first 10 errors: 
    - see report for detail: /path/to/report

----------------------------------

Assembly check:
  * test.vcf: PASS
    - number of error: 0
    - match results: 20/20 (100.0%)
    - first 10 errors: 
    - first 10 mismatches: 
    - see report for detail: /path/to/report

----------------------------------

Sample names check:
  * a1: PASS
    - Samples that appear in the VCF but not in the Metadata sheet: 
    - Samples that appear in the Metadata sheet but not in the VCF file(s): 

----------------------------------

Aggregation:
  * a1: none
  * Errors:

----------------------------------

VCF merge:
  Merge types:
  * a1: horizontal

----------------------------------
'''
        with patch('builtins.print') as mprint:
            self.validation.report()
        mprint.assert_called_once_with(expected_report)

    def test_detect_and_optionally_merge(self):
        """The merge type is always recorded; valid files are replaced only when merging is requested."""
        # The config content changed here is restored by tearDown, so no local snapshot is needed
        analysis_alias = 'alias'
        valid_files = ['file1', 'file2']
        merged_files = {analysis_alias: 'merged.vcf.gz'}
        self.validation.eload_cfg.set('validation', 'valid', 'analyses', analysis_alias, 'vcf_files', value=valid_files)

        with patch('eva_submission.eload_validation.detect_merge_type', return_value=MergeType.HORIZONTAL), \
                patch.object(VCFMerger, 'horizontal_merge', return_value=merged_files):
            # Should detect merge type but not actually merge
            self.validation.detect_and_optionally_merge(False)
            self.assertEqual(
                self.validation.eload_cfg.query('validation', 'merge_type', analysis_alias),
                MergeType.HORIZONTAL.value
            )
            self.assertEqual(
                self.validation.eload_cfg.query('validation', 'valid', 'analyses', analysis_alias, 'vcf_files'),
                valid_files
            )
            # Should perform the merge
            self.validation.detect_and_optionally_merge(True)
            self.assertEqual(
                self.validation.eload_cfg.query('validation', 'valid', 'analyses', analysis_alias, 'vcf_files'),
                ['merged.vcf.gz']
            )

    def test_merge_multiple_analyses(self):
        """Each analysis gets the merged file matching its detected merge type, or keeps its files."""
        valid_files = {
            'horizontal': ['h1', 'h2'],
            'vertical': ['v1', 'v2'],
            'neither': ['n1', 'n2']
        }
        # One detection result per analysis, consumed in order by the patched detect_merge_type
        detections = [MergeType.HORIZONTAL, MergeType.VERTICAL, None]
        horiz_merged_files = {'horizontal': 'h.vcf.gz'}
        vert_merged_files = {'vertical': 'v.vcf.gz'}
        for analysis_alias, vcf_files in valid_files.items():
            self.validation.eload_cfg.set('validation', 'valid', 'analyses',
                                          analysis_alias, 'vcf_files', value=vcf_files)

        with patch('eva_submission.eload_validation.detect_merge_type', side_effect=detections), \
                patch.object(VCFMerger, 'horizontal_merge', return_value=horiz_merged_files), \
                patch.object(VCFMerger, 'vertical_merge', return_value=vert_merged_files):
            self.validation.detect_and_optionally_merge(True)
            self.assertEqual(
                self.validation.eload_cfg.query('validation', 'valid', 'analyses', 'horizontal', 'vcf_files'),
                ['h.vcf.gz']
            )
            self.assertEqual(
                self.validation.eload_cfg.query('validation', 'valid', 'analyses', 'vertical', 'vcf_files'),
                ['v.vcf.gz']
            )
            self.assertEqual(
                self.validation.eload_cfg.query('validation', 'valid', 'analyses', 'neither', 'vcf_files'),
                ['n1', 'n2']
            )

    def test_merge_multiple_analyses_same_name(self):
        """Nothing is merged and a merge error is recorded for the aliases 'a!', 'a@' and 'a2'."""
        valid_files = {
            'a!': ['h1', 'h2'],
            'a@': ['v1', 'v2'],
            'a2': ['n1', 'n2']
        }
        detections = [MergeType.HORIZONTAL, MergeType.VERTICAL, None]
        analyses_dict = {
            analysis_alias: {'vcf_files': vcf_files}
            for analysis_alias, vcf_files in valid_files.items()
        }
        self.validation.eload_cfg.set('validation', 'valid', 'analyses', value=analyses_dict)

        with patch('eva_submission.eload_validation.detect_merge_type', side_effect=detections):
            self.validation.detect_and_optionally_merge(True)
            # Valid files should be unchanged even though merge is detected
            self.assertEqual(self.validation.eload_cfg.query('validation', 'valid', 'analyses'), analyses_dict)
            self.assertEqual(
                self.validation.eload_cfg.query('validation', 'merge_errors'),
                ['Analysis aliases not valid as unique merged filenames']
            )
예제 #7
0
def main():
    """Command line entry point: prepare a backlog ELOAD, validate it and print both reports.

    With ``--report`` nothing is prepared or validated and only the reports are
    printed. With ``--keep_config`` the configuration is not filled in again but
    the validation still runs on it.
    """
    selectable_tasks = ['aggregation_check', 'assembly_check', 'vcf_check']
    always_forced_tasks = ['metadata_check', 'sample_check']

    parser = ArgumentParser(description='Prepare to process backlog study and validate VCFs.')
    parser.add_argument('--eload', required=True, type=int,
                        help='The ELOAD number for this submission')
    parser.add_argument('--project_accession', required=False, type=str,
                        help='Set this project instead of the one associated with this eload. '
                             'Useful when the association is not set in the database. '
                             'The project needs to exists in the DB.')
    parser.add_argument('--analysis_accessions', required=False, type=str, nargs='+',
                        help='Set these analysis instead of the ones associated with the project. '
                             'Useful when wanting to use a subset of the analysis. '
                             'The analyses need to exists in the DB.')
    parser.add_argument('--force_config', action='store_true', default=False,
                        help='Overwrite the configuration file after backing it up.')
    parser.add_argument('--keep_config', action='store_true', default=False,
                        help='Keep the configuration file as it is and only run the validation on it.')
    parser.add_argument('--validation_tasks', required=False, type=str, nargs='+',
                        default=selectable_tasks, choices=selectable_tasks,
                        help='task or set of tasks to perform during validation')
    parser.add_argument('--merge_per_analysis', action='store_true', default=False,
                        help='Whether to merge vcf files per analysis if possible.')
    parser.add_argument('--set_as_valid', action='store_true', default=False,
                        help='Set the script to consider all validation tasks performed as valid in the final '
                             'evaluation. This does not affect the actual report but only change the final '
                             'evaluation')
    parser.add_argument('--report', action='store_true', default=False,
                        help='Set the script to only report the results based on previously run preparation.')
    parser.add_argument('--debug', action='store_true', default=False,
                        help='Set the script to output logging information at debug level')

    options = parser.parse_args()

    log_cfg.add_stdout_handler()
    if options.debug:
        log_cfg.set_log_level(logging.DEBUG)

    # No path is given, so the configuration is read from its default location
    load_config()

    # The eload config object of the preparation is handed to the validation so that
    # the two objects share the same state.
    with EloadBacklog(options.eload, project_accession=options.project_accession,
                      analysis_accessions=options.analysis_accessions) as preparation, \
            EloadValidation(options.eload, preparation.eload_cfg) as validation:
        if not options.report:
            if not options.keep_config:
                preparation.fill_in_config(options.force_config)

            validation.validate(options.validation_tasks)

            # Also mark the other validation tasks as forced so they are all passable;
            # with --set_as_valid every validation task is forced.
            tasks_to_force = validation.all_validation_tasks if options.set_as_valid else always_forced_tasks
            for task_name in tasks_to_force:
                validation.eload_cfg.set('validation', task_name, 'forced', value=True)

            validation.mark_valid_files_and_metadata(options.merge_per_analysis)
            if options.merge_per_analysis:
                preparation.copy_valid_config_to_brokering_after_merge()

        preparation.report()
        validation.report()
        logger.info('Preparation complete, if files are valid please run ingestion as normal.')