예제 #1
0
    def _create_text_files(self):
        """
        Make sure plain-text versions of the test set files exist.

        Nothing happens for test sets that are already plain text. For SGML
        test sets both the reference and the source file are converted. For
        XML test sets the source text and one reference are extracted.
        A text file that already exists on disk is never regenerated.
        """
        fmt = self.file_format

        if fmt == TEXT_FILE:
            return

        if fmt == SGML_FILE:
            for handle in (self.ref_file, self.src_file):
                sgml_name = str(handle.name)
                text_name = sgml_name.replace('.sgm', '.txt')
                if Path(text_name).exists():
                    continue
                process_to_text(sgml_name, text_name)
            return

        if fmt != XML_FILE:
            return

        # Source side of the XML test set
        src_xml = str(self.src_file.name)
        src_txt = src_xml.replace('.xml', '.txt')
        if not Path(src_txt).exists():
            # Validation is expected to guarantee a single source language,
            # so popping any element of the set is enough
            languages, _, _, _ = analyze_xml_file(src_xml)
            process_xml_to_text(src_xml, src_txt, source=languages.pop())

        # Reference side of the XML test set
        ref_xml = str(self.ref_file.name)
        ref_txt = ref_xml.replace('.xml', '.txt')
        if not Path(ref_txt).exists():
            _, _, translator_ids, _ = analyze_xml_file(ref_xml)
            # Only one reference is extracted: the translator that comes
            # first in sorted order, which keeps the choice reproducible
            first_translator = sorted(translator_ids)[0]
            process_xml_to_text(ref_xml, ref_txt, reference=first_translator)
예제 #2
0
    def test_analyze_xml_file_with_hypothesis(self):
        """Source language and system name are found in a hypothesis XML."""
        sample = TESTDATA_DIR + '/xml/sample-hyp.xml'
        found_src_langs, _, _, found_systems = analyze_xml_file(sample)

        self.assertSetEqual(found_src_langs, {'en'})
        self.assertSetEqual(found_systems, {'test-team'})
예제 #3
0
    def test_analyze_xml_file_with_multi_reference_testset(self):
        """Both translators are found in an XML test set with two references."""
        sample = TESTDATA_DIR + '/xml/sample-src-multirefs.xml'
        found_src, found_ref, found_translators, _ = analyze_xml_file(sample)

        self.assertSetEqual(found_src, {'en'})
        self.assertSetEqual(found_ref, {'ha'})
        self.assertSetEqual(found_translators, {'A', 'B'})
예제 #4
0
    def test_analyze_xml_file_with_testset(self):
        """Source, reference and translator are found in an XML test set."""
        sample = TESTDATA_DIR + '/xml/sample-src-ref.xml'
        found_src, found_ref, found_translators, _ = analyze_xml_file(sample)

        self.assertSetEqual(found_src, {'en'})
        self.assertSetEqual(found_ref, {'ha'})
        self.assertSetEqual(found_translators, {'A'})
예제 #5
0
    def get_hyp_text(self, path_only=False):
        """Returns the hypothesis segments or the path to their text file.

        For SGML and XML submissions a plain-text version of the hypothesis
        file is created on first use; its path is derived from the hypothesis
        path by swapping the file extension. An existing text file is reused.
        Plain-text submissions are read as they are.

        Args:
            path_only (bool): Return a path to the hypothesis text file
                instead of its segments

        Returns:
            generator/str: A generator over the lines of the hypothesis text
                file (line endings included) unless path_only, and the file
                path otherwise. The file is closed as soon as the generator
                is exhausted or closed.
        """

        def _read_lines(handle):
            # Close the file once all lines are consumed or the generator is
            # closed, instead of leaving the handle to garbage collection
            with handle:
                for line in handle:
                    yield line

        hyp_path = self.hyp_file.name

        if self.file_format == SGML_FILE:
            if self.test_set.file_format == SGML_FILE:
                hyp_filtered_path = hyp_path.replace('.sgm', '.filtered.sgm')
                if not Path(hyp_filtered_path).exists():
                    # Get docids from ref SGML path -- these are non "testsuite-"
                    ref_docids = Submission._get_docids_from_path(
                        self.test_set.ref_file.name)

                    # Filter hyp SGML in matching order, skipping testsuite-* docs
                    hyp_filtered_path = Submission._filter_sgml_by_docids(
                        self.hyp_file.name,
                        ref_docids,
                    )
            else:
                # Test set is not SGML: no reference docids to filter by
                hyp_filtered_path = hyp_path

            # Create text version of (possibly filtered) hyp SGML
            hyp_text_path = hyp_filtered_path.replace('.sgm', '.txt')
            if not Path(hyp_text_path).exists():
                process_to_text(hyp_filtered_path, hyp_text_path)

        elif self.file_format == XML_FILE:
            hyp_text_path = hyp_path.replace('.xml', '.txt')
            if not Path(hyp_text_path).exists():
                _, _, _, sys_names = analyze_xml_file(hyp_path)
                # It should never happen that there is no system translations
                # thanks to validation, but better to check
                if sys_names:
                    process_xml_to_text(hyp_path,
                                        hyp_text_path,
                                        system=sys_names.pop())

        elif self.file_format == TEXT_FILE:
            hyp_text_path = hyp_path

        if path_only:
            return hyp_text_path

        # Open eagerly so that a missing file is still reported at call time
        return _read_lines(open(hyp_text_path, encoding='utf-8'))
예제 #6
0
def validate_xml_ref_testset(xml_file):
    """Check that an XML test set file holds references in one language.

    Files whose name does not end with '.xml' are accepted without checks.

    Raises:
        ValidationError: If no reference language or no translator is found,
            or if more than one reference language is found.
    """
    is_xml = xml_file.name.endswith('.xml')
    if not is_xml:
        return  # Other formats are not validated here

    _, languages, translator_ids, _ = analyze_xml_file(xml_file)

    if not (languages and translator_ids):
        raise ValidationError(
            'No reference found in the XML file {0}'.format(xml_file.name))

    if len(languages) > 1:
        raise ValidationError(
            'XML files with multiple reference languages are not supported')
예제 #7
0
def validate_xml_src_testset(xml_file):
    """Check that an XML test set file holds exactly one source language.

    Files whose name does not end with '.xml' are accepted without checks.

    Raises:
        ValidationError: If no source language is found, or if more than one
            source language is found.
    """
    is_xml = xml_file.name.endswith('.xml')
    if not is_xml:
        return  # Other formats are not validated here

    languages, _, _, _ = analyze_xml_file(xml_file)
    language_count = len(languages)

    if language_count == 0:
        raise ValidationError(
            'No source language found in the XML file {0}'.format(
                xml_file.name))

    if language_count > 1:
        raise ValidationError(
            'XML files with multiple source languages are not supported')
예제 #8
0
def validate_xml_submission(xml_file):
    """Check that an XML submission holds translations from one system.

    Files whose name does not end with '.xml' are accepted without checks.
    XML files are first checked against the schema and then analyzed for
    system names.

    Raises:
        ValidationError: If no system is found, or if more than one system
            is found.
    """
    is_xml = xml_file.name.endswith('.xml')
    if not is_xml:
        return  # Other formats are not validated here

    validate_xml_schema(xml_file)
    # Rewind so that the file can be read again for the analysis below
    xml_file.seek(0)

    # A valid submission contains translations from exactly one system
    _, _, _, system_names = analyze_xml_file(xml_file)
    system_count = len(system_names)

    if system_count == 0:
        raise ValidationError(
            'No system found in the XML file {0}'.format(xml_file.name))

    if system_count > 1:
        raise ValidationError(
            'XML files with multiple systems are not supported')