def _create_text_files(self):
    """
    Make sure plain-text versions of the source and reference files exist.

    Nothing is done for test sets that are already plain text. For SGML and
    XML test sets, a '.txt' counterpart is written for each file unless a
    file with that name is already present on disk.
    """
    fmt = self.file_format
    if fmt == TEXT_FILE:
        return

    if fmt == SGML_FILE:
        for handle in (self.ref_file, self.src_file):
            sgml_name = str(handle.name)
            txt_name = sgml_name.replace('.sgm', '.txt')
            if Path(txt_name).exists():
                continue
            process_to_text(sgml_name, txt_name)
        return

    if fmt != XML_FILE:
        return

    # Source side: write the text of the (single) source language.
    source_xml = str(self.src_file.name)
    source_txt = source_xml.replace('.xml', '.txt')
    if not Path(source_txt).exists():
        # Upstream validation is expected to leave exactly one source
        # language, so popping any element of the set is unambiguous.
        languages, _, _, _ = analyze_xml_file(source_xml)
        process_xml_to_text(
            source_xml, source_txt, source=languages.pop())

    # Reference side: only one translator's reference is extracted.
    reference_xml = str(self.ref_file.name)
    reference_txt = reference_xml.replace('.xml', '.txt')
    if not Path(reference_txt).exists():
        _, _, translators, _ = analyze_xml_file(reference_xml)
        # Picking the alphabetically first translator keeps the choice
        # reproducible across runs when several references are present.
        first_translator = sorted(translators)[0]
        process_xml_to_text(
            reference_xml, reference_txt, reference=first_translator)
def test_analyze_xml_file_with_hypothesis(self):
    """Verify that system names are detected in an XML hypothesis file."""
    sample_path = TESTDATA_DIR + '/xml/sample-hyp.xml'
    expected_sources = {'en'}
    expected_systems = {'test-team'}

    found_sources, _, _, found_systems = analyze_xml_file(sample_path)

    self.assertSetEqual(found_sources, expected_sources)
    self.assertSetEqual(found_systems, expected_systems)
def test_analyze_xml_file_with_multi_reference_testset(self):
    """Verify that every translator is detected in a multi-reference XML."""
    sample_path = TESTDATA_DIR + '/xml/sample-src-multirefs.xml'
    expected_sources = {'en'}
    expected_targets = {'ha'}
    expected_translators = {'A', 'B'}

    found_sources, found_targets, found_translators, _ = analyze_xml_file(
        sample_path)

    self.assertSetEqual(found_sources, expected_sources)
    self.assertSetEqual(found_targets, expected_targets)
    self.assertSetEqual(found_translators, expected_translators)
def test_analyze_xml_file_with_testset(self):
    """Verify that source and reference are detected in an XML test set."""
    sample_path = TESTDATA_DIR + '/xml/sample-src-ref.xml'
    expected_sources = {'en'}
    expected_targets = {'ha'}
    expected_translators = {'A'}

    found_sources, found_targets, found_translators, _ = analyze_xml_file(
        sample_path)

    self.assertSetEqual(found_sources, expected_sources)
    self.assertSetEqual(found_targets, expected_targets)
    self.assertSetEqual(found_translators, expected_translators)
def get_hyp_text(self, path_only=False):
    """Return the hypothesis segments, or the path to their text file.

    SGML and XML hypotheses are converted to a '.txt' file on first use;
    later calls reuse the file that is already on disk.

    Args:
        path_only (bool): Return the path to the hypothesis text file
            instead of its contents.

    Returns:
        generator/str: The text file path if path_only; otherwise a
        generator yielding the lines of that file (line endings are kept).
        The file is opened immediately and closed as soon as the generator
        is exhausted or closed.

    Raises:
        ValueError: If self.file_format is not SGML_FILE, XML_FILE or
            TEXT_FILE.
    """
    hyp_path = self.hyp_file.name

    if self.file_format == SGML_FILE:
        if self.test_set.file_format == SGML_FILE:
            hyp_filtered_path = hyp_path.replace('.sgm', '.filtered.sgm')
            if not Path(hyp_filtered_path).exists():
                # Get docids from ref SGML path -- these are non
                # "testsuite-" documents
                ref_docids = Submission._get_docids_from_path(
                    self.test_set.ref_file.name)

                # Filter hyp SGML in matching order, skipping
                # testsuite-* docs
                hyp_filtered_path = Submission._filter_sgml_by_docids(
                    self.hyp_file.name,
                    ref_docids,
                )
        else:
            hyp_filtered_path = hyp_path

        # Create text version of (possibly filtered) hyp SGML
        hyp_text_path = hyp_filtered_path.replace('.sgm', '.txt')
        if not Path(hyp_text_path).exists():
            process_to_text(hyp_filtered_path, hyp_text_path)

    elif self.file_format == XML_FILE:
        hyp_text_path = hyp_path.replace('.xml', '.txt')
        if not Path(hyp_text_path).exists():
            _, _, _, sys_names = analyze_xml_file(hyp_path)
            # Validation should guarantee at least one system, but if
            # there is none the text file is simply not generated (and
            # opening it below will fail).
            if len(sys_names) > 0:
                process_xml_to_text(
                    hyp_path, hyp_text_path, system=sys_names.pop())

    elif self.file_format == TEXT_FILE:
        hyp_text_path = hyp_path

    else:
        # Previously this fell through to an UnboundLocalError on
        # hyp_text_path; fail with an explicit message instead.
        raise ValueError(
            'Unsupported hypothesis file format: {0!r}'.format(
                self.file_format))

    if path_only:
        return hyp_text_path

    def _iter_lines(handle):
        # The with-block closes the file once iteration finishes or the
        # generator is closed, instead of leaking the handle.
        with handle:
            for line in handle:
                yield line

    # Open eagerly so a missing file is still reported at call time.
    return _iter_lines(open(hyp_text_path, encoding='utf-8'))
def validate_xml_ref_testset(xml_file):
    """Check the reference side of an XML test set file.

    Files whose name does not end in '.xml' are accepted without checks.

    Raises:
        ValidationError: If no reference language or no translator is
            found, or if more than one reference language is found.
    """
    if xml_file.name.endswith('.xml'):
        _, languages, translator_ids, _ = analyze_xml_file(xml_file)

        if len(languages) < 1 or len(translator_ids) < 1:
            raise ValidationError(
                'No reference found in the XML file {0}'.format(
                    xml_file.name))

        if len(languages) > 1:
            raise ValidationError(
                'XML files with multiple reference languages are not supported')
def validate_xml_src_testset(xml_file):
    """Check the source side of an XML test set file.

    Files whose name does not end in '.xml' are accepted without checks.

    Raises:
        ValidationError: If the file has no source language or more than
            one source language.
    """
    if xml_file.name.endswith('.xml'):
        languages, _, _, _ = analyze_xml_file(xml_file)
        language_count = len(languages)

        if language_count < 1:
            raise ValidationError(
                'No source language found in the XML file {0}'.format(
                    xml_file.name))

        if language_count > 1:
            raise ValidationError(
                'XML files with multiple source languages are not supported')
def validate_xml_submission(xml_file):
    """Check a submission uploaded in XML format.

    Files whose name does not end in '.xml' are accepted without checks.
    XML files are first validated against the schema and must then contain
    translations from exactly one system.

    Raises:
        ValidationError: If the file has no system or more than one system.
    """
    if xml_file.name.endswith('.xml'):
        validate_xml_schema(xml_file)
        # Rewind so the file can be read again by the analysis below.
        xml_file.seek(0)

        _, _, _, system_names = analyze_xml_file(xml_file)
        system_count = len(system_names)

        if system_count < 1:
            raise ValidationError(
                'No system found in the XML file {0}'.format(xml_file.name))

        if system_count > 1:
            raise ValidationError(
                'XML files with multiple systems are not supported')