class DocumentTest(unittest.TestCase):
    """
    Unit tests for Document: cloning and equality, conversion to a
    dictionary, and rejection of invalid input files.
    """

    file_name = u'models/test_data/lorem.json'
    meta = {
        'title': u'test 稢綌',
        'author': u'gorden 胇赲',
    }
    body = u'In id tristique orci. 痵痽 犵艿邔 疿疶砳 齸圞趲.'
    pre_file_name = file_name + '_PRE.json'
    raw_file_name = file_name

    def setUp(self):
        """
        Build a fresh Document from the class-level fixture values
        """
        self.doc = Document(
            file_name=self.file_name,
            metadata=self.meta,
            pre_file_name=self.pre_file_name,
            raw_file_name=self.raw_file_name,
        )

    def test_clone(self):
        """
        A clone matches the original attribute by attribute and as a
        whole; changing any single compared attribute breaks equality
        """
        original = self.doc
        copy = original.clone()

        compared = ('file_name', 'pre_file_name', 'raw_file_name',
                    'metadata', 'raw_body')
        for attribute in compared:
            self.assertEqual(getattr(copy, attribute),
                             getattr(original, attribute))
        self.assertEqual(original, copy)

        # assertFalse(a == b) is used on purpose (rather than
        # assertNotEqual) so that __eq__ is the method under test

        # differing file name
        copy.file_name = u'nope'
        self.assertFalse(original == copy)
        copy.file_name = original.file_name

        # differing metadata
        copy.metadata = None
        self.assertFalse(original == copy)
        copy.metadata = original.metadata

        # differing raw file name
        copy.raw_file_name = ''
        self.assertFalse(original == copy)
        copy.raw_file_name = original.raw_file_name

        # differing preprocessed file name
        copy.pre_file_name = ''
        self.assertFalse(original == copy)

    def test_to_dict(self):
        """
        The dictionary form (used for json serialization) carries the
        document's file name, metadata and preprocessed file name
        """
        as_dict = self.doc.to_dict()
        for key in ('file_name', 'metadata', 'pre_file_name'):
            self.assertEqual(as_dict[key], getattr(self.doc, key))
        # TODO check raw

    def test_open(self):
        """
        Document.from_json raises InvalidDocumentException for the
        invalid fixture files
        """
        invalid_inputs = (
            'models/test_data/invalid.json',
            'models/test_data/invalid.txt',
        )
        for bad_path in invalid_inputs:
            self.assertRaises(InvalidDocumentException,
                              Document.from_json, bad_path)
def process(self):
    """
    Preprocess ``self.file_name`` and store the results in
    ``self.output_dir``

    Writes a raw text file (under ``raw``), a standardized text file
    (under ``pre``) and a json file holding the dictionary form of the
    resulting models.Document. Does nothing if the json output file
    already exists. Input files ending in ``.tei`` or ``.xml`` are read
    through TEIReader; anything else is read as UTF-8 text with empty
    metadata. The elapsed time is reported via ``self._log_duration``.
    """
    started_at = time.time()

    source = self.file_name
    base_name = path.get_name(source, extension=False)
    json_target = os.path.join(self.output_dir,
                               base_name + PREPROCESS_SUFFIX)

    if file_ops.exists(json_target):
        # Already preprocessed
        return

    # Obtain the raw text and whatever metadata the input format offers
    if source.endswith(('.tei', '.xml')):
        raw_text, metadata = TEIReader(source).read()
    else:
        raw_text, metadata = file_ops.read_utf8(source), {}

    plain_name = base_name + PLAIN_SUFFIX

    # Raw version of the input
    raw_file = os.path.join(self.output_dir, 'raw' + os.sep, plain_name)
    file_ops.write_utf8(raw_file, raw_text)

    # Standardized (preprocessed) version of the input
    pre_file = os.path.join(self.output_dir, 'pre' + os.sep, plain_name)
    file_ops.write_utf8(pre_file, self.standardizer.standardize(raw_text))

    # json description tying the input to the two generated files
    document = Document(file_name=source,
                        raw_file_name=raw_file,
                        pre_file_name=pre_file,
                        metadata=metadata)
    file_ops.write_json_utf8(json_target, document.to_dict())

    elapsed = time.time() - started_at
    self._log_duration(elapsed, source, len(raw_text))
class DocumentTest(unittest.TestCase):
    """
    Unit tests for Document: cloning and equality, conversion to a
    dictionary, and rejection of invalid input files.
    """

    file_name = u'models/test_data/lorem.json'
    meta = {
        'title': u'test 稢綌',
        'author': u'gorden 胇赲',
    }
    body = u'In id tristique orci. 痵痽 犵艿邔 疿疶砳 齸圞趲.'
    pre_file_name = file_name + '_PRE.json'
    raw_file_name = file_name

    def setUp(self):
        """
        Build a fresh Document from the class-level fixture values
        """
        self.doc = Document(
            file_name=self.file_name,
            metadata=self.meta,
            pre_file_name=self.pre_file_name,
            raw_file_name=self.raw_file_name,
        )

    def test_clone(self):
        """
        A clone matches the original attribute by attribute and as a
        whole; changing any single compared attribute breaks equality
        """
        original = self.doc
        copy = original.clone()

        compared = ('file_name', 'pre_file_name', 'raw_file_name',
                    'metadata', 'raw_body')
        for attribute in compared:
            self.assertEqual(getattr(copy, attribute),
                             getattr(original, attribute))
        self.assertEqual(original, copy)

        # assertFalse(a == b) is used on purpose (rather than
        # assertNotEqual) so that __eq__ is the method under test

        # differing file name
        copy.file_name = u'nope'
        self.assertFalse(original == copy)
        copy.file_name = original.file_name

        # differing metadata
        copy.metadata = None
        self.assertFalse(original == copy)
        copy.metadata = original.metadata

        # differing raw file name
        copy.raw_file_name = ''
        self.assertFalse(original == copy)
        copy.raw_file_name = original.raw_file_name

        # differing preprocessed file name
        copy.pre_file_name = ''
        self.assertFalse(original == copy)

    def test_to_dict(self):
        """
        The dictionary form (used for json serialization) carries the
        document's file name, metadata and preprocessed file name
        """
        as_dict = self.doc.to_dict()
        for key in ('file_name', 'metadata', 'pre_file_name'):
            self.assertEqual(as_dict[key], getattr(self.doc, key))
        # TODO check raw

    def test_open(self):
        """
        Document.from_json raises InvalidDocumentException for the
        invalid fixture files
        """
        invalid_inputs = (
            'models/test_data/invalid.json',
            'models/test_data/invalid.txt',
        )
        for bad_path in invalid_inputs:
            self.assertRaises(InvalidDocumentException,
                              Document.from_json, bad_path)