예제 #1
0
파일: tests.py 프로젝트: gnarph/DIRT
class DocumentTest(unittest.TestCase):
    """
    Tests for Document: cloning and equality, conversion to a
    dictionary, and rejection of invalid input by Document.from_json
    """
    file_name = u'models/test_data/lorem.json'
    meta = {
        'title': u'test 稢綌',
        'author': u'gorden 胇赲',
    }
    body = u'In id tristique orci. 痵痽 犵艿邔 疿疶砳 齸圞趲.'
    pre_file_name = file_name + '_PRE.json'
    raw_file_name = file_name

    def setUp(self):
        # Every test starts from a Document built from the class fixtures
        fixture = dict(file_name=self.file_name,
                       metadata=self.meta,
                       pre_file_name=self.pre_file_name,
                       raw_file_name=self.raw_file_name)
        self.doc = Document(**fixture)

    def test_clone(self):
        """
        A clone matches the original attribute by attribute and as a
        whole, and stops being equal once any compared field differs
        """
        original = self.doc
        doc_cloned = original.clone()
        copied = ('file_name', 'pre_file_name', 'raw_file_name',
                  'metadata', 'raw_body')
        for attr in copied:
            self.assertEqual(getattr(doc_cloned, attr),
                             getattr(original, attr))
        self.assertEqual(original, doc_cloned)

        # assertFalse on an explicit == (rather than assertNotEqual)
        # so that __eq__ is the method under test
        overrides = (('file_name', u'nope'),
                     ('metadata', None),
                     ('raw_file_name', ''),
                     ('pre_file_name', ''))
        last_changed = None
        for attr, wrong_value in overrides:
            if last_changed is not None:
                # Undo the previous override so only one field differs
                setattr(doc_cloned, last_changed,
                        getattr(original, last_changed))
            setattr(doc_cloned, attr, wrong_value)
            self.assertFalse(original == doc_cloned)
            last_changed = attr

    def test_to_dict(self):
        """
        The dictionary form (used for json serialization) carries the
        document's file name, metadata and preprocessed file name
        """
        doc_dict = self.doc.to_dict()
        for key in ('file_name', 'metadata', 'pre_file_name'):
            self.assertEqual(doc_dict[key], getattr(self.doc, key))
        # TODO check raw

    def test_open(self):
        """
        Document.from_json raises InvalidDocumentException for the
        invalid fixture files
        """
        with self.assertRaises(InvalidDocumentException):
            Document.from_json('models/test_data/invalid.json')
        with self.assertRaises(InvalidDocumentException):
            Document.from_json('models/test_data/invalid.txt')
예제 #2
0
파일: preprocessor.py 프로젝트: gnarph/DIRT
    def process(self):
        """
        Preprocess self.file_name, unless its output json already exists

        Writes the raw text and the standardized text under the 'raw'
        and 'pre' subdirectories of self.output_dir, then writes a json
        file describing the resulting models.Document and logs how long
        the run took. Inputs ending in .tei or .xml are read through
        TEIReader (which also supplies the metadata); any other input is
        read with file_ops.read_utf8 and gets empty metadata.
        """
        started = time.time()
        source = self.file_name
        base_name = path.get_name(source, extension=False)
        json_path = os.path.join(self.output_dir,
                                 base_name + PREPROCESS_SUFFIX)
        if file_ops.exists(json_path):
            # Output json is already present, nothing to do
            return

        if source.endswith(('.tei', '.xml')):
            raw_text, metadata = TEIReader(source).read()
        else:
            raw_text, metadata = file_ops.read_utf8(source), {}

        # The raw and preprocessed copies share one file name
        plain_name = base_name + PLAIN_SUFFIX

        raw_path = os.path.join(self.output_dir, 'raw' + os.sep, plain_name)
        file_ops.write_utf8(raw_path, raw_text)

        standardized = self.standardizer.standardize(raw_text)
        pre_path = os.path.join(self.output_dir, 'pre' + os.sep, plain_name)
        file_ops.write_utf8(pre_path, standardized)

        document = Document(file_name=self.file_name,
                            raw_file_name=raw_path,
                            pre_file_name=pre_path,
                            metadata=metadata)
        file_ops.write_json_utf8(json_path, document.to_dict())

        elapsed = time.time() - started
        self._log_duration(elapsed, self.file_name, len(raw_text))
예제 #3
0
    def process(self):
        """
        Run preprocessing for self.file_name

        Does nothing when the output json for this input is already in
        self.output_dir. Otherwise stores the raw text in the 'raw'
        subdirectory, the standardized text in the 'pre' subdirectory,
        and a json representation of the models.Document next to them,
        then logs the elapsed time.
        """
        t_begin = time.time()
        stem = path.get_name(self.file_name, extension=False)
        in_path = self.file_name
        document_path = os.path.join(self.output_dir,
                                     stem + PREPROCESS_SUFFIX)
        if file_ops.exists(document_path):
            # A previous run already produced the json for this input
            return

        # TEI/XML inputs go through TEIReader, which also yields metadata;
        # everything else is read as-is with no metadata
        is_markup = in_path.endswith(('.tei', '.xml'))
        if not is_markup:
            metadata = {}
            raw_text = file_ops.read_utf8(in_path)
        else:
            raw_text, metadata = TEIReader(in_path).read()

        text_name = stem + PLAIN_SUFFIX

        raw_target = os.path.join(self.output_dir, 'raw' + os.sep, text_name)
        file_ops.write_utf8(raw_target, raw_text)

        cleaned_text = self.standardizer.standardize(raw_text)
        pre_target = os.path.join(self.output_dir, 'pre' + os.sep, text_name)
        file_ops.write_utf8(pre_target, cleaned_text)

        result = Document(file_name=self.file_name,
                          raw_file_name=raw_target,
                          pre_file_name=pre_target,
                          metadata=metadata)
        as_dict = result.to_dict()
        file_ops.write_json_utf8(document_path, as_dict)

        self._log_duration(time.time() - t_begin,
                           self.file_name,
                           len(raw_text))
예제 #4
0
파일: tests.py 프로젝트: gnarph/DIRT
class DocumentTest(unittest.TestCase):
    """
    Exercises Document.clone / __eq__, Document.to_dict and the error
    path of Document.from_json
    """
    file_name = u'models/test_data/lorem.json'
    meta = {'title': u'test 稢綌', 'author': u'gorden 胇赲'}
    body = u'In id tristique orci. 痵痽 犵艿邔 疿疶砳 齸圞趲.'
    pre_file_name = file_name + '_PRE.json'
    raw_file_name = file_name

    def setUp(self):
        # Fresh Document per test, built from the class-level fixtures
        self.doc = Document(
            file_name=self.file_name,
            metadata=self.meta,
            pre_file_name=self.pre_file_name,
            raw_file_name=self.raw_file_name,
        )

    def test_clone(self):
        """
        Cloning yields an equal document; changing any one of the
        compared fields on the clone breaks equality
        """
        source = self.doc
        doc_cloned = source.clone()
        for name in ('file_name', 'pre_file_name', 'raw_file_name',
                     'metadata', 'raw_body'):
            self.assertEqual(getattr(doc_cloned, name),
                             getattr(source, name))
        self.assertEqual(source, doc_cloned)

        # Compare with an explicit == inside assertFalse, not
        # assertNotEqual, because __eq__ is what is being tested
        steps = [
            ('file_name', u'nope'),
            ('metadata', None),
            ('raw_file_name', ''),
            ('pre_file_name', ''),
        ]
        for index, (name, replacement) in enumerate(steps):
            if index:
                # Put the previously altered field back first
                earlier = steps[index - 1][0]
                setattr(doc_cloned, earlier, getattr(source, earlier))
            setattr(doc_cloned, name, replacement)
            self.assertFalse(source == doc_cloned)

    def test_to_dict(self):
        """
        to_dict (used for json serialization) exposes file_name,
        metadata and pre_file_name unchanged
        """
        doc_dict = self.doc.to_dict()
        expected_keys = ['file_name', 'metadata', 'pre_file_name']
        for key in expected_keys:
            self.assertEqual(doc_dict[key], getattr(self.doc, key))
        # TODO check raw

    def test_open(self):
        """
        Opening an invalid Document json raises InvalidDocumentException
        """
        invalid_files = ('models/test_data/invalid.json',
                         'models/test_data/invalid.txt')
        for invalid_file in invalid_files:
            self.assertRaises(InvalidDocumentException,
                              Document.from_json,
                              invalid_file)