def extract_data(self):
    """Parse every patent file and split the valid ones into train/test lists.

    Files are obtained from ``get_files`` for the ``patents`` directory under
    ``self.working_dir`` with the ``".XML"`` suffix. Each file is parsed with
    an ``Extractor`` built from ``self.train_destination``. A parsed patent
    accepted by ``self.is_patent_valid`` goes to the test list with a
    1-in-10 chance and to the train list otherwise. Both lists are finally
    handed to ``self.save_list`` together with ``self.test_destination`` and
    ``self.train_destination``.

    Any exception raised while handling a single file is logged and the loop
    moves on to the next file.
    """
    self.logger.info("extracting data")
    extractor = Extractor(self.train_destination)
    patents = get_files(join(self.working_dir, "patents"), ".XML")

    train_patent_list = []
    test_patent_list = []
    num_of_valid_patents = 0
    num_of_unvalid_patents = 0

    for patent in patents:
        self.logger.info("extracting %s", patent)
        try:
            parsed_patent = extractor.parse(patent)
            if not self.is_patent_valid(parsed_patent):
                num_of_unvalid_patents += 1
                continue

            num_of_valid_patents += 1
            # Report progress once per 1000 valid patents. The counter grows
            # on every valid patent, so the message is emitted exactly once
            # per milestone instead of repeating while a list length stalls.
            if num_of_valid_patents % 1000 == 0:
                self.logger.info("train_patent_list has length %d",
                                 len(train_patent_list))

            if randint(1, 10) == 10:  # 10% chance
                test_patent_list.append(parsed_patent)
            else:
                train_patent_list.append(parsed_patent)
        except Exception as e:
            # ``message`` is not available on every exception (and not at all
            # on Python 3 built-ins); fall back to the string form so the
            # handler itself can never raise AttributeError.
            self.logger.error(getattr(e, "message", str(e)))
            self.logger.error(
                "Number of valid patents was %d, number of unvalid patents was %d",
                num_of_valid_patents, num_of_unvalid_patents)

    self.save_list(test_patent_list, self.test_destination)
    self.save_list(train_patent_list, self.train_destination)

    self.logger.info(
        "Final number of valid patents was %d, number of unvalid patents was %d",
        num_of_valid_patents, num_of_unvalid_patents)
    self.logger.info("Total number of test examples is %d", len(test_patent_list))
def extract_data(self):
    """Parse every patent file and split the valid ones into train/test lists.

    Files are obtained from ``get_files`` for the ``patents`` directory under
    ``self.working_dir`` with the ``".XML"`` suffix. Each file is parsed with
    an ``Extractor`` built from ``self.train_destination``. A parsed patent
    accepted by ``self.is_patent_valid`` goes to the test list with a
    1-in-10 chance and to the train list otherwise. Both lists are finally
    handed to ``self.save_list`` together with ``self.test_destination`` and
    ``self.train_destination``.

    Any exception raised while handling a single file is logged and the loop
    moves on to the next file.
    """
    self.logger.info("extracting data")
    extractor = Extractor(self.train_destination)
    patents = get_files(join(self.working_dir, "patents"), ".XML")

    train_patent_list = []
    test_patent_list = []
    num_of_valid_patents = 0
    num_of_unvalid_patents = 0

    for patent in patents:
        self.logger.info("extracting %s", patent)
        try:
            parsed_patent = extractor.parse(patent)
            if not self.is_patent_valid(parsed_patent):
                num_of_unvalid_patents += 1
                continue

            num_of_valid_patents += 1
            # Report progress once per 1000 valid patents. The counter grows
            # on every valid patent, so the message is emitted exactly once
            # per milestone instead of repeating while a list length stalls.
            if num_of_valid_patents % 1000 == 0:
                self.logger.info("train_patent_list has length %d",
                                 len(train_patent_list))

            if randint(1, 10) == 10:  # 10% chance
                test_patent_list.append(parsed_patent)
            else:
                train_patent_list.append(parsed_patent)
        except Exception as e:
            # ``message`` is not available on every exception (and not at all
            # on Python 3 built-ins); fall back to the string form so the
            # handler itself can never raise AttributeError.
            self.logger.error(getattr(e, "message", str(e)))
            self.logger.error(
                "Number of valid patents was %d, number of unvalid patents was %d",
                num_of_valid_patents, num_of_unvalid_patents)

    self.save_list(test_patent_list, self.test_destination)
    self.save_list(train_patent_list, self.train_destination)

    self.logger.info(
        "Final number of valid patents was %d, number of unvalid patents was %d",
        num_of_valid_patents, num_of_unvalid_patents)
    self.logger.info("Total number of test examples is %d", len(test_patent_list))
class TestExtractor(unittest.TestCase):
    """Tests for ``Extractor.parse`` using the XML fixtures shipped in
    ``patent_parsing_tools.tests``."""

    def setUp(self):
        """Build the ``Extractor`` instance shared by the test methods."""
        self.extractor = Extractor("test_data")

    def tearDown(self):
        """Nothing to clean up."""
        pass

    def test_should_load_json_file(self):
        """The extractor's ``structure`` has a ``documentID`` entry for the
        ``us-patent-grant-v44-2013-05-16.dtd`` key."""
        self.assertIsNotNone(
            self.extractor.structure["us-patent-grant-v44-2013-05-16.dtd"]["documentID"])

    def test_xpaths(self):
        """Parsed fields equal the text of the first node found directly
        through the paths stored in ``structure`` for the file's DTD."""
        inputfile = resource_filename("patent_parsing_tools.tests",
                                      "US08613112-20131224.XML")
        tree = ET.parse(inputfile)
        root = tree.getroot()
        # The structure entry is looked up by the system URL of the
        # document's internal DTD.
        dtdStructure = self.extractor.structure[tree.docinfo.internalDTD.system_url]

        patent = self.extractor.parse(inputfile)

        self.assertEqual(patent.documentID,
                         root.findall(dtdStructure["documentID"])[0].text)
        self.assertEqual(patent.title,
                         root.findall(dtdStructure["inventionTitle"])[0].text)
        self.assertEqual(patent.date,
                         root.findall(dtdStructure["date"])[0].text)
        self.assertIsNotNone(patent.abstract)
        self.assertIsNotNone(patent.description)
        self.assertIsNotNone(patent.claims)

    def test_xml_structures(self):
        """Every listed fixture parses into a patent with all fields set."""
        inputfiles = ["US08613112-20131224.XML", "US08927386-20150106.XML"]
        for inputfile in inputfiles:
            patent = self.extractor.parse(
                resource_filename("patent_parsing_tools.tests", inputfile))
            self.assertIsNotNone(patent.documentID)
            self.assertIsNotNone(patent.title)
            self.assertIsNotNone(patent.date)
            self.assertIsNotNone(patent.abstract)
            self.assertIsNotNone(patent.description)
            self.assertIsNotNone(patent.claims)

    def test_exception_not_supported_xml_structure(self):
        """Parsing the ``noDTDFile`` fixture raises
        ``NotSupportedDTDConfiguration``."""
        inputfile = resource_filename("patent_parsing_tools.tests",
                                      "US08613112-noDTDFile.XML")
        self.assertRaises(NotSupportedDTDConfiguration,
                          self.extractor.parse, inputfile)

    def test_exception_not_implemented_dtd_structure(self):
        """Parsing the ``notSupportedDTD`` fixture raises
        ``NotSupportedDTDConfiguration``."""
        inputfile = resource_filename("patent_parsing_tools.tests",
                                      "US08613112-notSupportedDTD.XML")
        self.assertRaises(NotSupportedDTDConfiguration,
                          self.extractor.parse, inputfile)

    def test_no_exception_when_lack_of_node(self):
        """Parsing the ``lackofnode`` fixture completes without raising."""
        inputfile = resource_filename("patent_parsing_tools.tests",
                                      "US08613112-lackofnode.XML")
        self.extractor.parse(inputfile)

    def test_throw_exception_and_go_through(self):
        """The same extractor can be used again after a failed parse.

        Both parses are attempted on one ``Extractor`` instance; a
        ``NotSupportedDTDConfiguration`` from either is caught and its
        message printed. ``print`` is called with a single parenthesised
        argument so the statement behaves the same on Python 2 and 3, and
        the message falls back to ``str(exc)`` when the exception has no
        ``message`` attribute.
        """
        try:
            self.extractor.parse(
                resource_filename("patent_parsing_tools.tests",
                                  "US08613112-noDTDFile.XML"))
        except NotSupportedDTDConfiguration as r:
            print("Catched first Exception with message: \""
                  + getattr(r, "message", str(r)) + "\"")

        try:
            self.extractor.parse(
                resource_filename("patent_parsing_tools.tests",
                                  "US08613112-notSupportedDTD.XML"))
        except NotSupportedDTDConfiguration as r:
            print("Catched second Exception with message: \""
                  + getattr(r, "message", str(r)) + "\"")
def setUp(self):
    """Create the ``Extractor`` used by the tests, built with "test_data"."""
    extractor_argument = "test_data"
    self.extractor = Extractor(extractor_argument)