def extract_data(self):
    """Parse all patent files and split the valid ones into train/test lists.

    Files ending in ".XML" under ``<working_dir>/patents`` are parsed one by
    one with an ``Extractor`` built from ``self.train_destination``. Each
    parsed patent accepted by ``self.is_patent_valid`` is placed in the test
    list with probability 1/10, otherwise in the train list. Both lists are
    then written with ``self.save_list`` to ``self.test_destination`` and
    ``self.train_destination`` and the final counts are logged.

    A patent whose parsing or validation raises is logged and skipped; it is
    counted as neither valid nor unvalid.
    """
    self.logger.info("extracting data")
    extractor = Extractor(self.train_destination)
    patents = get_files(join(self.working_dir, "patents"), ".XML")

    train_patent_list = []
    test_patent_list = []
    num_of_valid_patents = 0
    num_of_unvalid_patents = 0

    for patent in patents:
        self.logger.info("extracting " + patent)
        try:
            parsed_patent = extractor.parse(patent)
            is_valid = self.is_patent_valid(parsed_patent)
        except Exception as e:
            # Best-effort: one broken file must not abort the whole run.
            # str(e) is used instead of e.message, which does not exist on
            # Python 3 exceptions and would make this handler itself raise.
            self.logger.error(str(e))
            self.logger.error(
                "Number of valid patents was %d, number of unvalid patents was %d"
                % (num_of_valid_patents, num_of_unvalid_patents))
            continue

        if not is_valid:
            num_of_unvalid_patents += 1
            continue

        num_of_valid_patents += 1
        if randint(1, 10) == 10:  # 10% chance of landing in the test set
            test_patent_list.append(parsed_patent)
        else:
            train_patent_list.append(parsed_patent)
            # Progress report once per 1000 train patents; keyed on the list
            # that is actually reported so the message is not repeated.
            if len(train_patent_list) % 1000 == 0:
                self.logger.info(
                    "train_patent_list has length %d" % (len(train_patent_list)))

    self.save_list(test_patent_list, self.test_destination)
    self.save_list(train_patent_list, self.train_destination)
    self.logger.info(
        "Final number of valid patents was %d, number of unvalid patents was %d"
        % (num_of_valid_patents, num_of_unvalid_patents))
    self.logger.info(
        "Total number of test examples is %d" % (len(test_patent_list)))
def setUp(self):
    """Create an ``Extractor`` for "test_data" and store it on ``self.extractor``.

    NOTE(review): presumably the unittest per-test setup hook; the enclosing
    class is not visible here, so confirm against the test case definition.
    """
    fixture = Extractor("test_data")
    self.extractor = fixture