예제 #1
0
    def extract_data(self):
        """Parse all patent files and split the valid ones into train/test lists.

        Every path returned by ``get_files`` for ``<working_dir>/patents`` is
        parsed with an ``Extractor``.  Patents accepted by
        ``self.is_patent_valid`` are put into the test list when
        ``randint(1, 10)`` returns 10 and into the train list otherwise.
        Both lists are then written with ``self.save_list`` to
        ``self.test_destination`` and ``self.train_destination``.

        A failure while handling a single patent is logged and that patent
        is skipped; it is counted neither as valid nor as unvalid.
        """
        self.logger.info("extracting data")
        extractor = Extractor(self.train_destination)
        # presumably lists the ".XML" files under <working_dir>/patents --
        # TODO confirm against get_files
        patents = get_files(join(self.working_dir, "patents"), ".XML")
        train_patent_list = []
        test_patent_list = []
        num_of_valid_patents = 0
        num_of_unvalid_patents = 0

        for patent in patents:
            self.logger.info("extracting " + patent)
            try:
                parsed_patent = extractor.parse(patent)
                if not self.is_patent_valid(parsed_patent):
                    num_of_unvalid_patents += 1
                    continue

                num_of_valid_patents += 1
                if randint(1, 10) == 10:  # 10% chance
                    test_patent_list.append(parsed_patent)
                else:
                    train_patent_list.append(parsed_patent)
                    # Report progress once per 1000 training patents.  The
                    # check is tied to the list that is reported and runs
                    # only right after it grows, so the message is not
                    # repeated while the length stays unchanged.
                    if len(train_patent_list) % 1000 == 0:
                        self.logger.info("train_patent_list has length %d" %
                                         (len(train_patent_list)))
            except Exception as e:
                # Exceptions have no ``.message`` attribute on Python 3;
                # reading it here would raise AttributeError and abort the
                # whole run.  Fall back to str(e) when it is absent or empty.
                self.logger.error(getattr(e, "message", None) or str(e))
                self.logger.error(
                    "Number of valid patents was %d, number of unvalid patents was %d"
                    % (num_of_valid_patents, num_of_unvalid_patents))

        self.save_list(test_patent_list, self.test_destination)
        self.save_list(train_patent_list, self.train_destination)
        self.logger.info(
            "Final number of valid patents was %d, number of unvalid patents was %d"
            % (num_of_valid_patents, num_of_unvalid_patents))
        # Every test patent is appended to test_patent_list, so its length
        # is the total number of test examples.
        self.logger.info("Total number of test examples is %d" %
                         (len(test_patent_list)))
예제 #2
0
 def setUp(self):
     """Create an ``Extractor`` for ``"test_data"`` and store it as ``self.extractor``."""
     data_location = "test_data"
     self.extractor = Extractor(data_location)