def run(self):
    """
    Process every XML file found in the document directory.

    Each directory entry whose extension is ``.xml`` (compared
    case-insensitively) is parsed with ``utils.xml_file_to_dict`` and handed
    to ``__process_patent`` together with its document id, i.e. the file
    name without its extension. Every other entry is reported on stdout and
    skipped. After all entries have been visited the indexer is serialized.
    """
    for filename in os.listdir(self.__doc_dir):
        doc_id, extension = os.path.splitext(filename)
        if extension.lower() != '.xml':
            # Single parenthesized argument: prints the same text whether
            # ``print`` is the statement or the function.
            print('Ignoring file: {} Reason: Not an XML document.'
                  .format(filename))
            continue

        # os.path.join supplies the separator when the configured directory
        # has no trailing one; plain concatenation would fuse the directory
        # name and the file name in that case.
        full_path = os.path.join(self.__doc_dir, filename)
        patent_info = utils.xml_file_to_dict(full_path)
        self.__process_patent(doc_id, patent_info)

    # Persist the index once, after every document has been processed.
    self.__indexer.serialize()
def run(self):
    """
    Build the thesaurus from the words of every XML document.

    Each directory entry whose extension is ``.xml`` (compared
    case-insensitively) is parsed with ``utils.xml_file_to_dict`` and passed
    to ``__process_patent`` together with its document id, i.e. the file
    name without its extension; the words it returns are collected. Every
    other entry is reported on stdout and skipped. The distinct words are
    then filtered through ``extract_nouns_and_adjectives`` and the result is
    given to ``AltervistaThesaurus.build_thesaurus`` along with the
    configured output file.
    """
    # Collect straight into a set: only the distinct words are needed, so
    # there is no reason to keep every duplicate from every document around.
    unique_words = set()
    for filename in os.listdir(self.__doc_dir):
        doc_id, extension = os.path.splitext(filename)
        if extension.lower() != '.xml':
            # Single parenthesized argument: prints the same text whether
            # ``print`` is the statement or the function.
            print('Ignoring file: {} Reason: Not an XML document.'
                  .format(filename))
            continue

        # os.path.join supplies the separator when the configured directory
        # has no trailing one; plain concatenation would fuse the directory
        # name and the file name in that case.
        full_path = os.path.join(self.__doc_dir, filename)
        patent_info = utils.xml_file_to_dict(full_path)
        unique_words.update(self.__process_patent(doc_id, patent_info))

    # The downstream helper was previously given a list, so keep passing one.
    terms = extract_nouns_and_adjectives(list(unique_words))
    AltervistaThesaurus.build_thesaurus(terms, self.__out_file)