# NOTE(review): this chunk starts in the middle of a method whose `def` line is
# outside the visible source. The loop below is that method's tail; nesting the
# task loop inside the subtype loop is inferred from its use of `subtype` --
# confirm against the original file.
# NOTE(review): indentation here is relative. The `self`-taking definitions
# below are presumably methods of the class instantiated as TaskGraphGenerator
# in the driver; re-indent them under that class when restoring the file.
for subtype in page.subtypes:
    # Tie each subtype to the page's query task, labelled as a subtype relation.
    self.graph.add_edge(subtype, page.query_task(), relation='subtype-of')
    for task in page.tasks:
        # The leftover pdb.set_trace() breakpoint was removed: it stopped the
        # run in the debugger once per task.
        self.graph.add_edge(task.task_name(), subtype)


def _add_non_subtype_page(self, page):
    """Add one edge per task of *page*, from the task name to the query task.

    Args:
        page: object exposing ``tasks`` (iterable of objects with a
            ``task_name()`` method) and ``query_task()``.
    """
    # Evaluated per edge, as in the original, in case query_task() is not pure.
    for task in page.tasks:
        self.graph.add_edge(task.task_name(), page.query_task())


def show_graph(self):
    """Draw ``self.graph``, write it to ``path.png`` and display it."""
    nx.draw(self.graph)
    # Save before showing: after a blocking plt.show() returns, the figure is
    # closed, so a later savefig would write an empty image.
    plt.savefig("path.png")
    plt.show()


if __name__ == '__main__':
    import constants
    from pickle_file_loader_for_original import PickleFileLoaderForOriginal

    # Pages with index 0..100 are added; anything beyond that is ignored.
    last_page_index = 100

    queries = constants.QUERIES_4
    for query in queries:
        generator = TaskGraphGenerator()
        pfl = PickleFileLoaderForOriginal()
        pages = pfl.load_fetched_pages_with_query(query)
        for i, page in enumerate(pages):
            if i > last_page_index:
                break
            generator.add_page(page)
        # NOTE(review): every query writes to the same "path.png", so only the
        # last query's image survives -- confirm whether that is intended.
        generator.show_graph()
# NOTE(review): the line below is an entire fetch script collapsed onto one
# line. Because it begins with the coding cookie's `#`, all of it is currently
# a comment and nothing in it executes.
# As written, the text describes a `__main__` script that, for each query in
# constants.QUERIES_4, loads the fetched pages for that query, calls
# pm.go_or_create_and_go_to() with constants.FETCHED_PAGES_O_DIR_NAME and then
# with the query, skips pages whose URL contains '.pdf', prints the index and
# skips pages for which di.has_body(page.query, page.url) is true, and
# otherwise calls page.fetch_html() and page.set_text_from_html_body() inside a
# `try:`.
# NOTE(review): the `try:` has no visible handler and the text stops at a
# commented-out filename assignment, so the rest of the script is not visible
# here. Restore the line breaks from the original file rather than guessing.
# NOTE(review): the inner `enumerate(pages)` reuses `i`, shadowing the index of
# the outer `enumerate(queries)` loop; `saver` is created but not used in the
# visible part.
# -*- coding: utf-8 -*- import constants from pickle_file_loader_for_original import PickleFileLoaderForOriginal from pickle_file_saver_for_original import PickleFileSaverForOriginal from path_mover import PathMover from page_data_inserter import PageDataInserter import pdb if __name__ == '__main__': queries = constants.QUERIES_4 pfl = PickleFileLoaderForOriginal() saver = PickleFileSaverForOriginal() pm = PathMover() di = PageDataInserter() for i, query in enumerate(queries): pages = pfl.load_fetched_pages_with_query(query) pm.go_or_create_and_go_to(constants.FETCHED_PAGES_O_DIR_NAME) pm.go_or_create_and_go_to(query) for i, page in enumerate(pages): if '.pdf' in page.url: continue if di.has_body(page.query, page.url): print(str(i)) continue try: print('%i番目の%sのページをフェッチします' % (i, query)) page.fetch_html() print('%sのフェッチ完了!' % page.title) page.set_text_from_html_body() #page.set_sentences_from_text() #filename = '%s_%i.pkl' % (query, i)
# -*- coding: utf-8 -*-
"""Print how many distinct tasks occur across one query's fetched pages."""
import pdb
from pickle_file_loader_for_original import PickleFileLoaderForOriginal

if __name__ == '__main__':
    page_loader = PickleFileLoaderForOriginal()
    fetched_pages = page_loader.load_fetched_pages_with_query('花粉症 を 対策する')
    # A set keeps one entry per distinct task, so the printed figure counts
    # unique tasks rather than every task occurrence on every page.
    unique_tasks = {task for page in fetched_pages for task in page.tasks}
    print(len(unique_tasks))