# -*- coding: utf-8 -*- import constants from pickle_file_loader_for_original import PickleFileLoaderForOriginal from pickle_file_saver_for_original import PickleFileSaverForOriginal from path_mover import PathMover from page_data_inserter import PageDataInserter import pdb if __name__ == '__main__': queries = constants.QUERIES_4 pfl = PickleFileLoaderForOriginal() saver = PickleFileSaverForOriginal() pm = PathMover() di = PageDataInserter() for i, query in enumerate(queries): pages = pfl.load_fetched_pages_with_query(query) pm.go_or_create_and_go_to(constants.FETCHED_PAGES_O_DIR_NAME) pm.go_or_create_and_go_to(query) for i, page in enumerate(pages): if '.pdf' in page.url: continue if di.has_body(page.query, page.url): print(str(i)) continue try: print('%i番目の%sのページをフェッチします' % (i, query)) page.fetch_html() print('%sのフェッチ完了!' % page.title) page.set_text_from_html_body() #page.set_sentences_from_text() #filename = '%s_%i.pkl' % (query, i)
        # NOTE(review): this chunk starts mid-method — the enclosing `def`
        # header (likely something like `_add_subtype_page(self, page)`) is not
        # visible in this view. Nesting of the `for task` loop inside the
        # `for subtype` loop is inferred from its use of `subtype` — confirm.
        for subtype in page.subtypes:
            # Link each subtype under the page's query task.
            self.graph.add_edge(subtype, page.query_task(), relation='subtype-of')
            for task in page.tasks:
                # NOTE(review): pdb.set_trace() is a debugging leftover —
                # it will halt every run; remove before production use.
                pdb.set_trace()
                self.graph.add_edge(task.task_name(), subtype)

    def _add_non_subtype_page(self, page):
        # Attach each of the page's tasks directly to the query task
        # (no subtype layer in between).
        for task in page.tasks:
            # NOTE(review): debugging leftover, same as above.
            pdb.set_trace()
            self.graph.add_edge(task.task_name(), page.query_task())

    def show_graph(self):
        # Render the graph with networkx/matplotlib.
        nx.draw(self.graph)
        plt.show()
        # NOTE(review): savefig after show() typically writes a blank image —
        # show() blocks and the figure is torn down when the window closes;
        # call savefig before show.
        plt.savefig("path.png")

if __name__ == '__main__':
    # Build and display a task graph for each query, capped at ~100 pages.
    import constants
    from pickle_file_loader_for_original import PickleFileLoaderForOriginal
    queries = constants.QUERIES_4
    for query in queries:
        generator = TaskGraphGenerator()
        pfl = PickleFileLoaderForOriginal()
        pages = pfl.load_fetched_pages_with_query(query)
        for i, page in enumerate(pages):
            if i > 100:
                # Stop after 101 pages (indices 0..100).
                break
            generator.add_page(page)
        generator.show_graph()
#coding: utf-8
# Script: scan sentence rows by id, detect "good task" sentences, and insert
# their (noun, cmp, verb) triples into the task table.
import constants
from sentence import Sentence
from task_data_inserter import TaskDataInserter
from pickle_file_loader_for_original import PickleFileLoaderForOriginal
from sentence_data_loader import SentenceDataLoader
import sqlite3
import pdb

if __name__ == '__main__':
    pfl = PickleFileLoaderForOriginal()
    inserter = TaskDataInserter()
    sentence_loader = SentenceDataLoader()
    # Sentence ids are 1-based; scan the first 450000 rows.
    for offset in range(450000):
        sentence_id = offset + 1
        try:
            body = sentence_loader.body_with_id(sentence_id)
        except EOFError:
            # Unreadable row — skip it and keep scanning.
            continue
        if not body:
            continue
        candidate = Sentence(body, 'a')
        if not candidate.set_noun_verb_if_good_task():
            continue
        # The sentence parsed as a task: persist its components.
        inserter.insert(noun=candidate.noun,
                        cmp=candidate.cmp,
                        verb=candidate.verb,
                        sentence_id=sentence_id)
# -*- coding: utf-8 -*- from pickle_file_loader_for_ex import PickleFileLoaderForExpandedQuery from pickle_file_loader_for_original import PickleFileLoaderForOriginal from task_graph_zero_answerer import TaskGraphZeroAnswerer from answer_printer import AnswererPrinter from path_mover import PathMover import constants import pdb if __name__ == '__main__': queries = constants.QUERIES_4 for query in queries: pfl = PickleFileLoaderForOriginal() g = pfl.load_graph_with_query(query) noun, cmp, verb = query.split(' ') query_task = '_'.join([noun, cmp, verb]) pm = PathMover() print('zeroの結果です') answerer = TaskGraphZeroAnswerer(graph=g, query_task=query_task) print('zero_answererをinstance化しました') answerer.set_result_tasks() print('set_result_tasks') answerer.set_task_scores() answerer.remove_generalized_tasks() print('set_task_scores') answerer.set_united_results() simple_results = [] for united_result in answerer.united_results:
# -*- coding: utf-8 -*- from pickle_file_loader_for_ex import PickleFileLoaderForExpandedQuery from pickle_file_loader_for_original import PickleFileLoaderForOriginal from task_graph_first_answerer import TaskGraphFirstAnswerer from answer_printer import AnswererPrinter from path_mover import PathMover import constants import pdb if __name__ == '__main__': queries = constants.QUERIES_4 for query in queries: #pfl = PickleFileLoaderForExpandedQuery() pfl = PickleFileLoaderForOriginal() g = pfl.load_graph_with_query(query) print('ロードしました') noun, cmp, verb = query.split(' ') query_task = '_'.join([noun, cmp, verb]) if not g: print('%sのグラフが存在しません!' % query) pdb.set_trace() continue if not g.nodes(): print('%sのグラフに異常があります' % query) continue query_task = '_'.join(query.split(' ')) # answererがいらないノードをremoveしてくれてるはず
# -*- coding: utf-8 -*-
# Script: load the pickled pages for one fixed query and print how many
# distinct task objects they contain.
import pdb
from pickle_file_loader_for_original import PickleFileLoaderForOriginal

if __name__ == '__main__':
    loader = PickleFileLoaderForOriginal()
    pages = loader.load_fetched_pages_with_query('花粉症 を 対策する')
    # Flatten every page's task list into one deduplicated set.
    unique_tasks = {task for page in pages for task in page.tasks}
    print(len(unique_tasks))