# Imports these methods rely on (janome provides the morphological tokenizer).
import requests
from collections import defaultdict
from janome.tokenizer import Tokenizer


def search(self, site):
    # Search threads: fetch the subject list, then dispatch to the
    # site-specific handler method named after the site title.
    subjects = Subject.get_from_url(site)
    method = getattr(self, site.title)
    subjects_dict = method(subjects, site)
    for key in subjects_dict:
        print(subjects_dict[key])
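# NOTE: getattr(self, site.title) above resolves a per-site handler method
# whose name matches site.title. The method below is a hypothetical sketch
# of that contract, not part of the original code (its name, the keyword
# filter, and keying by title are all assumptions): it receives the parsed
# subjects plus the site and returns a dict of matching threads.
def example_site(self, subjects, site):
    # Keep subjects whose title contains the site keyword (assumed attribute).
    return {s.title: s for s in subjects if site.keyword in s.title}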
def run(self, force=None):
    print('start')
    # Fetch all sites and de-duplicate them by URL.
    sites = {}
    for site in Site.get_all():
        sites[site.url] = site
    # Build the thread list, excluding ignore and keyword matches.
    sure = []
    for key in sites:
        site = sites[key]
        response = requests.get(site.subjects_url)
        assert response.status_code == 200, response.text
        # Parse the subject list, one thread per line.
        data = response.text.split('\n')
        for line in data:
            try:
                sure.append(Subject(site, line))
            except Exception:
                # Skip lines that do not parse as a thread subject.
                pass
    print(sure)
    # Output the list: count how often each token surface appears
    # across the thread titles.
    t = Tokenizer()
    r = defaultdict(int)    # surface -> occurrence count
    r2 = defaultdict(list)  # surface -> threads containing it
    r3 = defaultdict(int)   # token -> placeholder (not used below)
    for _sure in sure:
        try:
            for token in t.tokenize(_sure.title):
                if not token_filter(token):
                    r[token.surface] += 1
                    r2[token.surface].append(_sure)
                    r3[token] += 0
        except Exception:
            pass
    # Sort by title, score each thread by the frequency of its tokens,
    # and print the ones that pass the title filter.
    sure = sorted(sure, key=lambda x: x.title)
    for _sure in sure:
        try:
            point = 0
            for token in t.tokenize(_sure.title):
                if not token_filter(token):
                    point += r[token.surface]
            if not filter_title(point, _sure):
                print(_sure.title, _sure.count_res)
        except Exception:
            pass
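# NOTE: token_filter and filter_title are called above but defined elsewhere.
# The sketches below are assumptions about their contracts, not the original
# implementations: token_filter returns True for tokens to skip, and
# filter_title returns True for threads to suppress. The POS check and the
# threshold of 10 are placeholders.
def token_filter(token):
    # janome exposes the part-of-speech as a comma-separated string,
    # e.g. '名詞,一般,*,*'; skip every token that is not a noun.
    return token.part_of_speech.split(',')[0] != '名詞'


def filter_title(point, subject):
    # Suppress threads whose aggregate token-frequency score is low.
    return point < 10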
def search_and_scraping(self, site, force=None):
    # Search threads: fetch the subject list, then dispatch to the
    # site-specific handler method named after the site title.
    subjects = Subject.get_from_url(site)
    method = getattr(self, site.title)
    subjects_dict = method(subjects, site)
    # Scrape each matched thread.
    for key in subjects_dict:
        sub = subjects_dict[key]
        sub.execute_matome(force=force)
    # Drop the bound-method reference.
    del method
    return subjects_dict
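# A minimal usage sketch, assuming these methods live on a class named
# Matome (the class name is an assumption; only the methods are shown
# in this section):
if __name__ == '__main__':
    matome = Matome()
    for site in Site.get_all():
        matome.search_and_scraping(site)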