def google_search(self, query, num):
    url = 'https://www.googleapis.com/customsearch/v1?'
    params = {
        'key': self.google_api_key,
        'q': query,
        'cx': '013036536707430787589:_pqjad5hr1a',
        'alt': 'json',
        'lr': 'lang_ja',
    }
    start = 1
    items = []
    for i in range(num):
        params['start'] = start
        request_url = url + urllib.parse.urlencode(params)
        try:
            response = urllib.request.urlopen(request_url)
            json_body = json.loads(response.read().decode('utf-8'))
            items.extend(json_body['items'])
            if 'nextPage' not in json_body['queries']:
                break
            start = json_body['queries']['nextPage'][0]['startIndex']
        except Exception:
            # append (not extend) so the fallback stays a single dict entry,
            # with an empty snippet so the loop below does not KeyError
            items.append({'link': '#', 'title': '検索できませんでした', 'snippet': ''})
    pages = []
    for item in items:
        page = WebPage(item['link'])
        page.title = item['title']
        page.snippet = item['snippet']
        pages.append(page)
    return pages  # => [{'link': 'http://...', 'title': 'ページは'}, {...}...]

def bing_search(self, query, num):
    key = self.microsoft_api_key
    url = 'https://api.datamarket.azure.com/Bing/Search/Web?'
    json_param = '&$format=json'
    param = {
        'Query': "'" + query + "'",
    }
    req_url = url + urllib.parse.urlencode(param)
    items = []
    for i in range(num):
        try:
            json_body = requests.get(req_url + json_param, auth=(key, key)).json()
            items.extend(json_body['d']['results'])
            req_url = json_body['d']['__next']
        except Exception:
            # append (not extend) so the fallback stays a single dict entry,
            # with an empty Description so the loop below does not KeyError
            items.append({'Url': '#', 'Title': '検索できませんでした', 'Description': ''})
    pages = []
    for item in items:
        page = WebPage(item['Url'])
        # unified with the google_search result format
        page.title = item['Title']
        page.snippet = item['Description']
        pages.append(page)
    return pages

def yahoo_key_phrase(self, text):
    url = 'http://jlp.yahooapis.jp/KeyphraseService/V1/extract?appid=%s&sentence=%s' % (
        self.yahoo_japan_app_id, text)
    result_page = WebPage(url)
    result_page.fetch_xml()
    key_phrases = result_page.pick_key_phrases()
    return key_phrases

def bing_search(self, query, num):
    key = self.microsoft_api_key
    url = 'https://api.datamarket.azure.com/Bing/Search/Web?'
    json_param = '&$format=json'
    param = {'Query': query}
    request_url = url + urllib.parse.urlencode(param) + json_param
    items = []
    for i in range(num):
        try:
            json_body = requests.get(request_url,
                                     auth=(key, key),
                                     headers={'User-Agent': 'My API Robot'}).json()
            items.extend(json_body['d']['results'])
            request_url = json_body['d']['__next']
        except Exception:
            # append (not extend) so the fallback stays a single dict entry
            items.append({'Url': '#', 'Title': '検索できませんでした', 'Description': ''})
    pages = []
    for item in items:
        if type(item) == str:
            continue
        page = WebPage(item['Url'])
        # unified with the google_search result format
        page.title = item['Title']
        page.snippet = item['Description']
        pages.append(page)
    return pages

def get_book_appointment_page_url():
    user_applications_page = WebPage(url="%s%s" % (UKVISA_BASE_URL, USER_APPLICATIONS_PATH),
                                     handler=s)
    d = pq(user_applications_page.content())
    return "%s%s" % (UKVISA_BASE_URL, d('a').attr('href').replace('..', ''))

def setUp(self):
    self.nanapi_article_page = WebPage()
    nanapi_file = open('test_support/nanapi.html', encoding='utf-8')
    nanapi_html = nanapi_file.read()
    nanapi_file.close()
    self.nanapi_article_page.html_body = nanapi_html
    self.nanapi_article_page.url = 'http://nanapi.jp'

    self.nanapi_hay_fever_page = WebPage('http://nanapi.jp')
    nanapi_hay_fever_file = open('test_support/nanapi_hay_fever.html', encoding='utf-8')
    nanapi_hay_fever_html = nanapi_hay_fever_file.read()
    nanapi_hay_fever_file.close()
    self.nanapi_hay_fever_page.html_body = nanapi_hay_fever_html

    self.gow_marriage_page = WebPage()
    gow_file = open('test_support/gow.html', encoding='utf-8')
    gow_html = gow_file.read()
    gow_file.close()
    self.gow_marriage_page.html_body = gow_html
    self.gow_marriage_page.url = 'http://magazine.gow.asia/love/column_details.php?column_uid=00000082'

    self.kanemoti_page = WebPage()
    kanemoti_file = open('test_support/kanemotilevel.html', encoding='utf-8')
    kanemoti_html = kanemoti_file.read()
    kanemoti_file.close()
    self.kanemoti_page.html_body = kanemoti_html

def load_html_files_with_query(query):
    pages = []
    for i in range(constants.NUM_OF_FETCHED_PAGES):
        with open('%s_%s.html' % (query, str(i)), 'r') as f:
            page = WebPage()
            page.html_body = f.read()
            pages.append(page)
    return pages

def load_html_files_with_query(query):
    pages = []
    for i in range(1):
        with open('%s.txt' % (query), 'r') as f:  # file format changed to .txt
            page = WebPage()
            page.html_body = f.read()
            page.remove_html_tags()
            pages.append(page)
    return pages

def test_combine_nouns(self):
    page = WebPage()
    m_words = page.to_m_words('親子決戦試合')
    results = page.combine_nouns(m_words)
    self.assertEqual(results[0].name, '親子決戦試合')

    m_words = page.to_m_words('そして勝敗決定戦に')
    results = page.combine_nouns(m_words)
    self.assertEqual(results[1].name, '勝敗決定戦')

def _get_html_content_by_requests(self, url):
    # the referer uses the base url
    w = WebPage(url, self.myconfig.url)
    try:
        w.fetch()
        c = w.getDatas()[1]
        return c
    except Exception:
        return ""

def yahoo_key_phrase(self, text):
    url = "http://jlp.yahooapis.jp/KeyphraseService/V1/extract?appid=%s&sentence=%s" % (
        self.yahoo_japan_app_id,
        text,
    )
    result_page = WebPage(url)
    result_page.fetch_xml()
    key_phrases = result_page.pick_key_phrases()
    return key_phrases

def load_html_files():
    """Load the fetched HTML files."""
    pages = []
    for i in range(constants.NUM_OF_FETCHED_PAGES):
        with open('%s_%s.html' % (constants.QUERY, str(i)), 'r') as f:
            page = WebPage()
            page.html_body = f.read()
            page.remove_html_tags()
            pages.append(page)
    return pages

def test_find_pages(self):
    page_1 = WebPage("http://tradein.nissan.co.jp/")
    page_1.title = "自動車の下取りと売却"
    page_1.snippet = "自動車には下取りをする方法がけっこうある。"
    page_2 = WebPage("http://www.link-nexus.com/")
    page_2.title = "自動車の下取りと販売"
    page_2.snippet = "あばばばばば"
    page_3 = WebPage("http://toyota.jp/service/tradein/dc/top")
    page_3.title = "下取り参考価格情報"
    page_3.snippet = "下取りと販売ですよプロデューサーさん"

    search_engine = SearchEngine()
    search_engine.material_pages = [page_1, page_2, page_3]
    search_engine.hint_word = "自動車"
    search_engine.action_word = "下取り"
    search_engine.find_pages_including_related_words()
    self.assertEqual(search_engine.result_pages[0], page_1)
    self.assertEqual(search_engine.result_pages[1], page_2)
    self.assertEqual(search_engine.result_pages[2], page_3)

    search_engine.count_action_words()
    self.assertEqual(search_engine.action_words_count, {"販売": 2, "売却": 1})

    search_engine.sort_action_words_count()
    self.assertEqual(search_engine.sorted_action_words,
                     [{"word": "販売", "count": 2}, {"word": "売却", "count": 1}])

def test_part_of_task_clusters(self):
    page = WebPage(url='somewhere', query='カメラ 買う')
    page.text = 'ヨドバシカメラに行く必要があります。お金を払ってください。' \
                'ヨドバシカメラに行く必要があります。お金を払ってください。'
    graph = self.build_graph([page])
    answerer = TaskGraphFirstAnswerer(graph=graph, query_task='カメラ_買う')
    answerer.set_result_tasks()

    p_clusters = answerer.part_of_task_clusters
    self.assertEqual(p_clusters, [{'お金_払う', 'ヨドバシカメラ_行く'}])

    i_clusters = answerer.instance_of_task_clusters
    self.assertEqual(i_clusters, [])

def load_html_files():
    """Assumes the current working directory contains the fetched files."""
    pages = []
    for i in range(constants.NUM_OF_FETCHED_PAGES):
        # '%s_%s.txt' so each iteration reads its own file; the original
        # '%s.txt' % (constants.QUERY, str(i)) passed one argument too many
        with open('%s_%s.txt' % (constants.QUERY, str(i)), 'r') as f:
            page = WebPage()
            page.html_body = f.read()
            page.remove_html_tags()
            pages.append(page)
    return pages

def clue_web_search(self, query):
    """
    Just runs the search.
    Change the rows=50 value to change the number of results returned.
    Returns the texts as a list.
    """
    options = "&rows=50"
    url = constants.CLUE_WEB_URL_HEAD + query + options + constants.CLUE_WEB_URL_TAIL
    clue_web_result_page = WebPage(url)
    clue_web_result_page.fetch_xml()
    clue_web_result_page.pick_texts()
    return clue_web_result_page.texts

def load_html_files():
    """Assumes the current working directory contains the fetched HTML files."""
    pages = []
    for i in range(constants.NUM_OF_FETCHED_PAGES):
        with open("%s_%s.html" % (constants.QUERY, str(i)), "r") as f:
            page = WebPage()
            page.html_body = f.read()
            page.remove_html_tags()
            pages.append(page)
    return pages

def setUp(self):
    # Build a few throwaway WebPages and set tasks on them.
    page_1 = WebPage(url='http://aaa.com', query='職業 質問する')
    page_1.text = '医師に質問してください。'
    page_2 = WebPage(url='http://bbb.com', query='職業 質問する')
    page_2.text = '看護師に質問してください。'
    page_3 = WebPage(url='http://ccc.com', query='職業 質問する')
    page_3.text = '理学療法士に質問してください。'
    self.graph = self.build_graph([page_1, page_2, page_3])

def result_pages(self, page_num=50):
    items = self._search(page_num)
    pages = []
    for item in items:
        if type(item) == str:
            continue
        page = WebPage(item['Url'])
        page.query = self.query
        # unified with the google_search result format
        page.title = item['Title']
        page.snippet = item['Description']
        pages.append(page)
    return pages

def bing_search(self):
    key = my_keys.MICROSOFT_API_KEY
    bing = Bing(key)
    items = bing.web_search(self.query, 50, ['Title', 'Url', 'Description'])
    pages = []
    for item in items:
        if type(item) == str:
            continue
        page = WebPage(item['Url'])
        page.query = self.query
        # unified with the google_search result format
        page.title = item['Title']
        page.snippet = item['Description']
        pages.append(page)
    return pages

def yahoo_sponsored_results():
    query = request.forms.decode().get('query')
    # Yahoo sponsored search returns ads more readily for a full sentence
    # than for a query split into separate words
    head = 'http://search.yahoo.co.jp/search/ss?p='
    tail = '&ei=UTF-8&fr=top_ga1_sa&type=websearch&x=drt'
    url = head + query + tail
    y_ad_page = WebPage(url)
    ads = y_ad_page.fetch_ads()
    v_and_s = []
    for ad in ads:
        v_and_s.extend(ad.pick_verbs(ad.title))
        v_and_s.extend(ad.pick_sahens(ad.title))
        v_and_s.extend(ad.pick_verbs(ad.snippet))
        v_and_s.extend(ad.pick_sahens(ad.snippet))
    results = to_ranked_items(v_and_s)
    return ad_template.render(items=results)

def find_related_action_words_from_clueweb(self):
    self.set_solr_query()
    texts = self.clue_web_search(self.solr_query)
    # texts => ['大学入学', 'aaaa', ...] 20
    for text in texts:
        page = WebPage('unknown')
        self.add_to_results_if_key_phrase_present(text, page)

def test_slice_after_dots(self):
    page = WebPage()
    sentence_with_dots = 'あいうえお、かきくけこさしすせそ'
    result = page.slice_after_dots(sentence_with_dots)
    self.assertEqual(result, 'かきくけこさしすせそ')

    sentence_with_dots_2 = 'あいうえお、かきくけこ、さしすせそ'
    result = page.slice_after_dots(sentence_with_dots_2)
    self.assertEqual(result, 'さしすせそ')

    sentence_with_dots_3 = 'あいうえお、かきくけこ。さしすせそ'
    result = page.slice_after_dots(sentence_with_dots_3)
    self.assertEqual(result, 'さしすせそ')

    sentence_with_dots_4 = 'あいうえお。かきくけこ、さしすせそ'
    result = page.slice_after_dots(sentence_with_dots_4)
    self.assertEqual(result, 'さしすせそ')

def test_set_line_nums_around_action_word(self):
    result_page = WebPage()
    result_page.lines = ['aa', 'bbbb', 'ccccc', 'ddddd', 'aaaaa', 'eeeee']
    result_page.set_line_nums_with_word('a')
    result_page.set_line_nums_around_action_word()
    self.assertEqual(result_page.line_nums_around_action_word,
                     set([0, 1, 3, 4, 5]))

def test_set_descendants_with_simple_html(self):
    h1_node = Node('bbbbb<h2>cccc</h2>dddd', 'h1')
    h1_node.set_descendants()
    self.assertEqual(h1_node.children[0].html_body, 'dddd')
    self.assertEqual(h1_node.children[0].heading_type, 'h2')
    self.assertEqual(h1_node.children[0].heading_title, 'cccc')
    self.assertEqual(len(h1_node.children), 1)
    self.naver_hay_fever_page = WebPage(
        'http://matome.naver.jp/topic/1LzuV')

def test_05_web(self):
    """Baidu.com"""
    switch_to_native(self.driver)
    web_po = WebPage(self.driver)
    web_po.baidu_button()
    sleep(3)
    insert_img(self.driver, 'baidu')
    switch_to_webview(self.driver)
    web_po.search_input('macaca')
    sleep(3)
    web_po.search_button()

def test_mashou_sentence(self):
    page = WebPage('http://home.e05.itscom.net/mizuki/masako/bedmake.htm')
    page.text = '1.トイレの便座も一度拭きましょう!'
    page.set_sentences_from_text()
    page.set_tasks_from_sentences()
    task = page.tasks[0]
    self.assertEqual(task.object_term.name, 'トイレの便座')
    self.assertEqual(task.predicate_term, '拭く')

def scrape_from_nanapi():
    query = request.form['query']
    head = 'http://nanapi.jp/search/q:'
    query_url = head + query
    nanapi_search_result_page = WebPage(query_url)
    nanapi_search_result_page.fetch_html()
    urls = nanapi_search_result_page.find_urls_from_nanapi_search_result()
    tasks = []
    for url in urls:
        # result_page is a single nanapi article
        result_page = WebPage(url)
        result_page.fetch_html()
        # task_steps => [task_step, task_step, ...]
        task = result_page.find_task_from_nanapi_with_headings()
        # task_steps[0].h2 => 'はじめに'
        # task_steps[0].h3s[0] => 'はじめに'
        tasks.append(task)
    # tasks => [task, task, ...]
    # tasks[0][0].h2 => 'はじめに'
    return render_template('nanapi_tasks.tmpl', tasks=tasks)

def search_in_clueweb_with_expanded_query():
    search_engine = SearchEngine()
    search_engine.action_word = request.form['action_word']
    search_engine.hint_word = request.form['hint_word']
    search_engine.find_related_action_words_with_google()
    search_engine.count_action_words()
    search_engine.sort_action_words_count()
    search_engine.pick_sorted_action_words_more_than_1_count()

    results = []
    for elem in search_engine.sorted_action_words_more_than_1_count:
        elem['expanded_query'] = search_engine.action_word + ' ' + search_engine.hint_word + ' ' + elem['word']
        url = 'http://karen.dl.local:8983/solr/ClueWeb09ja/select?q=' + elem['expanded_query'] + '&wt=xml'
        web_page = WebPage(url)
        web_page.fetch_xml()
        web_page.pick_texts_to_result_pages()
        # There is one result XML page per query; each entry in it is wrapped
        # in a WebPage object and kept on that query's page as result_pages.
        for result_page in web_page.result_pages:
            # result_page.text_body
            result_page.set_lines_from_texts()
            result_page.set_line_nums_with_word(search_engine.action_word)
            result_page.set_line_nums_around_action_word()
            result_page.set_line_clusters_around_action_word()
        # web_page.result_pages[0].line_clusters_around_action_word
        results.append({'pages': web_page.result_pages, 'expanded_query': elem['expanded_query']})
    return render_template('search_in_clueweb_with_expanded_query.tmpl', results=results)

def find_matched_words_from_yahoo_ads():
    query = request.form['query']
    # Yahoo sponsored search returns ads more readily for a full sentence
    # than for a query split into separate words
    head = 'http://search.yahoo.co.jp/search/ss?p='
    tail = '&ei=UTF-8&fr=top_ga1_sa&type=websearch&x=drt'
    url = head + query + tail
    y_ad_page = WebPage(url)
    y_ad_page.fetch_html()
    y_ad_page.fetch_ads()

    naradeha_results = []
    bracket_words = []
    for ad in y_ad_page.ads:
        ad.fetch_link_title()
        naradeha_results.extend(ad.pick_characteristic_words())
        bracket_words.extend(ad.pick_bracket_words())
    # naradeha_results => [{'なら': {'before': ['。', 'あの', '今石洋之']}}]
    # bracket_words => ['アスコルビン酸', 'メルトダウン']

    stop_words = ['公式', '楽天', '当日', 'お急ぎ便', 'ココ', 'ここ', 'これ', 'コレ',
                  'こちら', '公式', '購入', '人気', '詳細', '送料無料', '配送無料',
                  '価格', '激安', '無料', 'アマゾン', 'ヤフオク', '0', '1', '2', '3']
    for num in range(0, 10):
        stop_words.append(str(num))

    results = naradeha_words_to_results(naradeha_results, stop_words)
    for bracket_word in bracket_words:
        is_including_stop_word = False
        for stop_word in stop_words:
            if stop_word in bracket_word:
                is_including_stop_word = True
                break
        if is_including_stop_word:
            continue
        results.append(bracket_word)
    return render_template('words.tmpl', words=results)

def google_search(self, query, num):
    url = "https://www.googleapis.com/customsearch/v1?"
    params = {
        "key": self.google_api_key,
        "q": query,
        "cx": "013036536707430787589:_pqjad5hr1a",
        "alt": "json",
        "lr": "lang_ja",
    }
    start = 1
    items = []
    for i in range(num):
        params["start"] = start
        request_url = url + urllib.parse.urlencode(params)
        try:
            response = urllib.request.urlopen(request_url)
            json_body = json.loads(response.read().decode("utf-8"))
            items.extend(json_body["items"])
            if "nextPage" not in json_body["queries"]:
                break
            start = json_body["queries"]["nextPage"][0]["startIndex"]
        except Exception:
            # append (not extend) so the fallback stays a single dict entry,
            # with an empty snippet so the loop below does not KeyError
            items.append({"link": "#", "title": "検索できませんでした", "snippet": ""})
    # items => [{'link': 'http://...', 'title': 'ページは'}, {...}...]

    pages = []
    for item in items:
        # if type(item) == str: continue
        page = WebPage(item["link"])
        page.title = item["title"]
        page.snippet = item["snippet"]
        pages.append(page)
    # pages[0].link => 'http://...'
    # pages[0].title => 'ブログです'
    # pages[0].snippet => 'あたしは...'
    return pages

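For orientation, here is a minimal usage sketch of these search wrappers. It assumes, without confirmation from the snippets themselves, that they live on the SearchEngine class seen elsewhere in this code, that the API key is set as a plain attribute, and that the module is importable as search_engine; all of those names are hypothetical.

# Hypothetical usage sketch: SearchEngine owning google_search, the
# google_api_key attribute, and the search_engine module name are assumptions.
from search_engine import SearchEngine

engine = SearchEngine()
engine.google_api_key = 'YOUR_GOOGLE_API_KEY'  # placeholder value
pages = engine.google_search('花粉症 対策', num=1)  # one page of results
for page in pages:
    # title/snippet attributes come straight from the loops above
    print(page.title)
    print(page.snippet)
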
def query_expansion():
    query = request.forms.decode().get('query')
    # Yahoo sponsored search returns ads more readily for a full sentence
    # than for a query split into separate words
    head = 'http://search.yahoo.co.jp/search/ss?p='
    tail = '&ei=UTF-8&fr=top_ga1_sa&type=websearch&x=drt'
    url = head + query + tail
    y_ad_page = WebPage(url)
    ads = y_ad_page.fetch_ads()

    v_and_s = []
    for ad in ads:
        v_and_s.extend(ad.pick_verbs(ad.title))
        v_and_s.extend(ad.pick_sahens(ad.title))
        v_and_s.extend(ad.pick_verbs(ad.snippet))
        v_and_s.extend(ad.pick_sahens(ad.snippet))
    ranked_items = to_ranked_items(v_and_s)
    ranked_items.insert(0, {'name': 'まとめ', 'count': 100})

    normalized_query = normalize_query(query)
    query_words = normalized_query.split(' ')
    page_set = set()  # a set, to drop duplicate pages
    expanded_queries = []
    for item in ranked_items:
        top_5 = []
        if item['name'] in query_words:
            # e.g. searching for '花粉症 対策' can put '対策' into item['name']
            continue
        else:
            expanded_query = normalized_query + ' ' + item['name']
            expanded_queries.append(expanded_query)
            new_pages = search(expanded_query, 1)
            top_5 = new_pages[0:5]  # was [0:4], which only took four pages
        over_19 = False
        for one in top_5:
            page_set.add(one)
            if len(page_set) > 19:
                over_19 = True
                break
        if over_19 is True:
            break  # escape the loop over ranked_items
    return expand_template.render(pages=page_set, queries=expanded_queries)

def test_instance_of_task_clusters_exclude_part_of(self):
    page_1 = WebPage(url='somewhere', query='チョコレート 食べる')
    page_1.text = 'ヨドバシカメラに行く必要があります。お金を払ってください。' \
                  'ヨドバシカメラに行く必要があります。お金を払ってください。'
    page_2 = WebPage(url='elsewhere', query='チョコレート 食べる')
    page_2.text = '神社にお参りしてください。'
    page_3 = WebPage(url='anywhere', query='チョコレート 食べる')
    page_3.text = '神社にお参りしてください。'
    page_4 = WebPage(url='where', query='チョコレート 食べる')
    page_4.text = 'お金を払いましょう'
    graph = self.build_graph([page_1, page_2, page_3, page_4])
    answerer = TaskGraphFirstAnswerer(graph=graph, query_task='チョコレート_食べる')
    answerer.set_result_tasks()
    i_clusters = answerer.instance_of_task_clusters
    self.assertEqual(i_clusters, [{'神社_お参りする'}])

class Ganji(object):
    def __init__(self):
        self.web_page = WebPage()

    def view_person_all(self, page=30):
        for i in range(page):
            page_url = r'http://bj.ganji.com/fang1/haidian/a1o{0}m1/'.format(i + 1)
            r = self.web_page.get(page_url)
            urls = module.get_post_urls(r.text)
            print urls
            for url in urls:
                r = self.web_page.get(url)
                if module.check_useful(r.text):
                    module.show_link(url)

    def _get_page(self, url):
        r = self.web_page.get(url)
        if 'confirm' in r.url:
            pass
        return r

def test_set_clusters_around_action_word(self):
    result_page = WebPage()
    result_page.lines = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l']
    result_page.line_nums_around_action_word = set([0, 1, 3, 4, 5, 9, 10])
    result_page.set_line_clusters_around_action_word()
    self.assertEqual(result_page.line_clusters_around_action_word,
                     [['a', 'b'], ['d', 'e', 'f'], ['j', 'k']])

def bing_search(self, query, num):
    key = self.microsoft_api_key
    url = "https://api.datamarket.azure.com/Bing/Search/Web?"
    json_param = "&$format=json"
    param = {"Query": query}
    request_url = url + urllib.parse.urlencode(param) + json_param
    items = []
    for i in range(num):
        try:
            json_body = requests.get(request_url,
                                     auth=(key, key),
                                     headers={"User-Agent": "My API Robot"}).json()
            items.extend(json_body["d"]["results"])
            request_url = json_body["d"]["__next"]
        except Exception:
            # append (not extend) so the fallback stays a single dict entry
            items.append({"Url": "#", "Title": "検索できませんでした", "Description": ""})
    pages = []
    for item in items:
        if type(item) == str:
            continue
        page = WebPage(item["Url"])
        # unified with the google_search result format
        page.title = item["Title"]
        page.snippet = item["Description"]
        pages.append(page)
    return pages

def clue_web_search(self, query):
    """
    Just runs the search.
    Change the rows=50 value to change the number of results returned.
    Returns the texts as a list.
    """
    options = '&rows=50'
    url = constants.CLUE_WEB_URL_HEAD + query + options + constants.CLUE_WEB_URL_TAIL
    clue_web_result_page = WebPage(url)
    clue_web_result_page.fetch_xml()
    clue_web_result_page.pick_texts()
    return clue_web_result_page.texts

def start_check():
    try:
        # 1. Open login page
        login_page = WebPage(url="%s%s" % (UKVISA_BASE_URL, LOGIN_PATH), handler=s)
        login_form_data = get_login_form_data(login_page.content())

        # 2. Post login form and login :)
        login_form = create_login_form(login_form_data)
        res = login_form.submit().response()

        # 3. Open book appointment page
        book_appointment_page = WebPage(url=get_book_appointment_page_url(), handler=s)
        get_appointment_location_form = WebForm(
            action_url="%s%s" % (UKVISA_BASE_URL, APPOINTMENT_LOCATION_PATH),
            form_data=get_appointment_loc_form_data(book_appointment_page.content()),
            handler=s)
        res = get_appointment_location_form.submit().response()
        schedule_appointment_form_data = get_appointment_schedule_form_data(res.text)
        post_id = schedule_appointment_form_data['EnrolmentStationId']

        available_dates = get_available_dates()
        print("\n" + datetime.datetime.now().strftime("%I:%M%p on %B %d, %Y"))
        if len(available_dates) > 0:
            appointment_data = {
                date: get_available_time_slots(post_id=post_id, appointment_date=date)
                for date in available_dates[:7]
            }
            print('Available dates found, generating message to be emailed')
            send_email(create_message(appointment_data))
        else:
            print('No available dates for %s...' % (VISA_CENTRE))
        # TODO: Need to figure out the captcha after submitting the form
        # submit_schedule_appointment_form(schedule_appointment_form_data);
    except Exception as e:
        print('Error checking visa appointment', e)

def yahoo_sponsored_results():
    query = request.form['query']
    # Yahoo sponsored search returns ads more readily for a full sentence
    # than for a query split into separate words
    head = 'http://search.yahoo.co.jp/search/ss?p='
    tail = '&ei=UTF-8&fr=top_ga1_sa&type=websearch&x=drt'
    url = head + query + tail
    y_ad_page = WebPage(url)
    y_ad_page.fetch_html()
    y_ad_page.fetch_ads()

    result_words = []
    key_phrases_of_ads = []
    Engine = SearchEngine()
    for ad in y_ad_page.ads:
        result_words.extend(ad.pick_nouns_and_verbs(ad.title))
        result_words.extend(ad.pick_nouns_and_verbs(ad.snippet))
        # key_phrases_of_ads.append(Engine.yahoo_key_phrase(ad.title))
        # key_phrases_of_ads.append(Engine.yahoo_key_phrase(ad.snippet))
    results = to_ranked_items(result_words)
    # return ad_template.render(items=results)
    return render_template('find_words_with_yahoo_ads.tmpl', items=results)

def free_scraping_results():
    url = request.forms.decode().get('url')
    page = WebPage(url)
    items = page.pick_something()
    return free_scraping_template.render(items=items)

from bing_api import Bing
import os
import constants
from web_page import WebPage

if __name__ == '__main__':
    bing = Bing()
    if not os.path.exists(constants.FETCHED_PAGES_DIR_NAME):
        os.mkdir(constants.FETCHED_PAGES_DIR_NAME)
    os.chdir(constants.FETCHED_PAGES_DIR_NAME)
    results = bing.web_search(query=constants.QUERY,
                              num_of_results=constants.NUM_OF_FETCHED_PAGES,
                              keys=['Url'])
    for i, result in enumerate(results):
        page = WebPage(result['Url'])
        page.fetch_html()
        f = open('%s_%s.html' % (constants.QUERY, str(i)), 'w')
        f.write(page.html_body)
        f.close()

def test_set_line_nums_with_word(self):
    result_page = WebPage()
    result_page.lines = ['abc', 'bcd', 'cde']
    result_page.set_line_nums_with_word('b')
    self.assertEqual(result_page.line_nums_with_action_word, set([0, 1]))

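Most of these snippets (the search, scraping, and test ones) lean on a shared WebPage class, while a few clearly come from other projects with their own WebPage (the UKVISA checker's WebPage(url=..., handler=s), the Macaca test's WebPage(self.driver), and the Python 2 Ganji crawler). Below is a minimal sketch of the data surface the common variant appears to expose, inferred from usage only; the constructor defaults are assumptions, and the fetching/parsing methods are deliberately omitted.

# Hypothetical sketch of the WebPage data attributes the snippets above rely on.
# Real implementations also define fetch_html(), fetch_xml(), remove_html_tags(),
# pick_texts(), set_lines_from_texts(), set_line_nums_with_word(), etc.
class WebPage(object):
    def __init__(self, url=None, query=None):
        self.url = url        # e.g. 'http://nanapi.jp' or a search-result link
        self.query = query    # the search query that produced the page
        self.title = ''       # filled from a search-result item
        self.snippet = ''     # filled from a search-result item
        self.html_body = ''   # raw HTML, assigned directly in the tests
        self.text = ''        # plain text, assigned directly in the tests
        self.lines = []       # text split into lines for the line_nums helpers
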