def setUp(self):
    """Load the fixture HTML files into the WebPage objects used by the tests."""

    def read_fixture(path):
        # `with` guarantees the handle is closed even if read() raises
        # (the original open/close pairs leaked on error).
        with open(path, encoding='utf-8') as f:
            return f.read()

    self.nanapi_article_page = WebPage()
    self.nanapi_article_page.html_body = read_fixture('test_support/nanapi.html')
    self.nanapi_article_page.url = 'http://nanapi.jp'

    self.nanapi_hay_fever_page = WebPage('http://nanapi.jp')
    self.nanapi_hay_fever_page.html_body = read_fixture('test_support/nanapi_hay_fever.html')

    self.gow_marriage_page = WebPage()
    self.gow_marriage_page.html_body = read_fixture('test_support/gow.html')
    self.gow_marriage_page.url = 'http://magazine.gow.asia/love/column_details.php?column_uid=00000082'

    self.kanemoti_page = WebPage()
    self.kanemoti_page.html_body = read_fixture('test_support/kanemotilevel.html')
def test_find_pages(self):
    """All material pages survive filtering; action words are counted and sorted."""
    specs = [
        ('http://tradein.nissan.co.jp/', '自動車の下取りと売却',
         '自動車には下取りをする方法がけっこうある。'),
        ('http://www.link-nexus.com/', '自動車の下取りと販売',
         'あばばばばば'),
        ('http://toyota.jp/service/tradein/dc/top', '下取り参考価格情報',
         '下取りと販売ですよプロデューサーさん'),
    ]
    pages = []
    for url, title, snippet in specs:
        page = WebPage(url)
        page.title = title
        page.snippet = snippet
        pages.append(page)

    search_engine = SearchEngine()
    search_engine.material_pages = pages
    search_engine.hint_word = '自動車'
    search_engine.action_word = '下取り'

    search_engine.find_pages_including_related_words()
    self.assertEqual(search_engine.result_pages[0], pages[0])
    self.assertEqual(search_engine.result_pages[1], pages[1])
    self.assertEqual(search_engine.result_pages[2], pages[2])

    search_engine.count_action_words()
    self.assertEqual(search_engine.action_words_count, {'販売': 2, '売却': 1})

    search_engine.sort_action_words_count()
    self.assertEqual(search_engine.sorted_action_words,
                     [{'word': '販売', 'count': 2},
                      {'word': '売却', 'count': 1}])
def setUp(self):
    """Build the task graph from three throwaway WebPages sharing one query."""
    fixtures = {
        'http://aaa.com': '医師に質問してください。',
        'http://bbb.com': '看護師に質問してください。',
        'http://ccc.com': '理学療法士に質問してください。',
    }
    pages = []
    for url, text in fixtures.items():
        page = WebPage(url=url, query='職業 質問する')
        page.text = text
        pages.append(page)
    self.graph = self.build_graph(pages)
def test_instance_of_task_clusters_exclude_part_of(self):
    """Tasks repeated inside one page (part-of) must not appear in the
    instance-of clusters; only the task shared across pages does."""
    query = 'チョコレート 食べる'
    page_1 = WebPage(url='somewhere', query=query)
    page_1.text = ('ヨドバシカメラに行く必要があります。お金を払ってください。'
                   'ヨドバシカメラに行く必要があります。お金を払ってください。')
    page_2 = WebPage(url='elsewhere', query=query)
    page_2.text = '神社にお参りしてください。'
    page_3 = WebPage(url='anywhere', query=query)
    page_3.text = '神社にお参りしてください。'
    page_4 = WebPage(url='where', query=query)
    page_4.text = 'お金を払いましょう'

    graph = self.build_graph([page_1, page_2, page_3, page_4])
    answerer = TaskGraphFirstAnswerer(graph=graph, query_task='チョコレート_食べる')
    answerer.set_result_tasks()

    self.assertEqual(answerer.instance_of_task_clusters, [{'神社_お参りする'}])
def start_check():
    """One polling pass against the UK visa site: log in, fetch available
    appointment dates, and email them if any were found.

    All failures are caught at this top-level boundary and reported to stdout.
    Relies on module-level session `s` and the UKVISA_* / VISA_CENTRE constants.
    """
    try:
        # 1. Open login page
        login_page = WebPage(url="%s%s" % (UKVISA_BASE_URL, LOGIN_PATH),
                             handler=s)
        login_form_data = get_login_form_data(login_page.content())
        # 2. Post login form and login :)
        login_form = create_login_form(login_form_data)
        res = login_form.submit().response()
        # 3. Open book appointment page
        book_appointment_page = WebPage(url=get_book_appointment_page_url(),
                                        handler=s)
        get_appointment_location_form = WebForm(
            action_url="%s%s" % (UKVISA_BASE_URL, APPOINTMENT_LOCATION_PATH),
            form_data=get_appointment_loc_form_data(
                book_appointment_page.content()),
            handler=s)
        res = get_appointment_location_form.submit().response()
        schedule_appointment_form_data = get_appointment_schedule_form_data(
            res.text)
        # The station id is needed to query per-date time slots below.
        post_id = schedule_appointment_form_data['EnrolmentStationId']
        available_dates = get_available_dates()
        print("\n" + datetime.datetime.now().strftime("%I:%M%p on %B %d, %Y"))
        if len(available_dates) > 0:
            # Only the first 7 dates are queried for time slots.
            appointment_data = {
                date: get_available_time_slots(post_id=post_id,
                                               appointment_date=date)
                for date in available_dates[:7]
            }
            print('Available dates found, generating message to be emailed')
            send_email(create_message(appointment_data))
        else:
            print('No available dates for %s...' % (VISA_CENTRE))
        #TODO: Need to figure out the captcha after submitting the form
        # submit_schedule_appointment_form(schedule_appointment_form_data);
    except Exception as e:
        print('Error checking visa appointment', e)
def google_search(self, query, num):
    """Query the Google Custom Search API and wrap results in WebPage objects.

    Args:
        query: search query string.
        num:   number of paged API calls to make.

    Returns:
        list of WebPage objects with url/title/snippet attributes set.
    """
    url = 'https://www.googleapis.com/customsearch/v1?'
    params = {
        'key': self.google_api_key,
        'q': query,
        'cx': '013036536707430787589:_pqjad5hr1a',
        'alt': 'json',
        'lr': 'lang_ja',
    }
    start = 1
    items = []
    for _ in range(num):
        params['start'] = start
        request_url = url + urllib.parse.urlencode(params)
        try:
            response = urllib.request.urlopen(request_url)
            json_body = json.loads(response.read().decode('utf-8'))
            items.extend(json_body['items'])
            if 'nextPage' not in json_body['queries']:
                break
            start = json_body['queries']['nextPage'][0]['startIndex']
        except Exception:
            # BUG FIX: the original called items.extend(dict), which extends
            # with the dict's *keys* ('link', 'title') as bare strings instead
            # of adding the placeholder entry itself.
            items.append({'link': '#', 'title': '検索できませんでした'})

    pages = []
    for item in items:
        page = WebPage(item['link'])
        page.title = item['title']
        # The error placeholder has no 'snippet' key; default to '' instead
        # of raising KeyError as the original did.
        page.snippet = item.get('snippet', '')
        pages.append(page)
    return pages  # => [WebPage, WebPage, ...]
def bing_search(self, query, num):
    """Query the Bing Search API (Azure DataMarket), following '__next' links.

    Returns a list of WebPage objects shaped like google_search results
    (url/title/snippet).
    """
    key = self.microsoft_api_key
    url = 'https://api.datamarket.azure.com/Bing/Search/Web?'
    json_param = '&$format=json'
    request_url = url + urllib.parse.urlencode({'Query': query}) + json_param
    items = []
    for _ in range(num):
        try:
            json_body = requests.get(request_url,
                                     auth=(key, key),
                                     headers={'User-Agent': 'My API Robot'}).json()
            items.extend(json_body['d']['results'])
            # KeyError here (no '__next') means the last page was reached.
            request_url = json_body['d']['__next']
        except Exception:
            # BUG FIX: extend(dict) added the dict's keys ('Url', 'Title') as
            # bare strings; also break instead of re-requesting the same URL
            # and piling up placeholder entries on every remaining iteration.
            items.append({'Url': '#', 'Title': '検索できませんでした',
                          'Description': ''})
            break

    pages = []
    for item in items:
        page = WebPage(item['Url'])  # attribute names unified with google_search
        page.title = item['Title']
        page.snippet = item.get('Description', '')
        pages.append(page)
    return pages
def yahoo_key_phrase(self, text):
    """Extract key phrases from *text* via the Yahoo! Japan keyphrase API."""
    api_url = ('http://jlp.yahooapis.jp/KeyphraseService/V1/extract'
               '?appid=%s&sentence=%s') % (self.yahoo_japan_app_id, text)
    result_page = WebPage(api_url)
    result_page.fetch_xml()
    return result_page.pick_key_phrases()
def get_book_appointment_page_url():
    """Return the absolute URL of the booking page linked from the user's
    applications page (first <a> tag, '..' prefix stripped)."""
    applications_page = WebPage(
        url="%s%s" % (UKVISA_BASE_URL, USER_APPLICATIONS_PATH), handler=s)
    doc = pq(applications_page.content())
    relative_href = doc('a').attr('href').replace('..', '')
    return "%s%s" % (UKVISA_BASE_URL, relative_href)
def search_in_clueweb_with_expanded_query():
    """Flask view: expand the posted action/hint words with Google-related
    words, run one ClueWeb (Solr) query per expanded query, and render the
    line clusters found around the action word in every result page."""
    search_engine = SearchEngine()
    search_engine.action_word = request.form['action_word']
    search_engine.hint_word = request.form['hint_word']
    search_engine.find_related_action_words_with_google()
    search_engine.count_action_words()
    search_engine.sort_action_words_count()
    search_engine.pick_sorted_action_words_more_than_1_count()
    results = []
    for elem in search_engine.sorted_action_words_more_than_1_count:
        elem['expanded_query'] = search_engine.action_word + ' ' + search_engine.hint_word + ' ' + elem['word']
        url = 'http://karen.dl.local:8983/solr/ClueWeb09ja/select?q=' + elem['expanded_query'] + '&wt=xml'
        web_page = WebPage(url)
        web_page.fetch_xml()
        web_page.pick_texts_to_result_pages()
        # Each query yields one result XML page; every entry in it becomes a
        # WebPage object held in web_page.result_pages.
        for result_page in web_page.result_pages:
            # result_page.text_body
            result_page.set_lines_from_texts()
            result_page.set_line_nums_with_word(search_engine.action_word)
            result_page.set_line_nums_around_action_word()
            result_page.set_line_clusters_around_action_word()
        # web_page.result_pages[0].line_clusters_around_action_word
        results.append({'pages': web_page.result_pages, 'expanded_query': elem['expanded_query']})
    return render_template('search_in_clueweb_with_expanded_query.tmpl', results=results)
def find_matched_words_from_yahoo_ads():
    """Flask view: scrape Yahoo! sponsored-search ads for the posted query and
    render the characteristic words found in them ('naradeha' patterns plus
    bracketed words, minus marketing stop words)."""
    query = request.form['query']
    # Sponsored search shows more ads for one whole sentence than for
    # space-separated words, so the query is passed through unmodified.
    head = 'http://search.yahoo.co.jp/search/ss?p='
    tail = '&ei=UTF-8&fr=top_ga1_sa&type=websearch&x=drt'
    y_ad_page = WebPage(head + query + tail)
    y_ad_page.fetch_html()
    y_ad_page.fetch_ads()

    naradeha_results = []
    bracket_words = []
    for ad in y_ad_page.ads:
        ad.fetch_link_title()
        naradeha_results.extend(ad.pick_characteristic_words())
        bracket_words.extend(ad.pick_bracket_words())
    # naradeha_results => [{'なら': {'before': ['。', 'あの', '今石洋之']}}]
    # bracket_words => ['アスコルビン酸', 'メルトダウン']

    # Marketing boilerplate never reported as a result word.
    # FIX: the original listed '公式' twice and '0'-'3' both literally and via
    # the digit loop; duplicates removed, membership semantics unchanged.
    stop_words = ['公式', '楽天', '当日', 'お急ぎ便', 'ココ', 'ここ', 'これ', 'コレ',
                  'こちら', '購入', '人気', '詳細', '送料無料', '配送無料', '価格',
                  '激安', '無料', 'アマゾン', 'ヤフオク']
    stop_words.extend(str(num) for num in range(10))

    results = naradeha_words_to_results(naradeha_results, stop_words)
    for bracket_word in bracket_words:
        # Drop any bracketed word containing a stop word as a substring.
        if any(stop_word in bracket_word for stop_word in stop_words):
            continue
        results.append(bracket_word)
    return render_template('words.tmpl', words=results)
def bing_search(self, query, num):
    """Query the Bing Search API with a quoted query, following '__next'
    paging links, and return google_search-shaped WebPage objects."""
    key = self.microsoft_api_key
    url = 'https://api.datamarket.azure.com/Bing/Search/Web?'
    json_param = '&$format=json'
    param = {
        'Query': "'" + query + "'",
    }
    req_url = url + urllib.parse.urlencode(param)
    items = []
    for _ in range(num):
        try:
            json_body = requests.get(req_url + json_param,
                                     auth=(key, key)).json()
            items.extend(json_body['d']['results'])
            # KeyError here (no '__next') means there are no further pages.
            req_url = json_body['d']['__next']
        except Exception:
            # BUG FIX: extend(dict) added the dict's keys ('Url', 'Title') as
            # bare strings; also break instead of retrying the same URL and
            # appending a placeholder on every remaining iteration.
            items.append({'Url': '#', 'Title': '検索できませんでした',
                          'Description': ''})
            break

    pages = []
    for item in items:
        page = WebPage(item['Url'])  # attribute names unified with google_search
        page.title = item['Title']
        page.snippet = item.get('Description', '')
        pages.append(page)
    return pages
def find_related_action_words_from_clueweb(self):
    """Search ClueWeb with the solr query and collect related action words
    from every returned text."""
    self.set_solr_query()
    # clue_web_search returns ~20 texts, e.g. ['大学入学', 'aaaa', ...]
    for text in self.clue_web_search(self.solr_query):
        placeholder_page = WebPage('unknown')
        self.add_to_results_if_key_phrase_present(text, placeholder_page)
def test_set_line_nums_around_action_word(self):
    """Lines adjacent to a matching line are included in the surrounding set."""
    result_page = WebPage()
    result_page.lines = ['aa', 'bbbb', 'ccccc', 'ddddd', 'aaaaa', 'eeeee']
    result_page.set_line_nums_with_word('a')
    result_page.set_line_nums_around_action_word()
    expected = {0, 1, 3, 4, 5}
    self.assertEqual(result_page.line_nums_around_action_word, expected)
def test_mashou_sentence(self):
    """A '〜ましょう' sentence yields an object term plus a base-form predicate."""
    page = WebPage('http://home.e05.itscom.net/mizuki/masako/bedmake.htm')
    page.text = '1.トイレの便座も一度拭きましょう!'
    page.set_sentences_from_text()
    page.set_tasks_from_sentences()
    first_task = page.tasks[0]
    self.assertEqual(first_task.object_term.name, 'トイレの便座')
    self.assertEqual(first_task.predicate_term, '拭く')
def load_html_files_with_query(query):
    """Load '<query>_<i>.html' files from the current directory into WebPage
    objects (html_body only).

    Returns a list of constants.NUM_OF_FETCHED_PAGES WebPage objects.

    NOTE(review): encoding='utf-8' added — the fixture HTML elsewhere in this
    project is read as UTF-8, while the original relied on the platform
    default encoding; confirm the fetched files are indeed UTF-8.
    """
    pages = []
    for i in range(constants.NUM_OF_FETCHED_PAGES):
        with open('%s_%s.html' % (query, i), 'r', encoding='utf-8') as f:
            page = WebPage()
            page.html_body = f.read()
            pages.append(page)
    return pages
def test_combine_nouns(self):
    """Runs of consecutive nouns are merged into a single compound m-word."""
    page = WebPage()

    combined = page.combine_nouns(page.to_m_words('親子決戦試合'))
    self.assertEqual(combined[0].name, '親子決戦試合')

    combined = page.combine_nouns(page.to_m_words('そして勝敗決定戦に'))
    self.assertEqual(combined[1].name, '勝敗決定戦')
def test_set_descendants_with_simple_html(self):
    """An h1 node containing one embedded h2 gets exactly one child carrying
    the trailing html, the h2 heading type, and the h2 title."""
    h1_node = Node('bbbbb<h2>cccc</h2>dddd', 'h1')
    h1_node.set_descendants()
    child = h1_node.children[0]
    self.assertEqual(child.html_body, 'dddd')
    self.assertEqual(child.heading_type, 'h2')
    self.assertEqual(child.heading_title, 'cccc')
    self.assertEqual(len(h1_node.children), 1)
    # NOTE(review): this assignment looks like a leftover from another test —
    # nothing in this method uses it; kept to preserve behavior exactly.
    self.naver_hay_fever_page = WebPage('http://matome.naver.jp/topic/1LzuV')
def test_set_clusters_around_action_word(self):
    """Contiguous runs of line numbers become clusters of their line texts."""
    result_page = WebPage()
    result_page.lines = list('abcdefghijkl')
    result_page.line_nums_around_action_word = {0, 1, 3, 4, 5, 9, 10}
    result_page.set_line_clusters_around_action_word()
    self.assertEqual(result_page.line_clusters_around_action_word,
                     [['a', 'b'], ['d', 'e', 'f'], ['j', 'k']])
def load_html_files_with_query(query):
    """Load the single '<query>.txt' file into a tag-stripped WebPage.

    Returns a one-element list so callers of the multi-page loaders keep
    working unchanged.

    FIX: the original wrapped the body in `for i in range(1)` — a
    single-iteration loop with an unused index — removed without any
    behavior change. NOTE(review): encoding pinned to utf-8 for consistency
    with the UTF-8 fixtures elsewhere; previously platform default.
    """
    pages = []
    # File format changed: one plain .txt file per query instead of numbered html.
    with open('%s.txt' % query, 'r', encoding='utf-8') as f:
        page = WebPage()
        page.html_body = f.read()
        page.remove_html_tags()
        pages.append(page)
    return pages
def scrape_from_nanapi_and_build_heading_tree():
    """Flask view: search nanapi for the posted query and render a heading
    tree for every article in the results."""
    query_url = 'http://nanapi.jp/search/q:' + request.form['query']
    search_page = WebPage(query_url)
    search_page.fetch_html()

    results = []
    for url in search_page.find_urls_from_nanapi_search_result():
        # Each url points at one nanapi article.
        article = WebPage(url)
        article.fetch_html()
        article.set_title()
        article.build_heading_tree()
        results.append({'title': article.title,
                        'nodes': article.top_nodes,
                        'url': article.url})
    return render_template('headings_and_li_texts.tmpl', results=results)
def scrape_from_nanapi():
    """Flask view: search nanapi and extract one heading-based task per
    article found for the posted query."""
    query_url = 'http://nanapi.jp/search/q:' + request.form['query']
    search_page = WebPage(query_url)
    search_page.fetch_html()

    tasks = []
    for url in search_page.find_urls_from_nanapi_search_result():
        # Each url points at one nanapi article.
        article = WebPage(url)
        article.fetch_html()
        # Resulting task steps look like:
        #   task_steps[0].h2 => 'はじめに'; task_steps[0].h3s[0] => 'はじめに'
        tasks.append(article.find_task_from_nanapi_with_headings())
    # tasks => [task, task, ...]
    return render_template('nanapi_tasks.tmpl', tasks=tasks)
def test_05_web(self):
    """Baidu.com"""
    driver = self.driver
    switch_to_native(driver)
    web_po = WebPage(driver)
    web_po.baidu_button()
    sleep(3)
    insert_img(driver, 'baidu')
    # Search runs inside the webview context.
    switch_to_webview(driver)
    web_po.search_input('macaca')
    sleep(3)
    web_po.search_button()
def load_html_files():
    """Load the fetched pages for constants.QUERY from the current directory.

    Assumes the process cwd is the directory holding the files (as the
    original docstring noted).
    """
    pages = []
    for i in range(constants.NUM_OF_FETCHED_PAGES):
        # BUG FIX: the original format string '%s.txt' was given a two-element
        # tuple (QUERY, str(i)), which raises TypeError ("not all arguments
        # converted"). The files are numbered per page, matching the
        # '%s_%s' pattern used by the query-based loader.
        with open('%s_%s.txt' % (constants.QUERY, i), 'r') as f:
            page = WebPage()
            page.html_body = f.read()
            page.remove_html_tags()
            pages.append(page)
    return pages
def clue_web_search(self, query):
    """Run one ClueWeb (Solr) search and return the result texts as a list.

    Change rows=50 below to alter how many results come back.
    """
    options = '&rows=50'
    search_url = '%s%s%s%s' % (constants.CLUE_WEB_URL_HEAD, query,
                               options, constants.CLUE_WEB_URL_TAIL)
    result_page = WebPage(search_url)
    result_page.fetch_xml()
    result_page.pick_texts()
    return result_page.texts
def test_part_of_task_clusters(self):
    """Tasks repeated within a single page form one part-of cluster and no
    instance-of clusters."""
    page = WebPage(url='somewhere', query='カメラ 買う')
    page.text = ('ヨドバシカメラに行く必要があります。お金を払ってください。'
                 'ヨドバシカメラに行く必要があります。お金を払ってください。')
    answerer = TaskGraphFirstAnswerer(graph=self.build_graph([page]),
                                      query_task='カメラ_買う')
    answerer.set_result_tasks()

    self.assertEqual(answerer.part_of_task_clusters,
                     [{'お金_払う', 'ヨドバシカメラ_行く'}])
    self.assertEqual(answerer.instance_of_task_clusters, [])
def result_pages(self, page_num=50):
    """Run the search and convert raw result items into WebPage objects.

    Plain-string items (error placeholders from the search layer) are
    skipped. Attribute names follow the google-style result shape.
    """
    pages = []
    for item in self._search(page_num):
        # Idiom fix: isinstance instead of `type(item) == str`.
        if isinstance(item, str):
            continue
        page = WebPage(item['Url'])
        page.query = self.query
        page.title = item['Title']
        page.snippet = item['Description']
        pages.append(page)
    return pages
def bing_search(self):
    """Search Bing for self.query and return google_search-shaped WebPage
    objects.

    Plain-string items (error placeholders) are skipped.
    """
    bing = Bing(my_keys.MICROSOFT_API_KEY)
    items = bing.web_search(self.query, 50, ['Title', 'Url', 'Description'])
    pages = []
    for item in items:
        # Idiom fix: isinstance instead of `type(item) == str`.
        if isinstance(item, str):
            continue
        page = WebPage(item['Url'])  # attribute names unified with google_search
        page.query = self.query
        page.title = item['Title']
        page.snippet = item['Description']
        pages.append(page)
    return pages
def yahoo_sponsored_results():
    """Bottle view: collect verbs and sahen nouns from the titles and snippets
    of Yahoo! sponsored-search ads for the posted query, render them ranked."""
    query = request.forms.decode().get('query')
    # Sponsored search returns more ads for one whole sentence than for
    # space-separated words.
    head = 'http://search.yahoo.co.jp/search/ss?p='
    tail = '&ei=UTF-8&fr=top_ga1_sa&type=websearch&x=drt'
    ads = WebPage(head + query + tail).fetch_ads()

    verbs_and_sahens = []
    for ad in ads:
        for text in (ad.title, ad.snippet):
            verbs_and_sahens.extend(ad.pick_verbs(text))
            verbs_and_sahens.extend(ad.pick_sahens(text))

    return ad_template.render(items=to_ranked_items(verbs_and_sahens))
def test_slice_after_dots(self):
    """Everything up to and including the last punctuation mark ('、' or '。')
    is removed from the sentence."""
    page = WebPage()
    cases = [
        ('あいうえお、かきくけこさしすせそ', 'かきくけこさしすせそ'),
        ('あいうえお、かきくけこ、さしすせそ', 'さしすせそ'),
        ('あいうえお、かきくけこ。さしすせそ', 'さしすせそ'),
        ('あいうえお。かきくけこ、さしすせそ', 'さしすせそ'),
    ]
    for sentence, expected in cases:
        self.assertEqual(page.slice_after_dots(sentence), expected)