def find_matched_words_from_yahoo_ads():
    """Render candidate words mined from Yahoo sponsored-search ads.

    Reads ``query`` from the submitted form, fetches the Yahoo! JAPAN
    sponsored-search page for it and, for every ad on that page, collects
    the output of ``pick_characteristic_words()`` and ``pick_bracket_words()``.
    The characteristic words are converted by ``naradeha_words_to_results``;
    bracket words are appended afterwards unless they contain a stop word.

    Returns:
        The rendered ``words.tmpl`` template with ``words`` set to the
        combined results.
    """
    # The query is sent as one phrase: sponsored search shows ads more
    # readily for a whole sentence than for individually split words.
    # NOTE(review): the query is concatenated into the URL as-is; confirm
    # whether WebPage percent-encodes it before fetching.
    search_url = (
        'http://search.yahoo.co.jp/search/ss?p='
        + request.form['query']
        + '&ei=UTF-8&fr=top_ga1_sa&type=websearch&x=drt'
    )
    ad_page = WebPage(search_url)
    ad_page.fetch_html()
    ad_page.fetch_ads()

    characteristic_words = []
    bracketed = []
    for ad in ad_page.ads:
        ad.fetch_link_title()
        characteristic_words.extend(ad.pick_characteristic_words())
        bracketed.extend(ad.pick_bracket_words())
    # Shapes, per the original author's examples:
    #   characteristic_words => [{'なら': {'before': ['。', 'あの', '今石洋之']}}]
    #   bracketed            => ['アスコルビン酸', 'メルトダウン']

    # Fixed stop words followed by every single digit '0'..'9'.
    stop_words = [
        '公式', '楽天', '当日', 'お急ぎ便', 'ココ', 'ここ', 'これ', 'コレ',
        'こちら', '公式', '購入', '人気', '詳細', '送料無料', '配送無料',
        '価格', '激安', '無料', 'アマゾン', 'ヤフオク', '0', '1', '2', '3',
    ] + [str(digit) for digit in range(10)]

    results = naradeha_words_to_results(characteristic_words, stop_words)
    for word in bracketed:
        # Drop a bracket word as soon as any stop word occurs inside it.
        if any(stop_word in word for stop_word in stop_words):
            continue
        results.append(word)
    return render_template('words.tmpl', words=results)
def scrape_from_nanapi_and_build_heading_tree():
    """Render the heading tree of each nanapi article found for the query.

    Reads ``query`` from the submitted form, fetches the nanapi search
    result page for it, then fetches every article URL found there. For
    each article the title is set and the heading tree is built, and its
    ``title``, ``top_nodes`` and ``url`` are collected.

    Returns:
        The rendered ``headings_and_li_texts.tmpl`` template with
        ``results`` set to a list of ``{'title', 'nodes', 'url'}`` dicts.
    """
    search_page = WebPage('http://nanapi.jp/search/q:' + request.form['query'])
    search_page.fetch_html()

    results = []
    for article_url in search_page.find_urls_from_nanapi_search_result():
        # Each URL is a single nanapi article.
        article = WebPage(article_url)
        article.fetch_html()
        article.set_title()
        article.build_heading_tree()
        results.append({
            'title': article.title,
            'nodes': article.top_nodes,
            'url': article.url,
        })
    return render_template('headings_and_li_texts.tmpl', results=results)
def scrape_from_nanapi():
    """Render the tasks extracted from nanapi articles matching the query.

    Reads ``query`` from the submitted form, fetches the nanapi search
    result page for it, then fetches every article URL found there and
    extracts one task per article with
    ``find_task_from_nanapi_with_headings()``.

    Returns:
        The rendered ``nanapi_tasks.tmpl`` template with ``tasks`` set to
        the list of extracted tasks, in search-result order.
    """
    listing_url = 'http://nanapi.jp/search/q:' + request.form['query']
    listing = WebPage(listing_url)
    listing.fetch_html()

    tasks = []
    for article_url in listing.find_urls_from_nanapi_search_result():
        # Each URL is a single nanapi article.
        article = WebPage(article_url)
        article.fetch_html()
        # Per the original author's notes a task is a list of task steps,
        # e.g. task[0].h2 => 'はじめに' and task[0].h3s[0] => 'はじめに'.
        tasks.append(article.find_task_from_nanapi_with_headings())
    return render_template('nanapi_tasks.tmpl', tasks=tasks)
def yahoo_sponsored_results():
    """Render ranked nouns and verbs taken from Yahoo sponsored-search ads.

    Reads ``query`` from the submitted form, fetches the Yahoo! JAPAN
    sponsored-search page for it, and runs ``pick_nouns_and_verbs`` over the
    title and the snippet of every ad. The collected words are passed
    through ``to_ranked_items`` before rendering.

    Returns:
        The rendered ``find_words_with_yahoo_ads.tmpl`` template with
        ``items`` set to the ranked words.
    """
    # The query is sent as one phrase: sponsored search shows ads more
    # readily for a whole sentence than for individually split words.
    # NOTE(review): the query is concatenated into the URL as-is; confirm
    # whether WebPage percent-encodes it before fetching.
    query = request.form['query']
    url = ('http://search.yahoo.co.jp/search/ss?p=' + query
           + '&ei=UTF-8&fr=top_ga1_sa&type=websearch&x=drt')
    ad_page = WebPage(url)
    ad_page.fetch_html()
    ad_page.fetch_ads()

    # NOTE(review): the engine instance is never used in this function (the
    # key-phrase extraction that needed it is gone). The construction is
    # kept only in case it has side effects -- confirm and remove.
    SearchEngine()

    words = []
    for ad in ad_page.ads:
        # Title first, then snippet, so the word order is unchanged.
        for text in (ad.title, ad.snippet):
            words.extend(ad.pick_nouns_and_verbs(text))

    return render_template('find_words_with_yahoo_ads.tmpl',
                           items=to_ranked_items(words))
from bing_api import Bing
import os
import constants
from web_page import WebPage


def main():
    """Fetch the Bing results for ``constants.QUERY`` and save each page.

    Creates ``constants.FETCHED_PAGES_DIR_NAME`` if it does not exist,
    changes into it, runs a Bing web search for ``constants.QUERY`` asking
    for ``constants.NUM_OF_FETCHED_PAGES`` results, and writes the HTML body
    of every result to ``<QUERY>_<index>.html`` in that directory.
    """
    bing = Bing()

    if not os.path.exists(constants.FETCHED_PAGES_DIR_NAME):
        os.mkdir(constants.FETCHED_PAGES_DIR_NAME)
    # Output files are written with relative names, so work inside the
    # target directory.
    os.chdir(constants.FETCHED_PAGES_DIR_NAME)

    results = bing.web_search(query=constants.QUERY,
                              num_of_results=constants.NUM_OF_FETCHED_PAGES,
                              keys=['Url'])
    for index, result in enumerate(results):
        page = WebPage(result['Url'])
        page.fetch_html()
        file_name = '%s_%s.html' % (constants.QUERY, index)
        # The context manager closes the file even when write() raises.
        with open(file_name, 'w') as out_file:
            out_file.write(page.html_body)


if __name__ == '__main__':
    main()