예제 #1
0
def find_matched_words_from_yahoo_ads():
    """Fetch Yahoo sponsored-search ads for the POSTed query and render
    characteristic words extracted from them.

    Two kinds of candidates are collected from each ad:
      - "naradeha" pattern results, e.g.
        [{'なら': {'before': ['。', 'あの', '今石洋之']}}]
      - bracketed words, e.g. ['アスコルビン酸', 'メルトダウン']
    Candidates containing any stop word are filtered out before rendering
    the 'words.tmpl' template.
    """
    query = request.form['query']
    # Yahoo sponsored search shows ads more readily for one whole sentence
    # than for space-separated words, so the query is passed through as-is.
    head = 'http://search.yahoo.co.jp/search/ss?p='
    tail = '&ei=UTF-8&fr=top_ga1_sa&type=websearch&x=drt'
    url = head + query + tail
    y_ad_page = WebPage(url)
    y_ad_page.fetch_html()
    y_ad_page.fetch_ads()
    naradeha_results = []
    bracket_words = []
    for ad in y_ad_page.ads:
        ad.fetch_link_title()
        naradeha_results.extend(ad.pick_characteristic_words())
        bracket_words.extend(ad.pick_bracket_words())

    # NOTE(review): the original list contained '公式' twice and the digits
    # '0'-'3', which the digit loop below duplicated; deduplicated here —
    # membership semantics are unchanged.
    stop_words = ['公式', '楽天', '当日', 'お急ぎ便', 'ココ', 'ここ', 'これ', 'コレ', 'こちら', '購入', '人気', '詳細', '送料無料', '配送無料', '価格', '激安', '無料', 'アマゾン', 'ヤフオク']
    stop_words.extend(str(num) for num in range(10))
    results = naradeha_words_to_results(naradeha_results, stop_words)

    # Keep only bracket words that contain no stop word as a substring.
    for bracket_word in bracket_words:
        if not any(stop_word in bracket_word for stop_word in stop_words):
            results.append(bracket_word)

    return render_template('words.tmpl', words=results)
예제 #2
0
def scrape_from_nanapi_and_build_heading_tree():
    """Search nanapi for the POSTed query and render one heading tree per hit.

    Every article found on the search-result page is fetched, its title is
    set and its heading hierarchy built; the collected trees are rendered
    with the 'headings_and_li_texts.tmpl' template.
    """
    query = request.form['query']
    search_page = WebPage('http://nanapi.jp/search/q:' + query)
    search_page.fetch_html()
    results = []
    for article_url in search_page.find_urls_from_nanapi_search_result():
        # Each URL points at a single nanapi article.
        article = WebPage(article_url)
        article.fetch_html()
        article.set_title()
        article.build_heading_tree()
        results.append({
            'title': article.title,
            'nodes': article.top_nodes,
            'url': article.url,
        })
    return render_template('headings_and_li_texts.tmpl', results=results)
예제 #3
0
def scrape_from_nanapi():
    """Search nanapi for the POSTed query and render the extracted tasks.

    Each search hit is fetched and turned into a task (steps grouped under
    headings, e.g. task[0].h2 => 'はじめに', task[0].h3s[0] => 'はじめに');
    the tasks are rendered with the 'nanapi_tasks.tmpl' template.
    """
    query = request.form['query']
    search_page = WebPage('http://nanapi.jp/search/q:' + query)
    search_page.fetch_html()
    article_urls = search_page.find_urls_from_nanapi_search_result()
    tasks = []
    for article_url in article_urls:
        # Each URL is one nanapi article.
        article = WebPage(article_url)
        article.fetch_html()
        tasks.append(article.find_task_from_nanapi_with_headings())
    return render_template('nanapi_tasks.tmpl', tasks=tasks)
예제 #4
0
def yahoo_sponsored_results():
    """Fetch Yahoo sponsored-search ads for the POSTed query and render
    the nouns and verbs from the ad titles and snippets, ranked by
    to_ranked_items, via the 'find_words_with_yahoo_ads.tmpl' template.
    """
    query = request.form['query']
    # Yahoo sponsored search shows ads more readily for one whole sentence
    # than for space-separated words, so the query is passed through as-is.
    head = 'http://search.yahoo.co.jp/search/ss?p='
    tail = '&ei=UTF-8&fr=top_ga1_sa&type=websearch&x=drt'
    y_ad_page = WebPage(head + query + tail)
    y_ad_page.fetch_html()
    y_ad_page.fetch_ads()
    # NOTE(review): dropped the unused SearchEngine instance and the
    # commented-out key-phrase collection that was its only (dead) caller.
    result_words = []
    for ad in y_ad_page.ads:
        result_words.extend(ad.pick_nouns_and_verbs(ad.title))
        result_words.extend(ad.pick_nouns_and_verbs(ad.snippet))
    results = to_ranked_items(result_words)
    return render_template('find_words_with_yahoo_ads.tmpl',
        items=results)
from bing_api import Bing
import os
import constants
from web_page import WebPage

if __name__ == '__main__':
    # Fetch constants.NUM_OF_FETCHED_PAGES Bing results for constants.QUERY
    # and save each page's HTML as '<QUERY>_<index>.html' in
    # constants.FETCHED_PAGES_DIR_NAME.
    bing = Bing()
    if not os.path.exists(constants.FETCHED_PAGES_DIR_NAME):
        os.mkdir(constants.FETCHED_PAGES_DIR_NAME)
    os.chdir(constants.FETCHED_PAGES_DIR_NAME)
    results = bing.web_search(query=constants.QUERY,
            num_of_results=constants.NUM_OF_FETCHED_PAGES,
            keys=['Url'])
    for i, result in enumerate(results):
        page = WebPage(result['Url'])
        page.fetch_html()
        # Context manager guarantees the handle is closed even if write()
        # raises (the original open/close pair leaked on error).
        with open('%s_%s.html' % (constants.QUERY, i), 'w') as f:
            f.write(page.html_body)
예제 #6
0
from bing_api import Bing
import os
import constants
from web_page import WebPage

if __name__ == '__main__':
    # Fetch constants.NUM_OF_FETCHED_PAGES Bing results for constants.QUERY
    # and save each page's HTML as '<QUERY>_<index>.html' in
    # constants.FETCHED_PAGES_DIR_NAME.
    bing = Bing()
    if not os.path.exists(constants.FETCHED_PAGES_DIR_NAME):
        os.mkdir(constants.FETCHED_PAGES_DIR_NAME)
    os.chdir(constants.FETCHED_PAGES_DIR_NAME)
    results = bing.web_search(query=constants.QUERY,
                              num_of_results=constants.NUM_OF_FETCHED_PAGES,
                              keys=['Url'])
    for i, result in enumerate(results):
        page = WebPage(result['Url'])
        page.fetch_html()
        # Context manager guarantees the handle is closed even if write()
        # raises (the original open/close pair leaked on error).
        with open('%s_%s.html' % (constants.QUERY, i), 'w') as f:
            f.write(page.html_body)