예제 #1
0
    def google_search(self, query, num):
        """Fetch up to ``num`` result pages from Google Custom Search for
        ``query`` and return them wrapped as WebPage objects with ``title``
        and ``snippet`` populated.
        """
        url = 'https://www.googleapis.com/customsearch/v1?'
        params = {
            'key': self.google_api_key,
            'q': query,
            'cx': '013036536707430787589:_pqjad5hr1a',
            'alt': 'json',
            'lr': 'lang_ja',
        }
        start = 1
        items = []

        for _ in range(num):
            params['start'] = start
            request_url = url + urllib.parse.urlencode(params)
            # Catch only the errors this request can realistically raise;
            # the original bare ``except`` also hid programming errors.
            try:
                response = urllib.request.urlopen(request_url)
                json_body = json.loads(response.read().decode('utf-8'))
                items.extend(json_body['items'])
                if 'nextPage' not in json_body['queries']:
                    break
                start = json_body['queries']['nextPage'][0]['startIndex']
            except (urllib.error.URLError, ValueError, KeyError):
                # BUG FIX: ``extend`` on a dict spread its KEYS (plain
                # strings) into ``items``; append one placeholder dict and
                # include 'snippet' because it is read below.
                items.append({'link': '#', 'title': '検索できませんでした',
                              'snippet': ''})
        pages = []
        for item in items:
            page = WebPage(item['link'])
            page.title = item['title']
            page.snippet = item['snippet']
            pages.append(page)
        return pages  # => [WebPage('http://...'), ...]
예제 #2
0
    def google_search(self, query, num):
        """Fetch up to ``num`` result pages from Google Custom Search for
        ``query`` and return them wrapped as WebPage objects with ``title``
        and ``snippet`` populated.
        """
        url = 'https://www.googleapis.com/customsearch/v1?'
        params = {
            'key': self.google_api_key,
            'q': query,
            'cx': '013036536707430787589:_pqjad5hr1a',
            'alt': 'json',
            'lr': 'lang_ja',
        }
        start = 1
        items = []

        for _ in range(num):
            params['start'] = start
            request_url = url + urllib.parse.urlencode(params)
            # Catch only the errors this request can realistically raise;
            # the original bare ``except`` also hid programming errors.
            try:
                response = urllib.request.urlopen(request_url)
                json_body = json.loads(response.read().decode('utf-8'))
                items.extend(json_body['items'])
                if 'nextPage' not in json_body['queries']:
                    break
                start = json_body['queries']['nextPage'][0]['startIndex']
            except (urllib.error.URLError, ValueError, KeyError):
                # BUG FIX: ``extend`` on a dict spread its KEYS (plain
                # strings) into ``items``; append one placeholder dict and
                # include 'snippet' because it is read below.
                items.append({'link': '#', 'title': '検索できませんでした',
                              'snippet': ''})
        pages = []
        for item in items:
            page = WebPage(item['link'])
            page.title = item['title']
            page.snippet = item['snippet']
            pages.append(page)
        return pages  # => [WebPage('http://...'), ...]
예제 #3
0
 def bing_search(self, query, num):
     """Query the Bing Azure datamarket API up to ``num`` result pages deep
     and return the hits as WebPage objects (title/snippet filled in).
     """
     key = self.microsoft_api_key
     url = 'https://api.datamarket.azure.com/Bing/Search/Web?'
     json_param = '&$format=json'
     param = {
         'Query': "'" + query + "'",
     }
     req_url = url + urllib.parse.urlencode(param)
     items = []
     for _ in range(num):
         # Catch only plausible request/shape errors; a bare ``except``
         # also hides programming errors.
         try:
             json_body = requests.get(req_url + json_param,
                                      auth=(key, key)).json()
             items.extend(json_body['d']['results'])
             req_url = json_body['d']['__next']
         except (requests.RequestException, KeyError, ValueError):
             # BUG FIX: ``extend`` on a dict spread its KEYS (strings) into
             # ``items``; append one placeholder dict, with 'Description'
             # since it is read below.
             items.append({'Url': '#', 'Title': '検索できませんでした',
                           'Description': ''})
     pages = []
     for item in items:
         page = WebPage(item['Url'])
         # Keep attribute names consistent with google_search.
         page.title = item['Title']
         page.snippet = item['Description']
         pages.append(page)
     return pages
예제 #4
0
 def yahoo_key_phrase(self, text):
     """Extract key phrases from ``text`` via the Yahoo! Japan keyphrase API.

     BUG FIX: ``text`` (typically Japanese) is now percent-encoded; the raw
     interpolation produced an invalid URL for non-ASCII input.
     """
     from urllib.parse import quote
     url = 'http://jlp.yahooapis.jp/KeyphraseService/V1/extract?appid=%s&sentence=%s' % (
         self.yahoo_japan_app_id, quote(text))
     result_page = WebPage(url)
     result_page.fetch_xml()
     key_phrases = result_page.pick_key_phrases()
     return key_phrases
예제 #5
0
 def bing_search(self, query, num):
     """Search Bing (Azure datamarket) up to ``num`` pages deep and return
     the hits as WebPage objects (title/snippet filled in).
     """
     key = self.microsoft_api_key
     url = 'https://api.datamarket.azure.com/Bing/Search/Web?'
     json_param = '&$format=json'
     param = {'Query': query}
     request_url = url + urllib.parse.urlencode(param) + json_param
     items = []
     for _ in range(num):
         # Catch only plausible request/shape errors instead of a bare
         # ``except`` that also hides programming errors.
         try:
             json_body = requests.get(request_url,
                                      auth=(key, key),
                                      headers={
                                          'User-Agent': 'My API Robot'
                                      }).json()
             items.extend(json_body['d']['results'])
             request_url = json_body['d']['__next']
         except (requests.RequestException, KeyError, ValueError):
             # BUG FIX: ``extend`` on a dict spread its KEYS (strings) into
             # ``items``; append one placeholder dict with 'Description'
             # since it is read below.
             items.append({'Url': '#', 'Title': '検索できませんでした',
                           'Description': ''})
     pages = []
     for item in items:
         if isinstance(item, str):  # defensive: skip stray strings
             continue
         page = WebPage(item['Url'])
         # Keep attribute names consistent with google_search.
         page.title = item['Title']
         page.snippet = item['Description']
         pages.append(page)
     return pages
예제 #6
0
def get_book_appointment_page_url():
    """Scrape the user-applications page and return the absolute URL of the
    book-appointment link found on it."""
    applications_url = "%s%s" % (UKVISA_BASE_URL, USER_APPLICATIONS_PATH)
    applications_page = WebPage(url=applications_url, handler=s)

    doc = pq(applications_page.content())
    relative_href = doc('a').attr('href').replace('..', '')
    return "%s%s" % (UKVISA_BASE_URL, relative_href)
예제 #7
0
    def setUp(self):
        """Build WebPage fixtures from local HTML files.

        Uses ``with open`` so the file handles are closed even when a read
        raises (the original open/read/close sequence leaked on error).
        """
        self.nanapi_article_page = WebPage()
        with open('test_support/nanapi.html', encoding='utf-8') as f:
            self.nanapi_article_page.html_body = f.read()
        self.nanapi_article_page.url = 'http://nanapi.jp'

        self.nanapi_hay_fever_page = WebPage('http://nanapi.jp')
        with open('test_support/nanapi_hay_fever.html',
                  encoding='utf-8') as f:
            self.nanapi_hay_fever_page.html_body = f.read()

        self.gow_marriage_page = WebPage()
        with open('test_support/gow.html', encoding='utf-8') as f:
            self.gow_marriage_page.html_body = f.read()
        self.gow_marriage_page.url = 'http://magazine.gow.asia/love/column_details.php?column_uid=00000082'

        self.kanemoti_page = WebPage()
        with open('test_support/kanemotilevel.html',
                  encoding='utf-8') as f:
            self.kanemoti_page.html_body = f.read()
예제 #8
0
 def bing_search(self, query, num):
     """Query the Bing Azure datamarket API up to ``num`` result pages deep
     and return the hits as WebPage objects (title/snippet filled in).
     """
     key = self.microsoft_api_key
     url = 'https://api.datamarket.azure.com/Bing/Search/Web?'
     json_param = '&$format=json'
     param = {
         'Query': "'" + query + "'",
     }
     req_url = url + urllib.parse.urlencode(param)
     items = []
     for _ in range(num):
         # Catch only plausible request/shape errors; a bare ``except``
         # also hides programming errors.
         try:
             json_body = requests.get(req_url + json_param, auth=(key, key)).json()
             items.extend(json_body['d']['results'])
             req_url = json_body['d']['__next']
         except (requests.RequestException, KeyError, ValueError):
             # BUG FIX: ``extend`` on a dict spread its KEYS (strings) into
             # ``items``; append one placeholder dict, with 'Description'
             # since it is read below.
             items.append({'Url': '#', 'Title': '検索できませんでした',
                           'Description': ''})
     pages = []
     for item in items:
         page = WebPage(item['Url'])
         # Keep attribute names consistent with google_search.
         page.title = item['Title']
         page.snippet = item['Description']
         pages.append(page)
     return pages
예제 #9
0
def load_html_files_with_query(query):
    """Load the previously fetched HTML files for ``query`` into WebPages."""
    pages = []
    for index in range(constants.NUM_OF_FETCHED_PAGES):
        filename = '%s_%s.html' % (query, index)
        with open(filename, 'r') as html_file:
            page = WebPage()
            page.html_body = html_file.read()
        pages.append(page)
    return pages
예제 #10
0
def load_html_files_with_query(query):
    """Load the previously fetched HTML files for ``query`` into WebPages."""
    pages = []
    for index in range(constants.NUM_OF_FETCHED_PAGES):
        filename = '%s_%s.html' % (query, index)
        with open(filename, 'r') as html_file:
            page = WebPage()
            page.html_body = html_file.read()
        pages.append(page)
    return pages
예제 #11
0
파일: utils.py 프로젝트: NIGAYIM/jikken4
def load_html_files_with_query(query):
    """Load the saved text file for ``query`` into one tag-stripped WebPage.

    The storage format changed: a single ``<query>.txt`` file per query.
    """
    pages = []
    for _ in range(1):
        with open('%s.txt' % (query), 'r') as text_file:
            page = WebPage()
            page.html_body = text_file.read()
        page.remove_html_tags()
        pages.append(page)
    return pages
예제 #12
0
    def test_combine_nouns(self):
        """Adjacent nouns should be merged into a single compound noun."""
        page = WebPage()

        combined = page.combine_nouns(page.to_m_words('親子決戦試合'))
        self.assertEqual(combined[0].name, '親子決戦試合')

        combined = page.combine_nouns(page.to_m_words('そして勝敗決定戦に'))
        self.assertEqual(combined[1].name, '勝敗決定戦')
예제 #13
0
 def _get_html_content_by_requests(self, url):
     # Fetch ``url`` (the referer is the configured base URL) and return
     # its HTML body; returns "" on any failure (best effort).
     w = WebPage(url, self.myconfig.url)
     try:
         w.fetch()
         return w.getDatas()[1]
     except Exception:
         # ``except Exception, e`` was Python-2-only syntax and ``e`` was
         # unused; this form works on both Python 2 and 3.
         return ""
예제 #14
0
 def yahoo_key_phrase(self, text):
     """Extract key phrases from ``text`` via the Yahoo! Japan keyphrase API.

     BUG FIX: ``text`` (typically Japanese) is now percent-encoded; the raw
     interpolation produced an invalid URL for non-ASCII input.
     """
     from urllib.parse import quote
     url = "http://jlp.yahooapis.jp/KeyphraseService/V1/extract?appid=%s&sentence=%s" % (
         self.yahoo_japan_app_id,
         quote(text),
     )
     result_page = WebPage(url)
     result_page.fetch_xml()
     key_phrases = result_page.pick_key_phrases()
     return key_phrases
def load_html_files():
    """Load the fetched HTML files for the configured query into
    tag-stripped WebPage objects."""
    pages = []
    for index in range(constants.NUM_OF_FETCHED_PAGES):
        filename = '%s_%s.html' % (constants.QUERY, index)
        with open(filename, 'r') as html_file:
            page = WebPage()
            page.html_body = html_file.read()
        page.remove_html_tags()
        pages.append(page)
    return pages
예제 #16
0
    def test_find_pages(self):
        """End-to-end check of the related-word pipeline: page filtering,
        action-word counting, and sorting of the counts."""
        page_1 = WebPage("http://tradein.nissan.co.jp/")
        page_1.title = "自動車の下取りと売却"
        page_1.snippet = "自動車には下取りをする方法がけっこうある。"

        page_2 = WebPage("http://www.link-nexus.com/")
        page_2.title = "自動車の下取りと販売"
        page_2.snippet = "あばばばばば"

        page_3 = WebPage("http://toyota.jp/service/tradein/dc/top")
        page_3.title = "下取り参考価格情報"
        page_3.snippet = "下取りと販売ですよプロデューサーさん"

        search_engine = SearchEngine()
        search_engine.material_pages = [page_1, page_2, page_3]
        search_engine.hint_word = "自動車"
        search_engine.action_word = "下取り"
        search_engine.find_pages_including_related_words()
        # All three material pages survive the filter, in original order.
        self.assertEqual(search_engine.result_pages[0], page_1)
        self.assertEqual(search_engine.result_pages[1], page_2)
        self.assertEqual(search_engine.result_pages[2], page_3)

        # Counted from the pages' titles/snippets: '販売' twice, '売却' once.
        search_engine.count_action_words()
        self.assertEqual(search_engine.action_words_count, {"販売": 2, "売却": 1})

        # Sorting turns the dict into a count-descending list of records.
        search_engine.sort_action_words_count()
        self.assertEqual(search_engine.sorted_action_words, [{"word": "販売", "count": 2}, {"word": "売却", "count": 1}])
예제 #17
0
    def test_part_of_task_clusters(self):
        # One page containing the same two-sentence text twice: the tasks
        # extracted from it should form a single part-of cluster and no
        # instance-of clusters.
        page = WebPage(url='somewhere', query='カメラ 買う')
        page.text = 'ヨドバシカメラに行く必要があります。お金を払ってください。' \
                    'ヨドバシカメラに行く必要があります。お金を払ってください。'
        graph = self.build_graph([page])
        answerer = TaskGraphFirstAnswerer(graph=graph, query_task='カメラ_買う')
        answerer.set_result_tasks()
        p_clusters = answerer.part_of_task_clusters
        self.assertEqual(p_clusters, [{'お金_払う', 'ヨドバシカメラ_行く'}])

        i_clusters = answerer.instance_of_task_clusters
        self.assertEqual(i_clusters, [])
예제 #18
0
def load_html_files():
    """
    Load the fetched page files into tag-stripped WebPage objects.
    Assumes the current working directory holds the files.
    """
    pages = []
    for i in range(constants.NUM_OF_FETCHED_PAGES):
        # BUG FIX: '%s.txt' % (constants.QUERY, str(i)) raised TypeError
        # (two arguments for one placeholder); the loop clearly intends one
        # file per page index, as in the sibling loaders.
        with open('%s_%s.txt' % (constants.QUERY, i), 'r') as f:
            page = WebPage()
            page.html_body = f.read()
        page.remove_html_tags()
        pages.append(page)
    return pages
예제 #19
0
 def clue_web_search(self, query):
     """Run a ClueWeb (Solr) search for ``query`` and return the hit texts.

     Adjust rows=50 to change how many results come back.
     """
     row_options = "&rows=50"
     request_url = (constants.CLUE_WEB_URL_HEAD + query
                    + row_options + constants.CLUE_WEB_URL_TAIL)
     result_page = WebPage(request_url)
     result_page.fetch_xml()
     result_page.pick_texts()
     return result_page.texts
def load_html_files():
    """
    Load the fetched HTML files into tag-stripped WebPage objects.
    Assumes the current working directory holds the files.
    """
    pages = []
    for index in range(constants.NUM_OF_FETCHED_PAGES):
        filename = "%s_%s.html" % (constants.QUERY, index)
        with open(filename, "r") as html_file:
            page = WebPage()
            page.html_body = html_file.read()
        page.remove_html_tags()
        pages.append(page)
    return pages
예제 #21
0
    def test_part_of_task_clusters(self):
        # One page containing the same two-sentence text twice: the tasks
        # extracted from it should form a single part-of cluster and no
        # instance-of clusters.
        page = WebPage(url='somewhere', query='カメラ 買う')
        page.text = 'ヨドバシカメラに行く必要があります。お金を払ってください。' \
                    'ヨドバシカメラに行く必要があります。お金を払ってください。'
        graph = self.build_graph([page])
        answerer = TaskGraphFirstAnswerer(graph=graph, query_task='カメラ_買う')
        answerer.set_result_tasks()
        p_clusters = answerer.part_of_task_clusters
        self.assertEqual(p_clusters, [{'お金_払う', 'ヨドバシカメラ_行く'}])

        i_clusters = answerer.instance_of_task_clusters
        self.assertEqual(i_clusters, [])
예제 #22
0
 def setUp(self):
     """Create three throwaway WebPages carrying tasks and build the graph."""
     fixtures = [
         ('http://aaa.com', '医師に質問してください。'),
         ('http://bbb.com', '看護師に質問してください。'),
         ('http://ccc.com', '理学療法士に質問してください。'),
     ]
     pages = []
     for page_url, page_text in fixtures:
         page = WebPage(url=page_url, query='職業 質問する')
         page.text = page_text
         pages.append(page)
     self.graph = self.build_graph(pages)
예제 #23
0
 def result_pages(self, page_num=50):
     """Return search hits as WebPage objects carrying the current query.

     ``_search`` may yield placeholder strings among the result dicts;
     those are skipped.
     """
     items = self._search(page_num)
     pages = []
     for item in items:
         # isinstance is the idiomatic type check (was ``type(item) == str``).
         if isinstance(item, str):
             continue
         page = WebPage(item['Url'])
         page.query = self.query
         # Keep attribute names consistent with google_search.
         page.title = item['Title']
         page.snippet = item['Description']
         pages.append(page)
     return pages
예제 #24
0
 def result_pages(self, page_num=50):
     """Return search hits as WebPage objects carrying the current query.

     ``_search`` may yield placeholder strings among the result dicts;
     those are skipped.
     """
     items = self._search(page_num)
     pages = []
     for item in items:
         # isinstance is the idiomatic type check (was ``type(item) == str``).
         if isinstance(item, str):
             continue
         page = WebPage(item['Url'])
         page.query = self.query
         # Keep attribute names consistent with google_search.
         page.title = item['Title']
         page.snippet = item['Description']
         pages.append(page)
     return pages
예제 #25
0
 def bing_search(self):
     """Search Bing for ``self.query`` (50 hits) and return the hits as
     WebPage objects carrying the query.

     ``web_search`` may yield placeholder strings; those are skipped.
     """
     key = my_keys.MICROSOFT_API_KEY
     bing = Bing(key)
     items = bing.web_search(self.query, 50, ['Title', 'Url', 'Description'])
     pages = []
     for item in items:
         # isinstance is the idiomatic type check (was ``type(item) == str``).
         if isinstance(item, str):
             continue
         page = WebPage(item['Url'])
         page.query = self.query
         # Keep attribute names consistent with google_search.
         page.title = item['Title']
         page.snippet = item['Description']
         pages.append(page)
     return pages
예제 #26
0
def yahoo_sponsored_results():
    """Fetch Yahoo! sponsored-search ads for the submitted query and render
    the verbs / sahen-nouns found in them, ranked by frequency."""
    query = request.forms.decode().get('query')
    # A whole sentence draws sponsored ads more reliably than a
    # word-split query.
    url = ('http://search.yahoo.co.jp/search/ss?p=' + query
           + '&ei=UTF-8&fr=top_ga1_sa&type=websearch&x=drt')
    ads = WebPage(url).fetch_ads()
    verbs_and_sahens = []
    for ad in ads:
        for text in (ad.title, ad.snippet):
            verbs_and_sahens.extend(ad.pick_verbs(text))
            verbs_and_sahens.extend(ad.pick_sahens(text))
    ranked = to_ranked_items(verbs_and_sahens)
    return ad_template.render(items=ranked)
예제 #27
0
 def find_related_action_words_from_clueweb(self):
     """Search ClueWeb with the solr query and harvest key phrases from
     the returned texts."""
     self.set_solr_query()
     # clue_web_search returns e.g. ['大学入学', 'aaaa', ...]
     for text in self.clue_web_search(self.solr_query):
         placeholder_page = WebPage('unknown')
         self.add_to_results_if_key_phrase_present(text, placeholder_page)
예제 #28
0
def yahoo_sponsored_results():
    """Fetch Yahoo! sponsored-search ads for the submitted query and render
    the verbs / sahen-nouns found in them, ranked by frequency."""
    query = request.forms.decode().get('query')
    # A whole sentence draws sponsored ads more reliably than a
    # word-split query.
    url = ('http://search.yahoo.co.jp/search/ss?p=' + query
           + '&ei=UTF-8&fr=top_ga1_sa&type=websearch&x=drt')
    ads = WebPage(url).fetch_ads()
    verbs_and_sahens = []
    for ad in ads:
        for text in (ad.title, ad.snippet):
            verbs_and_sahens.extend(ad.pick_verbs(text))
            verbs_and_sahens.extend(ad.pick_sahens(text))
    ranked = to_ranked_items(verbs_and_sahens)
    return ad_template.render(items=ranked)
예제 #29
0
    def test_slice_after_dots(self):
        """slice_after_dots should keep only the text after the last 、/。."""
        page = WebPage()
        cases = [
            ('あいうえお、かきくけこさしすせそ', 'かきくけこさしすせそ'),
            ('あいうえお、かきくけこ、さしすせそ', 'さしすせそ'),
            ('あいうえお、かきくけこ。さしすせそ', 'さしすせそ'),
            ('あいうえお。かきくけこ、さしすせそ', 'さしすせそ'),
        ]
        for sentence, expected in cases:
            self.assertEqual(page.slice_after_dots(sentence), expected)
예제 #30
0
 def test_set_line_nums_around_action_word(self):
     """Line numbers adjacent to a hit line should be collected as well."""
     page = WebPage()
     page.lines = ['aa', 'bbbb', 'ccccc', 'ddddd', 'aaaaa', 'eeeee']
     page.set_line_nums_with_word('a')
     page.set_line_nums_around_action_word()
     expected = {0, 1, 3, 4, 5}
     self.assertEqual(page.line_nums_around_action_word, expected)
예제 #31
0
 def test_set_descendants_with_simple_html(self):
     # set_descendants should pull the <h2> section out of the h1 node's
     # body as its single child node.
     h1_node = Node('bbbbb<h2>cccc</h2>dddd', 'h1')
     h1_node.set_descendants()
     self.assertEqual(h1_node.children[0].html_body, 'dddd')
     self.assertEqual(h1_node.children[0].heading_type, 'h2')
     self.assertEqual(h1_node.children[0].heading_title, 'cccc')
     self.assertEqual(len(h1_node.children), 1)
     # NOTE(review): this trailing assignment looks unrelated to the
     # assertions above -- possibly leftover fixture code; confirm intent.
     self.naver_hay_fever_page = WebPage(
         'http://matome.naver.jp/topic/1LzuV')
예제 #32
0
 def test_05_web(self):
     """Baidu.com"""
     # Drive the embedded browser: tap the baidu button in native context,
     # then perform a search inside the webview.
     switch_to_native(self.driver)
     web_po = WebPage(self.driver)
     web_po.baidu_button()
     sleep(3)  # crude page-load wait -- consider an explicit wait instead
     insert_img(self.driver, 'baidu')  # screenshot for the report
     switch_to_webview(self.driver)
     web_po.search_input('macaca')
     sleep(3)
     web_po.search_button()
예제 #33
0
 def test_mashou_sentence(self):
     """A '〜ましょう' sentence should yield object + dictionary-form verb."""
     page = WebPage('http://home.e05.itscom.net/mizuki/masako/bedmake.htm')
     page.text = '1.トイレの便座も一度拭きましょう!'
     page.set_sentences_from_text()
     page.set_tasks_from_sentences()
     first_task = page.tasks[0]
     self.assertEqual(first_task.object_term.name, 'トイレの便座')
     self.assertEqual(first_task.predicate_term, '拭く')
예제 #34
0
def scrape_from_nanapi():
    """Search nanapi for the posted query and render the tasks scraped
    from each result article."""
    query = request.form['query']
    search_url = 'http://nanapi.jp/search/q:' + query
    search_result_page = WebPage(search_url)
    search_result_page.fetch_html()
    article_urls = search_result_page.find_urls_from_nanapi_search_result()
    tasks = []
    for article_url in article_urls:
        # Each URL points at a single nanapi article.
        article_page = WebPage(article_url)
        article_page.fetch_html()
        # A task groups its steps by heading,
        # e.g. task_steps[0].h2 => 'はじめに', task_steps[0].h3s[0] => 'はじめに'
        tasks.append(article_page.find_task_from_nanapi_with_headings())
    # tasks => [task, task, ...]; tasks[0][0].h2 => 'はじめに'
    return render_template('nanapi_tasks.tmpl', tasks=tasks)
예제 #35
0
def search_in_clueweb_with_expanded_query():
    """Expand (action_word, hint_word) with related action words mined via
    Google, query ClueWeb (Solr) once per expanded query, and render the
    line clusters around the action word from every result page."""
    search_engine = SearchEngine()
    search_engine.action_word = request.form['action_word']
    search_engine.hint_word = request.form['hint_word']
    search_engine.find_related_action_words_with_google()
    search_engine.count_action_words()
    search_engine.sort_action_words_count()
    search_engine.pick_sorted_action_words_more_than_1_count()
    results = []
    for elem in search_engine.sorted_action_words_more_than_1_count:
        elem['expanded_query'] = search_engine.action_word + ' ' + search_engine.hint_word + ' ' + elem['word']
        url = 'http://karen.dl.local:8983/solr/ClueWeb09ja/select?q=' + elem['expanded_query'] + '&wt=xml'
        web_page = WebPage(url)
        web_page.fetch_xml()
        web_page.pick_texts_to_result_pages()
        # One result XML page exists per query; every hit inside it becomes
        # a WebPage object stored on web_page.result_pages.
        for result_page in web_page.result_pages:
            # result_page.text_body holds the raw text of one hit.
            result_page.set_lines_from_texts()
            result_page.set_line_nums_with_word(search_engine.action_word)
            result_page.set_line_nums_around_action_word()
            result_page.set_line_clusters_around_action_word()
        # web_page.result_pages[0].line_clusters_around_action_word
        results.append({'pages': web_page.result_pages, 'expanded_query': elem['expanded_query']})
    return render_template('search_in_clueweb_with_expanded_query.tmpl',
        results=results)
예제 #36
0
def find_matched_words_from_yahoo_ads():
    """Mine Yahoo! sponsored-search ads for characteristic ('〜なら') words
    and bracketed words, filter out stop words, and render the result."""
    query = request.form['query']
    # A whole sentence draws sponsored ads more reliably than a
    # word-split query.
    head = 'http://search.yahoo.co.jp/search/ss?p='
    tail = '&ei=UTF-8&fr=top_ga1_sa&type=websearch&x=drt'
    url = head + query + tail
    y_ad_page = WebPage(url)
    y_ad_page.fetch_html()
    y_ad_page.fetch_ads()
    naradeha_results = []
    bracket_words = []
    for ad in y_ad_page.ads:
        ad.fetch_link_title()
        naradeha_results.extend(ad.pick_characteristic_words())
        bracket_words.extend(ad.pick_bracket_words())
    # naradeha_results => [{'なら': {'before': ['。', 'あの', '今石洋之']}}]
    # bracket_words => ['アスコルビン酸', 'メルトダウン']

    # NOTE(review): '公式' appears twice, and the loop below re-adds
    # '0'..'9' although '0'..'3' are already listed -- harmless for
    # membership checks, but redundant.
    stop_words = ['公式', '楽天', '当日', 'お急ぎ便', 'ココ', 'ここ', 'これ', 'コレ', 'こちら', '公式', '購入', '人気', '詳細', '送料無料', '配送無料', '価格', '激安', '無料', 'アマゾン', 'ヤフオク', '0', '1', '2', '3']
    for num in range(0, 10):
        stop_words.append(str(num))
    results = naradeha_words_to_results(naradeha_results, stop_words)

    # Keep only bracketed words that contain no stop word as a substring.
    for bracket_word in bracket_words:
        is_including_stop_word = False
        for stop_word in stop_words:
            if stop_word in bracket_word:
                is_including_stop_word = True
                break
        if is_including_stop_word:
            continue
        results.append(bracket_word)

    return render_template('words.tmpl', words=results)
예제 #37
0
    def google_search(self, query, num):
        """Fetch up to ``num`` result pages from Google Custom Search for
        ``query`` and return them as WebPage objects with ``title`` and
        ``snippet`` populated.
        """
        url = "https://www.googleapis.com/customsearch/v1?"
        params = {
            "key": self.google_api_key,
            "q": query,
            "cx": "013036536707430787589:_pqjad5hr1a",
            "alt": "json",
            "lr": "lang_ja",
        }
        start = 1
        items = []

        for _ in range(num):
            params["start"] = start
            request_url = url + urllib.parse.urlencode(params)
            # Catch only the errors this request can realistically raise;
            # the original bare ``except`` also hid programming errors.
            try:
                response = urllib.request.urlopen(request_url)
                json_body = json.loads(response.read().decode("utf-8"))
                items.extend(json_body["items"])
                if "nextPage" not in json_body["queries"]:
                    break
                start = json_body["queries"]["nextPage"][0]["startIndex"]
            except (urllib.error.URLError, ValueError, KeyError):
                # BUG FIX: the old ``extend`` spread the dict's KEYS (plain
                # strings) into ``items``, which the str-guard below then
                # silently discarded; append one real placeholder dict with
                # 'snippet' included because it is read below.
                items.append({"link": "#", "title": "検索できませんでした",
                              "snippet": ""})
        pages = []
        for item in items:
            # Defensive: skip any stray strings among the result dicts.
            if isinstance(item, str):
                continue
            page = WebPage(item["link"])
            page.title = item["title"]
            page.snippet = item["snippet"]
            pages.append(page)
        # pages[0].link => 'http://...'; .title / .snippet filled in.
        return pages
예제 #38
0
def query_expansion():
    """Expand the user's query with verbs/sahen-nouns mined from Yahoo!
    sponsored-search ads, re-search with each expanded query, and render
    up to ~20 unique result pages."""
    query = request.forms.decode().get('query')
    # A whole sentence draws sponsored ads more reliably than a
    # word-split query.
    head = 'http://search.yahoo.co.jp/search/ss?p='
    tail = '&ei=UTF-8&fr=top_ga1_sa&type=websearch&x=drt'
    url = head + query + tail
    y_ad_page = WebPage(url)
    ads = y_ad_page.fetch_ads()
    v_and_s = []
    for ad in ads:
        v_and_s.extend(ad.pick_verbs(ad.title))
        v_and_s.extend(ad.pick_sahens(ad.title))
        v_and_s.extend(ad.pick_verbs(ad.snippet))
        v_and_s.extend(ad.pick_sahens(ad.snippet))
    ranked_items = to_ranked_items(v_and_s)
    # 'まとめ' is forced in as the first expansion candidate.
    ranked_items.insert(0, {'name': 'まとめ', 'count': 100})
    normalized_query = normalize_query(query)
    query_words = normalized_query.split(' ')
    page_set = set()  # a set, to de-duplicate result pages
    expanded_queries = []
    for item in ranked_items:
        top_5 = []
        if item['name'] in query_words:
            # e.g. searching '花粉症 対策' can mine '対策' right back out
            # of the ads; skip words already in the query.
            continue
        else:
            expanded_query = normalized_query + ' ' + item['name']
            expanded_queries.append(expanded_query)
            new_pages = search(expanded_query, 1)
            # NOTE(review): [0:4] keeps only 4 pages despite the name top_5.
            top_5 = new_pages[0:4]
        over_19 = False
        for one in top_5:
            page_set.add(one)
            if len(page_set) > 19:
                over_19 = True
                break
        if over_19 is True:
            break  # also escape the loop over ranked_items
    return expand_template.render(pages=page_set, queries=expanded_queries)
예제 #39
0
def query_expansion():
    """Expand the user's query with verbs/sahen-nouns mined from Yahoo!
    sponsored-search ads, re-search with each expanded query, and render
    up to ~20 unique result pages."""
    query = request.forms.decode().get('query')
    # A whole sentence draws sponsored ads more reliably than a
    # word-split query.
    head = 'http://search.yahoo.co.jp/search/ss?p='
    tail = '&ei=UTF-8&fr=top_ga1_sa&type=websearch&x=drt'
    url = head + query + tail
    y_ad_page = WebPage(url)
    ads = y_ad_page.fetch_ads()
    v_and_s = []
    for ad in ads:
        v_and_s.extend(ad.pick_verbs(ad.title))
        v_and_s.extend(ad.pick_sahens(ad.title))
        v_and_s.extend(ad.pick_verbs(ad.snippet))
        v_and_s.extend(ad.pick_sahens(ad.snippet))
    ranked_items = to_ranked_items(v_and_s)
    # 'まとめ' is forced in as the first expansion candidate.
    ranked_items.insert(0, {'name': 'まとめ', 'count': 100})
    normalized_query = normalize_query(query)
    query_words = normalized_query.split(' ')
    page_set = set()  # a set, to de-duplicate result pages
    expanded_queries = []
    for item in ranked_items:
        top_5 = []
        if item['name'] in query_words:
            # e.g. searching '花粉症 対策' can mine '対策' right back out
            # of the ads; skip words already in the query.
            continue
        else:
            expanded_query = normalized_query + ' ' + item['name']
            expanded_queries.append(expanded_query)
            new_pages = search(expanded_query, 1)
            # NOTE(review): [0:4] keeps only 4 pages despite the name top_5.
            top_5 = new_pages[0:4]
        over_19 = False
        for one in top_5:
            page_set.add(one)
            if len(page_set) > 19:
                over_19 = True
                break
        if over_19 is True:
            break  # also escape the loop over ranked_items
    return expand_template.render(pages=page_set, queries=expanded_queries)
예제 #40
0
 def test_mashou_sentence(self):
     """A '〜ましょう' sentence should yield object + dictionary-form verb."""
     page = WebPage('http://home.e05.itscom.net/mizuki/masako/bedmake.htm')
     page.text = '1.トイレの便座も一度拭きましょう!'
     page.set_sentences_from_text()
     page.set_tasks_from_sentences()
     first_task = page.tasks[0]
     self.assertEqual(first_task.object_term.name, 'トイレの便座')
     self.assertEqual(first_task.predicate_term, '拭く')
예제 #41
0
 def test_instance_of_task_clusters_exclude_part_of(self):
     # Tasks that co-occur on one page (page_1's 'ヨドバシカメラ_行く' with
     # 'お金_払う') are part-of material and must be excluded; only the
     # task repeated alone across pages 2-3 ('神社_お参りする') should come
     # back as an instance-of cluster.
     page_1 = WebPage(url='somewhere', query='チョコレート 食べる')
     page_1.text = 'ヨドバシカメラに行く必要があります。お金を払ってください。' \
                   'ヨドバシカメラに行く必要があります。お金を払ってください。'
     page_2 = WebPage(url='elsewhere', query='チョコレート 食べる')
     page_2.text = '神社にお参りしてください。'
     page_3 = WebPage(url='anywhere', query='チョコレート 食べる')
     page_3.text = '神社にお参りしてください。'
     page_4 = WebPage(url='where', query='チョコレート 食べる')
     page_4.text = 'お金を払いましょう'
     graph = self.build_graph([page_1, page_2, page_3, page_4])
     answerer = TaskGraphFirstAnswerer(graph=graph, query_task='チョコレート_食べる')
     answerer.set_result_tasks()
     i_clusters = answerer.instance_of_task_clusters
     self.assertEqual(i_clusters, [{'神社_お参りする'}])
예제 #42
0
파일: ganji.py 프로젝트: 61--/crawler
class Ganji(object):
    def __init__(self):
        self.web_page = WebPage()
        pass

    def view_person_all(self, page=30):
        for i in range(page):
            page_url = r'http://bj.ganji.com/fang1/haidian/a1o{0}m1/'.format(
                i + 1)
            r = self.web_page.get(page_url)
            urls = module.get_post_urls(r.text)
            print urls
            for url in urls:
                r = self.web_page.get(url)
                if module.check_useful(r.text):
                    module.show_link(url)

    def _get_page(self, url):
        r = self.web_page.get(url)
        if 'confirm' in r.url:
            pass
        return r
        pass
예제 #43
0
    def test_combine_nouns(self):
        """Adjacent nouns should be merged into a single compound noun."""
        page = WebPage()

        combined = page.combine_nouns(page.to_m_words('親子決戦試合'))
        self.assertEqual(combined[0].name, '親子決戦試合')

        combined = page.combine_nouns(page.to_m_words('そして勝敗決定戦に'))
        self.assertEqual(combined[1].name, '勝敗決定戦')
예제 #44
0
 def test_set_clusters_around_action_word(self):
     """Consecutive marked line numbers should be grouped into clusters."""
     page = WebPage()
     page.lines = list('abcdefghijkl')
     page.line_nums_around_action_word = {0, 1, 3, 4, 5, 9, 10}
     page.set_line_clusters_around_action_word()
     self.assertEqual(page.line_clusters_around_action_word,
                      [['a', 'b'], ['d', 'e', 'f'], ['j', 'k']])
예제 #45
0
 def setUp(self):
     """Create three throwaway WebPages carrying tasks and build the graph."""
     fixtures = [
         ('http://aaa.com', '医師に質問してください。'),
         ('http://bbb.com', '看護師に質問してください。'),
         ('http://ccc.com', '理学療法士に質問してください。'),
     ]
     pages = []
     for page_url, page_text in fixtures:
         page = WebPage(url=page_url, query='職業 質問する')
         page.text = page_text
         pages.append(page)
     self.graph = self.build_graph(pages)
예제 #46
0
 def bing_search(self, query, num):
     """Search Bing (Azure datamarket) up to ``num`` pages deep and return
     the hits as WebPage objects (title/snippet filled in).
     """
     key = self.microsoft_api_key
     url = "https://api.datamarket.azure.com/Bing/Search/Web?"
     json_param = "&$format=json"
     param = {"Query": query}
     request_url = url + urllib.parse.urlencode(param) + json_param
     items = []
     for _ in range(num):
         # Catch only plausible request/shape errors instead of a bare
         # ``except`` that also hides programming errors.
         try:
             json_body = requests.get(request_url, auth=(key, key), headers={"User-Agent": "My API Robot"}).json()
             items.extend(json_body["d"]["results"])
             request_url = json_body["d"]["__next"]
         except (requests.RequestException, KeyError, ValueError):
             # BUG FIX: the old ``extend`` spread the dict's KEYS (strings)
             # into ``items``, which the guard below then discarded; append
             # one real placeholder dict with 'Description' included.
             items.append({"Url": "#", "Title": "検索できませんでした",
                           "Description": ""})
     pages = []
     for item in items:
         # isinstance is the idiomatic type check (was ``type(item) == str``).
         if isinstance(item, str):
             continue
         page = WebPage(item["Url"])
         # Keep attribute names consistent with google_search.
         page.title = item["Title"]
         page.snippet = item["Description"]
         pages.append(page)
     return pages
예제 #47
0
 def clue_web_search(self, query):
     """Run a ClueWeb (Solr) search for ``query`` and return the hit texts.

     Adjust rows=50 to change how many results come back.
     """
     row_options = '&rows=50'
     request_url = (constants.CLUE_WEB_URL_HEAD + query
                    + row_options + constants.CLUE_WEB_URL_TAIL)
     result_page = WebPage(request_url)
     result_page.fetch_xml()
     result_page.pick_texts()
     return result_page.texts
예제 #48
0
def start_check():
    """Check the UK visa appointment site for available dates and email them.

    Logs in, navigates to the booking pages, queries the available dates,
    and emails a summary of the first few dates' time slots when any are
    found.  All errors are reported to stdout instead of being raised.

    NOTE(review): relies on a module-level session/handler ``s`` and on
    several module-level constants and helpers defined elsewhere in this
    file — confirm they are initialized before this runs.
    """
    try:
        # 1. Open login page
        login_page = WebPage(url="%s%s" % (UKVISA_BASE_URL, LOGIN_PATH),
                             handler=s)

        login_form_data = get_login_form_data(login_page.content())

        # 2. Post login form and login :)
        login_form = create_login_form(login_form_data)
        res = login_form.submit().response()

        # 3. Open book appointment page
        book_appointment_page = WebPage(url=get_book_appointment_page_url(),
                                        handler=s)

        get_appointment_location_form = WebForm(
            action_url="%s%s" % (UKVISA_BASE_URL, APPOINTMENT_LOCATION_PATH),
            form_data=get_appointment_loc_form_data(
                book_appointment_page.content()),
            handler=s)

        res = get_appointment_location_form.submit().response()

        schedule_appointment_form_data = get_appointment_schedule_form_data(
            res.text)

        # Station id identifies the visa centre for the slot query below.
        post_id = schedule_appointment_form_data['EnrolmentStationId']

        available_dates = get_available_dates()

        print("\n" + datetime.datetime.now().strftime("%I:%M%p on %B %d, %Y"))
        if len(available_dates) > 0:
            # Only the first 7 dates are queried for their time slots.
            appointment_data = {
                date: get_available_time_slots(post_id=post_id,
                                               appointment_date=date)
                for date in available_dates[:7]
            }
            print('Available dates found, generating message to be emailed')
            send_email(create_message(appointment_data))
        else:
            print('No available dates for %s...' % (VISA_CENTRE))

        #TODO: Need to figure out the captcha after submitting the form
        # submit_schedule_appointment_form(schedule_appointment_form_data);

    except Exception as e:
        print('Error checking visa appointment', e)
예제 #49
0
    def test_slice_after_dots(self):
        """slice_after_dots keeps only the text after the last punctuation mark."""
        page = WebPage()
        cases = [
            ('あいうえお、かきくけこさしすせそ', 'かきくけこさしすせそ'),
            ('あいうえお、かきくけこ、さしすせそ', 'さしすせそ'),
            ('あいうえお、かきくけこ。さしすせそ', 'さしすせそ'),
            ('あいうえお。かきくけこ、さしすせそ', 'さしすせそ'),
        ]
        for sentence, expected in cases:
            self.assertEqual(page.slice_after_dots(sentence), expected)
예제 #50
0
def yahoo_sponsored_results():
    """Render ranked nouns/verbs harvested from Yahoo! sponsored-search ads.

    Reads the query from the POSTed form, fetches Yahoo's sponsored search
    results page, extracts the ads, and ranks the nouns and verbs found in
    their titles and snippets.
    """
    query = request.form['query']
    # Yahoo sponsored search shows more ads for a full sentence than for
    # space-separated words, so the query is passed through unmodified.
    head = 'http://search.yahoo.co.jp/search/ss?p='
    tail = '&ei=UTF-8&fr=top_ga1_sa&type=websearch&x=drt'
    y_ad_page = WebPage(head + query + tail)
    y_ad_page.fetch_html()
    y_ad_page.fetch_ads()
    # Fix: dropped the unused key_phrases_of_ads list, the unused
    # SearchEngine instance, and the dead commented-out code that
    # referenced them.
    result_words = []
    for ad in y_ad_page.ads:
        result_words.extend(ad.pick_nouns_and_verbs(ad.title))
        result_words.extend(ad.pick_nouns_and_verbs(ad.snippet))
    results = to_ranked_items(result_words)
    return render_template('find_words_with_yahoo_ads.tmpl',
        items=results)
예제 #51
0
def free_scraping_results():
    """Scrape the URL submitted via the form and render whatever was picked."""
    target_url = request.forms.decode().get('url')
    scraped_items = WebPage(target_url).pick_something()
    return free_scraping_template.render(items=scraped_items)
from bing_api import Bing
import os
import constants
from web_page import WebPage

if __name__ == '__main__':
    # Fetch the first NUM_OF_FETCHED_PAGES Bing results for QUERY and save
    # each page's HTML under FETCHED_PAGES_DIR_NAME.
    bing = Bing()
    if not os.path.exists(constants.FETCHED_PAGES_DIR_NAME):
        os.mkdir(constants.FETCHED_PAGES_DIR_NAME)
    os.chdir(constants.FETCHED_PAGES_DIR_NAME)
    results = bing.web_search(query=constants.QUERY,
            num_of_results=constants.NUM_OF_FETCHED_PAGES,
            keys=['Url'])
    for i, result in enumerate(results):
        page = WebPage(result['Url'])
        page.fetch_html()
        # Fix: use a context manager so the file is closed even when
        # write() raises; the original leaked the handle on failure.
        with open('%s_%s.html' % (constants.QUERY, str(i)), 'w') as f:
            f.write(page.html_body)
예제 #53
0
from bing_api import Bing
import os
import constants
from web_page import WebPage

if __name__ == '__main__':
    # Fetch the first NUM_OF_FETCHED_PAGES Bing results for QUERY and save
    # each page's HTML under FETCHED_PAGES_DIR_NAME.
    bing = Bing()
    if not os.path.exists(constants.FETCHED_PAGES_DIR_NAME):
        os.mkdir(constants.FETCHED_PAGES_DIR_NAME)
    os.chdir(constants.FETCHED_PAGES_DIR_NAME)
    results = bing.web_search(query=constants.QUERY,
                              num_of_results=constants.NUM_OF_FETCHED_PAGES,
                              keys=['Url'])
    for i, result in enumerate(results):
        page = WebPage(result['Url'])
        page.fetch_html()
        # Fix: use a context manager so the file is closed even when
        # write() raises; the original leaked the handle on failure.
        with open('%s_%s.html' % (constants.QUERY, str(i)), 'w') as f:
            f.write(page.html_body)
예제 #54
0
 def test_set_line_nums_with_word(self):
     """Indices of every line containing the word are recorded as a set."""
     page = WebPage()
     page.lines = ['abc', 'bcd', 'cde']
     page.set_line_nums_with_word('b')
     self.assertEqual(page.line_nums_with_action_word, {0, 1})
예제 #55
0
 def test_set_line_nums_around_action_word(self):
     """Neighbouring line indices of each action-word line are collected."""
     page = WebPage()
     page.lines = ['aa', 'bbbb', 'ccccc', 'ddddd', 'aaaaa', 'eeeee']
     page.set_line_nums_with_word('a')
     page.set_line_nums_around_action_word()
     self.assertEqual(page.line_nums_around_action_word, {0, 1, 3, 4, 5})