Example #1
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--num_pages', type=int)
    parser.add_argument('--update_tag_db', '-u', help='Upsert scraped data to database',
                        required=False, action='store_true')
    args = parser.parse_args()

    db = Database()
    if db.is_empty('Tag') or args.update_tag_db:
        tag_parser = PageParser("tag")
        tags = tag_parser.get_pages(Tag, MAX)
        db.update_tag_table(tags)

    print("Getting Question Summaries...")
    summary_parser = PageParser("question_summary")
    summaries = summary_parser.get_pages(QuestionSummary, args.num_pages)

    print("Getting Articles...")
    article_parser = APIParser()
    articles = article_parser.get_responses(Article, summaries)

    # Enrich each question summary with its list of articles
    for question_summary, articles_list in zip(summaries, articles):
        question_summary.add_articles(articles_list)

    print("Populating DB...")
    db.insert_question_summaries(summaries)
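The zip-based enrichment step above assumes get_responses returns one article list per summary, in the same order as summaries. A minimal, self-contained sketch of that pairing with plain dictionaries (illustrative data only, not the project's classes):

# Pair each summary with the article list produced for it, by position.
summaries = [{"id": 1, "articles": []}, {"id": 2, "articles": []}]
articles = [["a1", "a2"], ["a3"]]

for summary, articles_list in zip(summaries, articles):
    summary["articles"].extend(articles_list)

print(summaries)  # [{'id': 1, 'articles': ['a1', 'a2']}, {'id': 2, 'articles': ['a3']}]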
Example #2
    def test_url_read(self):
        """
        测试了三个场景:
        使用标准url
        使用无效url
        使用其他格式的url文档,如jpg
        :return:
        """
        url1 = 'localhost:8081/page1.html'
        parser = PageParser(url1)
        content1 = parser.url_read()
        self.assertIn('page1_4.html', content1)
        self.assertIn('page1_1.html', content1)

        #invalid url test
        url2 = 'localhost:8081/page7.html'
        parser = PageParser(url2)
        content2 = parser.url_read()
        self.assertEqual(content2, '', "return content should be empty")

        # Unsupported url type (e.g. an image); an error should be logged.
        url3 = 'localhost:8081/3/image.jpg'
        parser = PageParser(url3)
        # assertLogs is a context manager and takes a logger name, not a log
        # file path; the logger name 'spider' here is an assumption.
        with self.assertLogs('spider', level='ERROR'):
            content3 = parser.url_read()
        self.assertEqual(content3, '')
Example #3
def handle_link_shared(event):
    unfurls = {}
    for link in event.get('links'):
        url = link.get('url')
        origin = requests.get(url)
        p = PageParser()
        p.feed(origin.text)
        p.close()

        if p.content_type == PageParser.IMAGE:
            unfurls[url] = {
                'text': 'image',
                'image_url': p.content
            }
    response = requests.post('https://slack.com/api/chat.unfurl',
                             json={
                                 'token': WEB_API_TOKEN,
                                 'channel': event.get('channel'),
                                 'ts': event.get('message_ts'),
                                 'unfurls': unfurls
                             },
                             headers={
                                 'Content-type': 'application/json;charset=utf-8',
                                 'Authorization': 'Bearer %s' % WEB_API_TOKEN
                             })

    print('unfurl %s' % response.text)
    return 'Done'
Example #4
    def test_follow_succeeded(self):
        driver = self.driver

        button_getter = ButtonGetter(driver)
        page_parser = PageParser(driver)

        follow_buttons = button_getter.get_follow_buttons()

        followed_title = follow_buttons[FOLLOWING_INDEX].get_attribute(
            'data-tumblelog-name')
        follow_buttons[FOLLOWING_INDEX].send_keys(Keys.RETURN)  #click

        self.driver.get(URL + "/following")

        assert followed_title in page_parser.get_following()

        button_getter.get_unfollow_button(followed_title).click()

        time.sleep(1)

        button_getter.get_ok_button().click()

        time.sleep(1)

        self.driver.refresh()

        assert followed_title not in page_parser.get_following()
Example #5
    def __init__(self, month, year, threshold):
        self.const_one_month_in_days = 30

        self.article_creation_threshold = threshold
        self.threshold_very_active = 100
        self.threshold_active = 5
        self.const_very_active = "Very Active"
        self.const_active = "Active"
        self.const_not_active = "Not Active"
        self.month = month
        self.year = year

        self.const_max_requests = 500
        self.url_userinfo = "https://en.wikipedia.org/w/api.php?action=query&format=json&list=users"
        self.url_usercontb = "https://en.wikipedia.org/w/api.php?action=query&format=json&list=usercontribs&"
        self.url_propages = "https://en.wikipedia.org/w/api.php?action=query&format=json&list=prefixsearch&"
        self.url_contributors = "https://en.wikipedia.org/w/api.php?action=query&format=json&prop=contributors&"

        self.debug = False

        self.list_editors_sorted = []
        self.editors_WIR = {}
        self.dict_editor_article = {}
        self.dict_editor_infoboxes = {}

        self.set_members = set()
        self.parser = PageParser()
        self.tabler = TableGenerator()
Example #6
 def parse_file(self, url, page_file):
     try:
         self.children = set()
         parser = PageParser(self, url)
         parser.parse_links(page_file)
     finally:
         page_file.close()
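If page_file is an ordinary file object, the try/finally in the snippet above can also be written with a with-block, which closes the file on any exit path; a small sketch under that assumption:

def parse_file(self, url, page_file):
    # Close-on-exit is guaranteed by using the file object as a context manager.
    self.children = set()
    with page_file:
        parser = PageParser(self, url)
        parser.parse_links(page_file)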
Example #7
def get_new_state(driver, old_state):
    parser = PageParser(driver)
    state_elements = []
    live_elements = parser.get_all_elements()
    #first look for the old elements that are still present
    for element in old_state.elements:
        webelement = WebElement(driver, element.locators)
        if webelement.is_present(1):
            webelement.highlight(color="green")
            if webelement in live_elements:
                state_elements.append(element)
                live_elements.remove(webelement)

    # Look for any changed elements
    for element in live_elements:
        element.highlight(color="blue")
        new_element_state = element_builder.build_element(driver, element)
        if new_element_state is None:
            logging.error("No locators found for element %s" % element)
        else:
            state_elements.append(new_element_state)

    return State(elements=state_elements,
                 url=driver.current_url,
                 html=driver.html,
                 screenshot=driver.get_screenshot_as_base64())
Example #8
 def __init__(self, base_url):
     self._base_url = base_url
     self._page_parser = PageParser()
     try:
         self._mongo_wrapper = MongoWrapper()
     except DBConnectionError as ex:
         logging.error(f"Couldn't connect to DB: {str(ex)}")
Example #9
 def get_info(self):
     parser = PageParser()
     # HTMLParser treats '/>' as the end of a tag, so '<br/>' must be replaced (here with 'BRBR'), otherwise parsing fails
     htmlcontent = self.html
     htmlcontent = re.compile('<br/>').sub('BRBR', htmlcontent)
     parser.feed(htmlcontent)
     finalparseurl = parser.getdata()
     # print finalparseurl
     return finalparseurl
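Since '<br/>' contains no regex metacharacters, the same preprocessing can be done with a plain string replace; a tiny equivalent sketch:

# Replace the fixed '<br/>' substring with the 'BRBR' placeholder before feeding
# the parser; str.replace() is equivalent to the compiled-regex sub() above.
html = 'line one<br/>line two<br/>'
print(html.replace('<br/>', 'BRBR'))  # line oneBRBRline twoBRBR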
Example #10
 def crawl(self):
     for conf in config_lists:
         for url in conf['urls']:
             resp = Downloader().download(url, conf)
             if resp:
                 proxy_list = PageParser().parse(resp, conf)
                 print(proxy_list)
                 print('Validating that the crawled proxies are usable')
                 valid_many(proxy_list, 'spider')
Example #11
 def parse(self, url, url_id):
     print("parsing " + url + "...", file=sys.stderr)
     volume_id = self.get_volume_id(url_id)
     reader = self.open_page(url_id, volume_id)
     if reader:
         try:
             parser = PageParser(self, url)
             parser.parse_links(reader)
         finally:
             reader.close()
Example #12
    def test_scroll_works(self):
        driver = self.driver

        action_handler = ActionHandler(driver)
        page_parser = PageParser(driver)

        for section_index in range(len(SECTIONS)):
            page_parser.get_dots()[section_index].click()
            time.sleep(3)
            action_handler.assert_active_section(SECTIONS, section_index)
Example #13
 def get_urls(cls):
     while len(Test.urls) > 0:
         url = Test.get_url()
         try:
             Test.count += 1
             print(Test.count, url)
             analysis = PageParser(url)
             test = analysis.get_urls()
             Test.urls += test
         except Exception:
             # Skip pages that fail to download or parse.
             pass
Example #14
def crawl_prxxy_by_pages(page_urls, queue):
    page_parser = PageParser.PageParser()
    for page_url in page_urls:
        headers = {'User-Agent': random.choice(constants.USER_AGENT)}
        r = requests.get(page_url, headers=headers)
        page_parser.set_html_doc(r.text)
        proxy_dict_list = page_parser.extract_proxy_urls()
        for proxy_dict in proxy_dict_list:
            if utils.check_https_proxy(proxy_dict):
                print('crawled a valid proxy:%s' %
                      utils.format_proxy_dict(proxy_dict))
                queue.put(proxy_dict)
Example #15
    def test_parse(self):
        """
        测试了三个场景:
        使用标准url
        使用无效url
        使用其他格式的url文档,如jpg
        :return:
        """
        url1 = 'localhost:8081/page1.html'
        expect_sub_url = 'localhost:8081/1/page1_1.html'
        parser = PageParser(url1)
        links = parser.parse()
        self.assertIn(expect_sub_url, links)

        url2 = 'localhost:8081/page7.html'
        parser = PageParser(url2)
        links = parser.parse()
        self.assertEqual(links, set())

        url3 = 'localhost:8081/3/image.jpg'
        parser = PageParser(url3)
        self.assertEqual(parser.parse(), set())
Example #16
    def test_show_community_info(self):
        driver = self.driver

        presence_checker = PresenceChecker(driver)
        page_parser = PageParser(driver)

        assert not presence_checker.is_there_drawer_container()
        assert not presence_checker.is_there_glass_container()

        followers_links = page_parser.get_follower_links()
        followers_links[FOLLOWING_INDEX].send_keys(Keys.RETURN)  #click

        assert presence_checker.is_there_drawer_container()
        assert presence_checker.is_there_glass_container()
Example #17
    def parse(self, url_id, url, volume_id):
        print("parsing " + url + "...", file=sys.stderr)
        reader = self.open_page(url_id, volume_id)
        if reader:
            try:
                parser = PageParser(self, url)
                parser.parse_links(reader)
            finally:
                reader.close()

        self.cur.execute(
            """update field
set parsed=localtimestamp
where id=%s""", (url_id, ))
Example #18
def dump():
    client = HttpClient()
    torrent_id = get_torrent_id()
    res = get_dump()
    new_records = []

    last_torrent_id = torrent_id
    direction = Direction.UP

    if direction == Direction.UP:
        increment = 1
    else:
        increment = -1

    i = 0
    failed = 0

    while run:
        last_torrent_id = last_torrent_id + increment
        print(last_torrent_id)
        link = 'http://rutor.is/torrent/' + str(last_torrent_id)

        response = client.get_response(link)
        if not response.has_error:
            parser = PageParser(last_torrent_id, response.response_text)
            valid = parser.is_valid()
            if valid:
                failed = 0
                torrent_info = parser.parse()
                if torrent_info.category == u'Зарубежные фильмы' or torrent_info.category == u'Наши фильмы':
                    res.append(torrent_info)
                    new_records.append(torrent_info)
            else:
                print(str(last_torrent_id) + ' is invalid')
                failed = failed + 1
                if failed == 10:
                    print('end of torrent list reached')
                    last_torrent_id = last_torrent_id - 10 - 1
                    break

        i = i + 1

        time.sleep(4)

    dump = json.dumps(res, cls=MyEncoder, ensure_ascii=False)
    save_dump(dump)
    save_history(last_torrent_id + increment)
    save_to_db(new_records)
    print('finished')
Example #19
def gen_docs():
    page_list = []
    with open(config.DATA_DIR + 'page_list.txt') as fin:
        for line in fin:
            page_list.append(line.rstrip())
    template_name = config.TEMPLATE_DIR + 'doutula.template'
    template_parser = TemplateParser(template_name)
    page_parser = PageParser(template_parser.xpath_list)
    for page_url in page_list[1104:]:
        info_list = page_parser.parse(page_url)
        if len(info_list) > 0:
            for docinfo in info_list:
                print(docinfo)
        else:
            print('page parse failed.')
Example #20
    def test_dismiss_succeeded(self):
        driver = self.driver

        button_getter = ButtonGetter(driver)
        page_parser = PageParser(driver)

        dismiss_buttons = button_getter.get_dismiss_buttons()
        dismiss_titles = page_parser.get_dismiss_titles()

        dismiss_buttons[FOLLOWING_INDEX].click()

        time.sleep(1)

        assert dismiss_titles[
            FOLLOWING_INDEX].text not in page_parser.get_dismiss_titles()
Example #21
def get_state(driver):
    parser = PageParser(driver)
    locator_elements = []
    elements = parser.get_usual_elements()[:50]
    print "Found %s elements " % len(elements)
    for element in elements:
        new_element = element_builder.build_element(driver, element)
        if new_element is not None:
            locator_elements.append(new_element)

    screenshot = driver.get_screenshot_as_base64()
    state = State(elements=locator_elements,
                  url=driver.current_url,
                  html=driver.html,
                  screenshot=screenshot)
    return state
Example #22
    def test_login_failed_with_wrong_email(self):
        driver = self.driver

        action_handler = ActionHandler(driver)
        page_parser = PageParser(driver)

        assert not page_parser.get_error_message().is_displayed()
        assert "Tumblr" in driver.title

        action_handler.click_login_button()
        action_handler.type_and_confirm_email(EMAIL + EMAIL_WRONG_APPENDIX)

        time.sleep(2)

        assert not page_parser.get_password_input_field().is_displayed()
        assert page_parser.get_error_message().is_displayed()
Example #23
def get_urls(url):
    global urls, counter
    try:
        # Optional: write each crawled url to a file, at the cost of extra I/O:
        # with open('url_list.txt','a') as test:
        # 	test.write(url + '\n')
        data.delete(url)
        print(url)
        analysis = PageParser(url)
        for i in analysis.get_urls():
            if data.check(i):
                data.delete(i)
            else:
                data.insert(i)
    except Exception:
        # Ignore pages that fail to download or parse.
        pass
Example #24
    def crawl(self, url_q):
        """
        spider的爬取逻辑, 调用page_retriever解析下载url, 将提取的子url返回来
        并进行去重,加到队列中
        :param url_q: 待解析的url地址,绝对路径
        :return:
        """
        if not isinstance(url_q, tuple):
            print("Type error")
            return

        if CrawlerThreadPool.interval_links_cnt > \
                ConfReader.instance().get_max_links_count():
            interval = ConfReader.instance().get_crawl_interval()
            if interval == 0:
                interval = 60 * 5  # default every 5 minutes

            logger.info("Thread %s begin to sleep, %d s later continue" %
                        (threading.currentThread().getName(), interval))
            print("Waiting for %d seconds ..." % interval)
            sleep(interval)

            # Reset the interval counter
            self._lock.acquire()
            CrawlerThreadPool.interval_links_cnt = 0
            self._lock.release()
        else:
            pass

        (url, depth) = url_q
        if depth > ConfReader.instance().get_max_depth():
            print("Depth exceed. The max depth is {}".format(depth - 1))
            return
        page_parser = PageParser(url)
        links = page_parser.parse()
        new_links = links.difference(CrawlerThreadPool.seen_urls)
        for new_link in new_links:
            self._q.put((new_link, depth + 1))

        # Update the link statistics
        self._lock.acquire()
        CrawlerThreadPool.total_links += len(new_links)
        CrawlerThreadPool.interval_links_cnt += len(new_links)
        print("Spider has crawled {} links.".format(
            CrawlerThreadPool.total_links))
        CrawlerThreadPool.seen_urls.update(new_links)
        self._lock.release()
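The acquire()/release() pairs above can also use the lock as a context manager (threading.Lock supports with), so the lock is released even if the counter update raises; a minimal, self-contained sketch:

import threading

lock = threading.Lock()
total_links = 0

def record_links(new_links):
    # The with-block acquires the lock and guarantees its release.
    global total_links
    with lock:
        total_links += len(new_links)

record_links(['http://example.com/a', 'http://example.com/b'])
print(total_links)  # 2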
Example #25
    def test_search_success(self):
        driver = self.driver

        action_handler = ActionHandler(driver)
        page_parser = PageParser(driver)

        assert not page_parser.get_search_results_container().is_displayed()

        action_handler.type_search_query(SEARCH_QEURY)

        time.sleep(1)

        assert page_parser.get_search_results_container().is_displayed()

        action_handler.confirm_search_query()

        assert len(page_parser.get_posts_content()) >= MIN_NUMBER_OF_POSTS
Example #26
    def test_reblog_succeeded(self):
        driver = self.driver

        button_getter = ButtonGetter(driver)
        presence_checker = PresenceChecker(driver)
        post_operator = PostOperator(driver)
        page_parser = PageParser(driver)

        post = post_operator.get_posts()[FIRST_POST_INDEX + FOLLOWING_INDEX]
        post_id = post_operator.get_post_id(post)

        old_posted_count = page_parser.get_posted_count()

        button_getter.get_reblog_button(post).click()

        time.sleep(1)

        assert presence_checker.is_there_post_modal_container()

        page_parser.get_reblog_text_field().send_keys(POST_NOTE)

        post_settings_button = button_getter.get_post_settings_button()

        assert not presence_checker.is_there_post_settings_dropdown()
        post_settings_button.click()

        time.sleep(1)

        assert presence_checker.is_there_post_settings_dropdown()
        post_settings_button.click()

        time.sleep(1)

        button_getter.get_reblog_applying_button().click()

        time.sleep(2)

        new_posted_count = page_parser.get_posted_count()

        assert old_posted_count + 1 == new_posted_count

        self.driver.get(
            page_parser.get_post_avatar_link().get_attribute('href'))

        assert presence_checker.is_there_reblogged_post(
            post_operator, post_id, POST_NOTE)
Example #27
    def test_like_succeeded(self):
        driver = self.driver

        post_operator = PostOperator(driver)
        button_getter = ButtonGetter(driver)
        presence_checker = PresenceChecker(driver)
        page_parser = PageParser(driver)

        time.sleep(2)

        posts = post_operator.get_posts()

        old_liked_count = page_parser.get_liked_count(URL)
        like_button = button_getter.get_like_button(posts[FIRST_POST_INDEX +
                                                          FOLLOWING_INDEX])
        post_id = post_operator.get_post_id(posts[FIRST_POST_INDEX +
                                                  FOLLOWING_INDEX])

        like_button.click()

        time.sleep(1)

        assert 'liked' in like_button.get_attribute('class')

        self.driver.get(URL + "/likes")

        assert presence_checker.is_there_post(post_operator, post_id)

        new_liked_count = page_parser.get_liked_count(URL)

        assert new_liked_count == old_liked_count + 1

        time.sleep(2)

        button_getter.get_like_button_by_post_id(post_operator,
                                                 post_id).click()

        driver.refresh()

        assert button_getter.get_like_button_by_post_id(
            post_operator, post_id) is None

        new_liked_count = page_parser.get_liked_count(URL)

        assert new_liked_count == old_liked_count
Example #28
    def test_creation_post_with_plain_text(self):
        driver = self.driver

        button_getter = ButtonGetter(driver)
        post_operator = PostOperator(driver)
        page_parser = PageParser(driver)
        action_handler = ActionHandler(driver)

        post_title = 'Hello world'

        post_body = 'Lorem ipsum dolor sit amet'

        post_hashes = '#test'

        posted_count_before_adding_post = page_parser.get_posted_count()

        button_getter.get_create_button().click()

        time.sleep(1)

        button_getter.get_post_type_selection_button('text').click()

        time.sleep(1)

        page_parser.get_text_post_title_input_field().send_keys(post_title)
        page_parser.get_text_post_description_input_field().send_keys(
            post_body)
        page_parser.get_text_post_tag_input_field().send_keys(post_hashes)

        action_handler.click_confirm_post()

        time.sleep(1)

        driver.refresh()

        posted_count_after_adding_post = page_parser.get_posted_count()

        assert posted_count_after_adding_post == posted_count_before_adding_post + 1

        time.sleep(1)

        button_getter.get_account_button().click()

        time.sleep(1)

        button_getter.get_posts_button().click()

        added_post = post_operator.get_text_post(post_title, post_body)

        assert added_post is not None

        post_operator.get_control_menu_button(added_post).click()

        time.sleep(0.5)

        post_operator.get_delete_button(added_post).click()

        time.sleep(1)

        button_getter.get_ok_button().click()

        time.sleep(1)

        driver.refresh()

        assert post_operator.get_text_post(post_title, post_body) is None

        posted_count_after_deleting_post = page_parser.get_posted_count()

        assert posted_count_after_adding_post == posted_count_after_deleting_post + 1
Example #29
    def crawl(self, url):
        pp = PageParser(url)
        pp.download_css_files()

        pp.extract_shop_info()
        pp.extract_comments()
Example #30
 def __init__(self, driver):
     self.page_parser = PageParser(driver)
     self.button_getter = ButtonGetter(driver)