Example #1
    def parse_file(self, url, page_file):
        try:
            self.children = set()
            parser = PageParser(self, url)
            parser.parse_links(page_file)
        finally:
            page_file.close()
Example #2
    def __init__(self, month, year, threshold):
        self.const_one_month_in_days = 30

        self.article_creation_threshold = threshold
        self.threshold_very_active = 100
        self.threshold_active = 5
        self.const_very_active = "Very Active"
        self.const_active = "Active"
        self.const_not_active = "Not Active"
        self.month = month
        self.year = year

        self.const_max_requests = 500
        self.url_userinfo = "https://en.wikipedia.org/w/api.php?action=query&format=json&list=users"
        self.url_usercontb = "https://en.wikipedia.org/w/api.php?action=query&format=json&list=usercontribs&"
        self.url_propages = "https://en.wikipedia.org/w/api.php?action=query&format=json&list=prefixsearch&"
        self.url_contributors = "https://en.wikipedia.org/w/api.php?action=query&format=json&prop=contributors&"

        self.debug = False

        self.list_editors_sorted = []
        self.editors_WIR = {}
        self.dict_editor_article = {}
        self.dict_editor_infoboxes = {}

        self.set_members = set()
        self.parser = PageParser()
        self.tabler = TableGenerator()
Example #3
    def test_follow_succeeded(self):
        driver = self.driver

        button_getter = ButtonGetter(driver)
        page_parser = PageParser(driver)

        follow_buttons = button_getter.get_follow_buttons()

        followed_title = follow_buttons[FOLLOWING_INDEX].get_attribute(
            'data-tumblelog-name')
        follow_buttons[FOLLOWING_INDEX].send_keys(Keys.RETURN)  #click

        self.driver.get(URL + "/following")

        assert followed_title in page_parser.get_following()

        button_getter.get_unfollow_button(followed_title).click()

        time.sleep(1)

        button_getter.get_ok_button().click()

        time.sleep(1)

        self.driver.refresh()

        assert followed_title not in page_parser.get_following()
Example #4
def get_new_state(driver, old_state):
    parser = PageParser(driver)
    state_elements = []
    live_elements = parser.get_all_elements()
    #first look for the old elements that are still present
    for element in old_state.elements:
        webelement = WebElement(driver, element.locators)
        if webelement.is_present(1):
            webelement.highlight(color="green")
            if webelement in live_elements:
                state_elements.append(element)
                live_elements.remove(webelement)

    #look for any changed elements
    for element in live_elements:
        element.highlight(color="blue")
        new_element_state = element_builder.build_element(driver, element)
        if new_element_state is None:
            logging.error("No locators found for element %s" % element)
        else:
            state_elements.append(new_element_state)

    return State(elements=state_elements,
                 url=driver.current_url,
                 html=driver.html,
                 screenshot=driver.get_screenshot_as_base64())
Example #5
    def __init__(self, base_url):
        self._base_url = base_url
        self._page_parser = PageParser()
        try:
            self._mongo_wrapper = MongoWrapper()
        except DBConnectionError as ex:
            logging.error(f"Couldn't connect to DB: {str(ex)}")
Example #6
    def test_scroll_works(self):
        driver = self.driver

        action_handler = ActionHandler(driver)
        page_parser = PageParser(driver)

        for section_index in range(len(SECTIONS)):
            page_parser.get_dots()[section_index].click()
            time.sleep(3)
            action_handler.assert_active_section(SECTIONS, section_index)
Example #7
    def parse(self, url, url_id):
        print("parsing " + url + "...", file=sys.stderr)
        volume_id = self.get_volume_id(url_id)
        reader = self.open_page(url_id, volume_id)
        if reader:
            try:
                parser = PageParser(self, url)
                parser.parse_links(reader)
            finally:
                reader.close()
Example #8
    def test_url_read(self):
        """
        测试了三个场景:
        使用标准url
        使用无效url
        使用其他格式的url文档,如jpg
        :return:
        """
        url1 = 'localhost:8081/page1.html'
        parser = PageParser(url1)
        content1 = parser.url_read()
        self.assertEqual(content1.__contains__('page1_4.html'), True)
        self.assertEqual(content1.__contains__('page1_1.html'), True)

        #invalid url test
        url2 = 'localhost:8081/page7.html'
        parser = PageParser(url2)
        content2 = parser.url_read()
        self.assertEqual(content2, '', "return content should be empty")

        #No support url test
        url3 = 'localhost:8081/3/image.jpg'
        parser = PageParser(url3)
        content3 = parser.url_read()
        self.assertEqual(content3, '')
        self.assertLogs(logger='../logs/spider.log', level='error')
Example #9
    def get_urls(cls):
        while len(Test.urls) > 0:
            url = Test.get_url()
            try:
                Test.count += 1
                print(Test.count, url)
                analysis = PageParser(url)
                test = analysis.get_urls()
                Test.urls += test
            except:
                pass
Example #11
    def test_show_community_info(self):
        driver = self.driver

        presence_checker = PresenceChecker(driver)
        page_parser = PageParser(driver)

        assert not presence_checker.is_there_drawer_container()
        assert not presence_checker.is_there_glass_container()

        followers_links = page_parser.get_follower_links()
        followers_links[FOLLOWING_INDEX].send_keys(Keys.RETURN)  #click

        assert presence_checker.is_there_drawer_container()
        assert presence_checker.is_there_glass_container()
Example #12
    def parse(self, url_id, url, volume_id):
        print("parsing " + url + "...", file=sys.stderr)
        reader = self.open_page(url_id, volume_id)
        if reader:
            try:
                parser = PageParser(self, url)
                parser.parse_links(reader)
            finally:
                reader.close()

        self.cur.execute(
            """update field
set parsed=localtimestamp
where id=%s""", (url_id, ))
Example #13
def dump():
    client = HttpClient()
    torrent_id = get_torrent_id()
    res = get_dump()
    new_records = []

    last_torrent_id = torrent_id
    direction = Direction.UP

    if direction == Direction.UP:
        increment = 1
    else:
        increment = -1

    i = 0
    failed = 0

    while run:
        last_torrent_id = last_torrent_id + increment
        print str(last_torrent_id)
        link = 'http://rutor.is/torrent/' + str(last_torrent_id)

        response = client.get_response(link)
        if not response.has_error:
            parser = PageParser(last_torrent_id, response.response_text)
            valid = parser.is_valid()
            if valid:
                failed = 0
                torrent_info = parser.parse()
                if torrent_info.category == u'Зарубежные фильмы' or torrent_info.category == u'Наши фильмы':
                    res.append(torrent_info)
                    new_records.append(torrent_info)
            else:
                print str(last_torrent_id) + ' is invalid'
                failed = failed + 1
                if failed == 10:
                    print 'end of torrent list reached'
                    last_torrent_id = last_torrent_id - 10 - 1
                    break

        i = i + 1

        time.sleep(4)

    dump = json.dumps(res, cls=MyEncoder, ensure_ascii=False)
    save_dump(dump)
    save_history(last_torrent_id + increment)
    save_to_db(new_records)
    print 'finished'
Example #14
    def test_dismiss_succeeded(self):
        driver = self.driver

        button_getter = ButtonGetter(driver)
        page_parser = PageParser(driver)

        dismiss_buttons = button_getter.get_dismiss_buttons()
        dismiss_titles = page_parser.get_dismiss_titles()

        dismiss_buttons[FOLLOWING_INDEX].click()

        time.sleep(1)

        assert dismiss_titles[
            FOLLOWING_INDEX].text not in page_parser.get_dismiss_titles()
Example #15
def gen_docs():
    page_list = []
    with open(config.DATA_DIR + 'page_list.txt') as fin:
        for line in fin:
            page_list.append(line.rstrip())
    template_name = config.TEMPLATE_DIR + 'doutula.template'
    template_parser = TemplateParser(template_name)
    page_parser = PageParser(template_parser.xpath_list)
    for page_url in page_list[1104: ]:
        info_list = page_parser.parse(page_url)
        if len(info_list) > 0:
            for docinfo in info_list:
                print docinfo
        else:
            print 'page parse fail.'
Example #17
    def test_login_failed_with_wrong_email(self):
        driver = self.driver

        action_handler = ActionHandler(driver)
        page_parser = PageParser(driver)

        assert not page_parser.get_error_message().is_displayed()
        assert "Tumblr" in driver.title

        action_handler.click_login_button()
        action_handler.type_and_confirm_email(EMAIL + EMAIL_WRONG_APPENDIX)

        time.sleep(2)

        assert not page_parser.get_password_input_field().is_displayed()
        assert page_parser.get_error_message().is_displayed()
Example #18
def get_urls(url):
    global urls, counter
    try:
        # optional: write each crawled url to a file, at the cost of extra I/O
        # with open('url_list.txt','a') as test:
        # 	test.write(url + '\n')
        data.delete(url)
        print(url)
        analysis = PageParser(url)
        for i in analysis.get_urls():
            if data.check(i):
                data.delete(i)
            else:
                data.insert(i)
    except:
        pass
Example #19
    def __init__(self, output_tar, error_tar, save):
        self.save = save
        self.output_tar = output_tar
        self.error_tar = error_tar
        self.parser = PageParser()
        self.lock = threading.Lock()
        self.items = {}
Example #20
def get_state(driver):
    parser = PageParser(driver)
    locator_elements = []
    elements = parser.get_usual_elements()[:50]
    print "Found %s elements " % len(elements)
    for element in elements:
        new_element = element_builder.build_element(driver, element)
        if new_element is not None:
            locator_elements.append(new_element)

    screenshot = driver.get_screenshot_as_base64()
    state = State(elements=locator_elements,
                  url=driver.current_url,
                  html=driver.html,
                  screenshot=screenshot)
    return state
Example #21
    def crawl(self, url_q):
        """
        spider的爬取逻辑, 调用page_retriever解析下载url, 将提取的子url返回来
        并进行去重,加到队列中
        :param url_q: 待解析的url地址,绝对路径
        :return:
        """
        if not isinstance(url_q, tuple):
            print("Type error")
            return

        if CrawlerThreadPool.interval_links_cnt > \
                ConfReader.instance().get_max_links_count():
            interval = ConfReader.instance().get_crawl_interval()
            if interval == 0:
                interval = 60 * 5  # default every 5 minutes

            logger.info("Thread %s begin to sleep, %d s later continue" %
                        (threading.currentThread().getName(), interval))
            print("Waiting for %d seconds ..." % interval)
            sleep(interval)

            # reset the interval counter
            self._lock.acquire()
            CrawlerThreadPool.interval_links_cnt = 0
            self._lock.release()
        else:
            pass

        (url, depth) = url_q
        if depth > ConfReader.instance().get_max_depth():
            print("Depth exceed. The max depth is {}".format(depth - 1))
            return
        page_parser = PageParser(url)
        links = page_parser.parse()
        new_links = links.difference(CrawlerThreadPool.seen_urls)
        for new_link in new_links:
            self._q.put((new_link, depth + 1))

        # update link-count statistics
        self._lock.acquire()
        CrawlerThreadPool.total_links += len(new_links)
        CrawlerThreadPool.interval_links_cnt += len(new_links)
        print("Spider have crawl {} links.".format(
            CrawlerThreadPool.total_links))
        CrawlerThreadPool.seen_urls.update(new_links)
        self._lock.release()
Example #22
    def count_words(job_url, search_words):
        job_page = pp.get_page(job_url)
        word_counter = {a: 0 for a in search_words}
        for i in range(len(job_page)):
            for w in search_words:
                if w.lower() == job_page[i:(i + len(w))].lower():
                    word_counter[w] += 1
        return word_counter
Example #23
    def test_search_success(self):
        driver = self.driver

        action_handler = ActionHandler(driver)
        page_parser = PageParser(driver)

        assert not page_parser.get_search_results_container().is_displayed()

        action_handler.type_search_query(SEARCH_QEURY)

        time.sleep(1)

        assert page_parser.get_search_results_container().is_displayed()

        action_handler.confirm_search_query()

        assert len(page_parser.get_posts_content()) >= MIN_NUMBER_OF_POSTS
Example #24
    def test_like_succeeded(self):
        driver = self.driver

        post_operator = PostOperator(driver)
        button_getter = ButtonGetter(driver)
        presence_checker = PresenceChecker(driver)
        page_parser = PageParser(driver)

        time.sleep(2)

        posts = post_operator.get_posts()

        old_liked_count = page_parser.get_liked_count(URL)
        like_button = button_getter.get_like_button(posts[FIRST_POST_INDEX +
                                                          FOLLOWING_INDEX])
        post_id = post_operator.get_post_id(posts[FIRST_POST_INDEX +
                                                  FOLLOWING_INDEX])

        like_button.click()

        time.sleep(1)

        assert 'liked' in like_button.get_attribute('class')

        self.driver.get(URL + "/likes")

        assert presence_checker.is_there_post(post_operator, post_id)

        new_liked_count = page_parser.get_liked_count(URL)

        assert new_liked_count == old_liked_count + 1

        time.sleep(2)

        button_getter.get_like_button_by_post_id(post_operator,
                                                 post_id).click()

        driver.refresh()

        assert button_getter.get_like_button_by_post_id(
            post_operator, post_id) is None

        new_liked_count = page_parser.get_liked_count(URL)

        assert new_liked_count == old_liked_count
Example #25
    def get_info(self):
        parser = PageParser()
        # HTMLParser treats "/>" as the end of a tag, so "<br/>" must be
        # replaced with "BRBR" first, otherwise parsing fails
        htmlcontent = self.html
        htmlcontent = re.compile('<br/>').sub('BRBR', htmlcontent)
        parser.feed(htmlcontent)
        finalparseurl = parser.getdata()
        # print finalparseurl
        return finalparseurl
Example #26
    def crawl(self):
        for conf in config_lists:
            for url in conf['urls']:
                resp = Downloader().download(url, conf)
                if resp:
                    proxy_list = PageParser().parse(resp, conf)
                    print(proxy_list)
                    print('Verifying that the proxies are usable')
                    valid_many(proxy_list, 'spider')
Example #27
    def get_links(search_word):
        start_url = 'https://rabota.by/search/vacancy?area=1002&fromSearchLine=true&st=searchVacancy&text='

        start_page = pp.get_page(start_url + search_word)
        start_soup = BeautifulSoup(start_page, 'lxml')
        vacancies = []
        for link in start_soup.find_all('a', href=True):
            if link['href'][0:18] == 'https://rabota.by/':
                vacancies.append(link['href'])
        return vacancies
Example #28
    def fail_search(search_word):
        search_url = 'https://rabota.by/search/vacancy?area=1002&fromSearchLine=true&st=searchVacancy&text=' + search_word
        search_page = pp.get_page(search_url)
        search_soup = BeautifulSoup(search_page, 'lxml')
        key = search_soup.find_all('h1')
        # str.find() returns -1 (truthy) when the pattern is absent, so check membership instead
        no_pattern = 'ничего не найдено'
        return no_pattern in str(key[0])
Example #29
class ActionHandler(ActionHandlerGeneral):
    def __init__(self, driver):
        self.page_parser = PageParser(driver)
        self.button_getter = ButtonGetter(driver)

    def type_and_confirm_language(self, language):
        language_selector = self.page_parser.get_language_selector()
        language_selector.send_keys(language)
        language_selector.send_keys(Keys.RETURN)

    def type_language(self, language):
        language_selector = self.page_parser.get_language_selector()
        language_selector.send_keys(language)

    def confirm_language(self):
        language_selector = self.page_parser.get_language_selector()
        language_selector.send_keys(Keys.RETURN)

    def click_delete_account_button(self):
        self.button_getter.get_delete_account_button().click()
Example #30
def crawl_prxxy_by_pages(page_urls, queue):
    page_parser = PageParser.PageParser()
    for page_url in page_urls:
        headers = {'User-Agent': random.choice(constants.USER_AGENT)}
        r = requests.get(page_url, headers=headers)
        page_parser.set_html_doc(r.text)
        proxy_dict_list = page_parser.extract_proxy_urls()
        for proxy_dict in proxy_dict_list:
            if utils.check_https_proxy(proxy_dict):
                print('crawled a valid proxy:%s' %
                      utils.format_proxy_dict(proxy_dict))
                queue.put(proxy_dict)
Example #31
def handle_link_shared(event):
    unfurls = {}
    for link in event.get('links'):
        url = link.get('url')
        origin = requests.get(url)
        p = PageParser()
        p.feed(origin.text)
        p.close()

        if p.content_type == PageParser.IMAGE:
            unfurls[url] = {
                'text': 'image',
                'image_url': p.content
            }
    response = requests.post('https://slack.com/api/chat.unfurl',
                             json={
                                 'token': WEB_API_TOKEN,
                                 'channel': event.get('channel'),
                                 'ts': event.get('message_ts'),
                                 'unfurls': unfurls
                             },
                             headers={
                                 'Content-type': 'application/json;charset=utf-8',
                                 'Authorization': 'Bearer %s' % WEB_API_TOKEN
                             })

    print('unfurl %s' % response.text)
    return('Done')
Example #32
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--num_pages', type=int)
    parser.add_argument('--update_tag_db', '-u', help='Upsert scraped data to database',
                        required=False, action='store_true')
    args = parser.parse_args()

    db = Database()
    if db.is_empty('Tag') or args.update_tag_db:
        tag_parser = PageParser("tag")
        tags = tag_parser.get_pages(Tag, MAX)
        db.update_tag_table(tags)

    print("Getting Question Summaries...")
    summary_parser = PageParser("question_summary")
    summaries = summary_parser.get_pages(QuestionSummary, args.num_pages)

    print("Getting Articles...")
    article_parser = APIParser()
    articles = article_parser.get_responses(Article, summaries)

    #Enrich Question Summary with articles
    for question_summary, articles_list in zip(summaries, articles):
        question_summary.add_articles(articles_list)

    print("Populating DB...")
    db.insert_question_summaries(summaries)
Example #33
    def test_parse(self):
        """
        测试了三个场景:
        使用标准url
        使用无效url
        使用其他格式的url文档,如jpg
        :return:
        """
        #parser = Page_parser()
        url1 = 'localhost:8081/page1.html'
        expect_sub_url = 'localhost:8081/1/page1_1.html'
        parser = PageParser(url1)
        links = parser.parse()
        self.assertIn(expect_sub_url, links)

        url2 = 'localhost:8081/page7.html'
        parser = PageParser(url2)
        links = parser.parse()
        self.assertEqual(links, set())

        url3 = 'localhost:8081/3/image.jpg'
        parser = PageParser(url3)
        self.assertEqual(parser.parse(), set())
Example #34
    def create_apartment_body(self, html, url):
        pp = PageParser(html)
        return pp.create_apartment_page(url)
Example #35
class NewEggCrawlHandler(crawle.Handler):
    ITEM_URL_PREFIX = 'http://www.newegg.com/Product/Product.aspx\?Item='
    CART_URL = 'http://secure.newegg.com/Shopping/ShoppingCart.aspx'
    MAP_URL_PREFIX = 'http://www.newegg.com/Product/MappingPrice.aspx?Item='
    ZIP_COOKIE = ''.join(['NV%5FORDERCOOKIE=#4%7b%22Sites%22%3a%7b%22USA%22',
                          '%3a%7b%22Values%22%3a%7b',
                          '%22NVS%255FCUSTOMER%255FSHIPPING%255FMETHOD1%22',
                          '%3a%22038%22%2c',
                          '%22NVS%255FCUSTOMER%255FZIP%255FCODE%22%3a',
                          '%2293117%22%7d%7d%7d%7d'])

    ITEM = 0
    CART = 1
    MAPPING = 2

    @staticmethod
    def transform_id(id):
        return '%s-%s-%s' % (id[7:9], id[9:12], id[12:])
    
    def __init__(self, output_tar, error_tar, save):
        self.save = save
        self.output_tar = output_tar
        self.error_tar = error_tar
        self.parser = PageParser()
        self.lock = threading.Lock()
        self.items = {}

    def handle_error(self, rr):
        if not self.save: return
        temp_file = StringIO()
        cPickle.dump(rr, temp_file, cPickle.HIGHEST_PROTOCOL)
        temp_file.seek(0)
        info = tarfile.TarInfo('error/%s-%s' % rr.request_url)
        info.size = len(temp_file.buf)
        info.mtime = time.time()
        self.lock.acquire()
        self.error_tar.members = []
        self.error_tar.addfile(info, temp_file)
        self.lock.release()
        temp_file.close()
    
    def save_page(self, rr):
        if not self.save: return
        temp_file = StringIO()
        cPickle.dump(rr, temp_file, cPickle.HIGHEST_PROTOCOL)
        temp_file.seek(0)
        info = tarfile.TarInfo('pages/%s-%s' % rr.request_url)
        info.size = len(temp_file.buf)
        info.mtime = time.time()
        self.lock.acquire()
        self.output_tar.members = []
        self.output_tar.addfile(info, temp_file)
        self.lock.release()
        temp_file.close()

    def pre_process(self, rr):
        if not isinstance(rr.request_url, tuple):
            print 'Something slid by: %s' % rr.response_url
        item_id, r_type = rr.request_url
        if r_type == self.ITEM:
            rr.response_url = ''.join([self.ITEM_URL_PREFIX, item_id])
        elif r_type == self.CART:
            rr.response_url = self.CART_URL
            c_id = ''.join(['NV%5FNEWEGGCOOKIE=#4{"Sites":{"USA":{"Values":{"',
                            self.transform_id(item_id), '":"1"}}}}'])
            rr.request_headers = {'Cookie':';'.join([self.ZIP_COOKIE, c_id])}
        elif r_type == self.MAPPING:
            rr.response_url = ''.join([self.MAP_URL_PREFIX, item_id])
        else:
            raise Exception('Unknown type')
        rr.redirects = 0

    def process(self, rr, queue):
        if rr.response_status is None:
            try:
                if isinstance(rr.error, socket.error):
                    queue.put(rr.request_url)
                elif isinstance(rr.error, crawle.CrawleRedirectsExceeded):
                    pass
                else:
                    self.handle_error(rr)
            except:
                self.handle_error(rr)
            return
        elif rr.response_status != 200:
            self.handle_error(rr)
            return
        item_id, r_type = rr.request_url
        if r_type == self.ITEM:
            info = self.parser.parse_item_page_info(item_id, rr.response_body)
            if not info:
                return
            if 'deactivated' not in info and 'price' not in info:
                queue.put((item_id, self.CART))
        elif r_type == self.CART:
            info = self.parser.parse_cart_page(item_id, rr.response_body)
            if not info:
                queue.put((item_id, self.MAPPING))
                return
        elif r_type == self.MAPPING:
            info = self.parser.parse_mapping_page(item_id, rr.response_body)
        else:
            raise Exception('Unknown Type')
        self.lock.acquire()
        if r_type == self.ITEM:
            self.items[item_id] = info
        else:
            self.items[item_id].update(info)
        self.lock.release()
        self.save_page(rr)