def get_change_info_detail(self, match_feature, detail):
        before = ''
        after = ''
        if detail is None:
            return before, after

        def get_change_list_detail(table):
            tr_list = table.find('tr').items()
            position_list = []
            for tr in tr_list:
                td_list = tr.find('td')
                if len(td_list) < 3:
                    continue
                position_list.append(td_list.eq(1).text().strip('*').strip() + ':' + td_list.eq(2).text())

            return ','.join(position_list)

        for item in detail:
            feature = item.get('match_feature')
            if feature is None:
                continue
            if feature.strip() not in match_feature and match_feature not in feature:
                continue
            text = item.get('text')
            if text is None or text.strip() == '':
                return before, after
            table_list = PyQuery(text, parser='html').find('.table-result')
            before = get_change_list_detail(table_list.eq(1))
            after = get_change_list_detail(table_list.eq(2))
            break

        return before, after
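A minimal usage sketch for get_change_info_detail. The instance name obj and the input below are inferred from the code, not taken from the original project: each detail item carries a 'match_feature' key plus a 'text' key whose HTML holds three '.table-result' tables, of which the second is the "before" state and the third the "after" state.

detail = [{
    'match_feature': 'share_holder_change',  # hypothetical feature name
    'text': '<div>'
            '<table class="table-result"></table>'
            '<table class="table-result"><tr><td>1</td><td>*Zhang San</td><td>30%</td></tr></table>'
            '<table class="table-result"><tr><td>1</td><td>*Zhang San</td><td>45%</td></tr></table>'
            '</div>',
}]
before, after = obj.get_change_info_detail('share_holder_change', detail)
# before == 'Zhang San:30%', after == 'Zhang San:45%'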
Example #2
    def search(self, word):
        response = requests.get(self.URL.format(word=word))
        text = response.text

        doc = PyQuery(text)
        results = []
        table = doc("table.school-course")
        if table:
            # print(table)
            # table = table[0]
            table = PyQuery(table)
            for tr in table('tr'):
                tr = PyQuery(tr)
                if tr('th'):
                    continue

                td = tr('td')
                td = PyQuery(td)

                result = {
                    'word': td.eq(0).text().split('(')[0],
                    'meaning': td.eq(1).text()
                }
                result['word'] = result['word'].rstrip()
                results.append(result)
        if results:
            return {"status": 'success', "results": results}
        else:
            return {"status": 'error', "error_detail": "Nothing found."}
Example #3
def get_market_game_trade_card_price(game_id, login_cookie):
    cookies_list = {"steamLogin": login_cookie}
    market_search_url = "http://steamcommunity.com/market/search/render/"
    market_search_url += "?query=&count=20&appid=753&category_753_Game[0]=tag_app_%s&category_753_cardborder[0]=tag_cardborder_0" % game_id
    market_search_response = net.http_request(market_search_url,
                                              method="GET",
                                              cookies_list=cookies_list,
                                              json_decode=True)
    if market_search_response.status != net.HTTP_RETURN_CODE_SUCCEED:
        raise crawler.CrawlerException(
            crawler.request_failre(market_search_response.status))
    market_item_list = {}
    if not crawler.check_sub_key(
        ("success", "results_html"), market_search_response.json_data):
        raise crawler.CrawlerException(
            "返回信息'success'或'results_html'字段不存在\n%s" %
            market_search_response.json_data)
    if market_search_response.json_data["success"] is not True:
        raise crawler.CrawlerException("返回信息'success'字段取值不正确\n%s" %
                                       market_search_response.json_data)
    card_selector = PQ(market_search_response.json_data["results_html"]).find(
        ".market_listing_row_link")
    for index in range(0, card_selector.length):
        card_name = card_selector.eq(index).find(
            ".market_listing_item_name").text()
        card_min_price = card_selector.eq(index).find(
            "span.normal_price span.normal_price").text().encode(
                "UTF-8").replace("¥ ", "")
        market_item_list[card_name] = card_min_price
    # {'Pamu': '1.77', 'Fumi (Trading Card)': '2.14', 'Mio (Trading Card)': '1.33', 'Bonnibel (Trading Card)': '1.49', 'Groupshot': '1.87', 'Q-Piddy': '1.35', 'Elle (Trading Card)': '1.19', 'Quill': '1.50', 'Iro (Trading Card)': '1.42', 'Bearverly (Trading Card)': '1.27', 'Cassie (Trading Card)': '1.35'}
    return market_item_list
Example #4
def get_album_page(album_id):
    page_count = max_page_count = 1
    result = {
        "album_title": "",  # 图集标题
        "image_url_list": [],  # 全部图片地址
        "is_delete": False,  # 是不是已经被删除
    }
    while page_count <= max_page_count:
        album_pagination_url = "http://www.youzi4.cc/mm/%s/%s_%s.html" % (
            album_id, album_id, page_count)
        album_pagination_response = net.http_request(album_pagination_url,
                                                     method="GET")
        if album_pagination_response.status == 404 and page_count == 1:
            result["is_delete"] = True
            return result
        if album_pagination_response.status != net.HTTP_RETURN_CODE_SUCCEED:
            raise crawler.CrawlerException(
                "第%s页 " % page_count +
                crawler.request_failre(album_pagination_response.status))
        # check whether the album has been deleted
        if page_count == 1:
            # get the album title
            album_title = PQ(album_pagination_response.data.decode(
                "UTF-8")).find("meta[name='description']").attr("content")
            if not album_title:
                raise crawler.CrawlerException("页面截取标题失败\n%s" %
                                               album_pagination_response.data)
            result["album_title"] = album_title.encode("UTF-8")
        # get the image URLs in the album
        image_list_selector = PQ(
            album_pagination_response.data).find("div.articleV4Body a img")
        if image_list_selector.length == 0:
            raise crawler.CrawlerException(
                "第%s页 页面匹配图片地址失败\n%s" %
                (page_count, album_pagination_response.data))
        for image_index in range(0, image_list_selector.length):
            result["image_url_list"].append(
                str(image_list_selector.eq(image_index).attr("src")))
        # get the total page count
        pagination_list_selector = PQ(
            album_pagination_response.data).find("ul.articleV4Page a.page-a")
        if pagination_list_selector.length > 0:
            for pagination_index in range(0, pagination_list_selector.length):
                temp_page_count = pagination_list_selector.eq(
                    pagination_index).html()
                if crawler.is_integer(temp_page_count):
                    max_page_count = max(int(temp_page_count), max_page_count)
        else:
            if page_count > 1:
                raise crawler.CrawlerException(
                    "第%s页 页面匹配分页信息失败\n%s" %
                    (page_count, album_pagination_response.data))
        page_count += 1
    return result
Example #5
def get_user_events(data):
    user_events = {}
    query = PyQuery(data)("#platnosci")("table")("tr")
    print(query.eq(0))

    for iter, row in enumerate(query):
        row_data = {}
        row_data["title"]  = str(query.eq(iter)("td").eq(0)("a").html()).replace(r"<br />", ";")
        row_data["sign_in_url"] = query.eq(iter)("td").eq(0)("a").attr("href")
        row_data["edit_url"] = query.eq(iter)("td").eq(2)("a").attr("href")
        row_data["state"] = query.eq(iter)("td").eq(1).text()
        user_events[iter] = row_data

    return user_events
Example #6
def define_external_type(url: str) -> None:
    """
    Used for adding types from ``schema.org`` domain to ``types``. Fetches ``url`` and looks for parents,
    which are also recursively added to ``types``.

    Parameters
    ----------
    url : str
        URL of the type. Should be from ``schema.org`` domain.
    """

    global types

    if url in types:
        return

    types[url] = {
        'label': url[url.rfind('/') + 1:],
        'description': '',
        'parents': []
    }
    candidates = PyQuery(url)('link')

    for i in range(len(candidates)):
        link = candidates.eq(i)
        if link.attr('property') == 'rdfs:subClassOf':
            parent = link.attr('href')

            if len(parent) > 0:
                if parent not in types[url]['parents']:
                    types[url]['parents'].append(parent)
                define_external_type(parent)
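A hedged usage sketch for define_external_type, assuming the module-level types dict that the function mutates and live network access to schema.org; the example URL is illustrative only.

types = {}

define_external_type('https://schema.org/Restaurant')

# `types` now maps each visited URL to its label and direct parents; because
# parents are fetched recursively, ancestors such as Thing should also appear.
for type_url, info in sorted(types.items()):
    print(info['label'], [p.rsplit('/', 1)[-1] for p in info['parents']])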
Example #7
def get_self_account_badges(account_id, login_cookie):
    # first page of badges
    badges_index_url = "http://steamcommunity.com/profiles/%s/badges/" % account_id
    cookies_list = {"steamLogin": login_cookie}
    badges_index_response = net.http_request(badges_index_url,
                                             method="GET",
                                             cookies_list=cookies_list)
    if badges_index_response.status != net.HTTP_RETURN_CODE_SUCCEED:
        raise crawler.CrawlerException(
            crawler.request_failre(badges_index_response.status))
    badges_detail_url_list = []
    # the badge divs
    badges_selector = PQ(badges_index_response.data).find(
        ".maincontent .badges_sheet .badge_row")
    for index in range(0, badges_selector.length):
        badge_html = badges_selector.eq(index).html().encode("UTF-8")
        # badges whose card drops are already exhausted
        if badge_html.find("无剩余卡牌掉落") >= 0:
            # URL of the badge detail page
            badge_detail_url = tool.find_sub_string(
                badge_html, '<a class="badge_row_overlay" href="', '"/>')
            if not badge_detail_url:
                raise crawler.CrawlerException("徽章信息截取徽章详细界面地址失败\n%s" %
                                               badge_html)
            badges_detail_url_list.append(badge_detail_url)
    # ['http://steamcommunity.com/profiles/76561198172925593/gamecards/459820/', 'http://steamcommunity.com/profiles/76561198172925593/gamecards/357200/', 'http://steamcommunity.com/profiles/76561198172925593/gamecards/502740/', 'http://steamcommunity.com/profiles/76561198172925593/gamecards/359600/', 'http://steamcommunity.com/profiles/76561198172925593/gamecards/354380/', 'http://steamcommunity.com/profiles/76561198172925593/gamecards/359670/', 'http://steamcommunity.com/profiles/76561198172925593/gamecards/525300/', 'http://steamcommunity.com/profiles/76561198172925593/gamecards/337980/', 'http://steamcommunity.com/profiles/76561198172925593/gamecards/591420/']
    return badges_detail_url_list
Example #8
 def _find_url(self):
     a = PQ(self.html).find('a')
     for i in range(len(a)):
         url = a.eq(i).attr('href')
         if url is not None:
             self._distribute_url(url)
     return None
Example #9
 def __process_item(self):
     all_td = PyQuery(self.__getattribute__('__el')).find('td')
     a_tag = all_td.eq(6).find('a[href^="do_openvpn.aspx?"]')
     if a_tag.length == 0:
         return
     href = a_tag.attr('href').replace('do_openvpn.aspx?', '')
     items = href.split('&')
     server = [
         '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', ''
     ]
     for item in items:
         props = item.split('=')
         if len(props) < 2:
             continue
         if props[0] == 'fqdn':
             server[0] = props[1].replace('.opengw.net', '')
         elif props[0] == 'ip':
             server[1] = props[1]
         elif props[0] == 'tcp':
             server[15] = props[1]
         elif props[0] == 'udp':
             server[16] = props[1]
     server = self.__fill_other_value(all_td, server)
     # OpenVPN_ConfigData_Base64
     server[14] = self.__get_openvpn_config_base64(items)
     if server[14] is None:
         return  # openvpn_config_base64 is none skip this item
     if self.__getattribute__('__sleep_time') > 0:
         time.sleep(self.__getattribute__('__sleep_time'))
     self.lock.acquire()
     self.__getattribute__('__list_server').append(server)
     self.lock.release()
Example #10
def get_album_page(album_id):
    album_url = "http://www.ugirls.com/Content/List/Magazine-%s.html" % album_id
    album_response = net.http_request(album_url, method="GET")
    result = {
        "image_url_list": [],  # 全部图片地址
        "is_delete": False,  # 是不是已经被删除
        "model_name": "",  # 模特名字
    }
    if album_response.status != net.HTTP_RETURN_CODE_SUCCEED:
        raise crawler.CrawlerException(
            crawler.request_failre(album_response.status))
    if album_response.data.find("该页面不存在,或者已经被删除!") >= 0:
        result["is_delete"] = True
        return result
    # get the model name
    model_name = PQ(album_response.data).find(
        "div.ren_head div.ren_head_c a").attr("title")
    if not model_name:
        raise crawler.CrawlerException("模特信息截取模特名字失败\n%s" %
                                       album_response.data)
    result["model_name"] = model_name.encode("UTF-8").strip()
    # get all image URLs
    image_list_selector = PQ(album_response.data).find("ul#myGallery li img")
    if image_list_selector.length == 0:
        raise crawler.CrawlerException("页面匹配图片地址失败\n%s" % album_response.data)
    for image_index in range(0, image_list_selector.length):
        image_url = image_list_selector.eq(image_index).attr("src")
        if image_url.find("_magazine_web_m.") == -1:
            raise crawler.CrawlerException("图片地址不符合规则\n%s" % image_url)
        result["image_url_list"].append(
            image_url.replace("_magazine_web_m.", "_magazine_web_l."))
    return result
Example #11
def crawl_infected_person_okayama(
        db: Session = Depends(get_db), is_update: bool = False):
    try:
        response = requests.get('https://fight-okayama.jp/attribute/')
        response.encoding = response.apparent_encoding
        doc = PyQuery(response.text.encode('utf-8'))
    except Exception as e:
        return {"exception": e.args}

    for tr_node in doc.find('tbody').children('tr'):
        td_nodes = PyQuery(tr_node)('tr').find('td')
        valid_values = validate_crawled_data(
            **takeout_and_processing_nodes(td_nodes, ("number", "date",
                                                      "residence", "age",
                                                      "sex")))
        if not valid_values:
            if crud.get_mistaken_data_by_number(
                    db=db, number_str=td_nodes.eq(0).text()) is None:
                mistaken_data_dict = takeout_and_processing_nodes(
                    td_nodes, ("number_str", "date_str", "residence_str",
                               "age_str", "sex_str"))
                create_mistaken_data(
                    data=models.MistakenData(**mistaken_data_dict), db=db)
            continue

        # if the record already exists, UPDATE it when is_update is True
        if crud.get_data_by_number(db=db,
                                   number=valid_values.number) is not None:
            if is_update:
                update_infected_data(data=valid_values, db=db)
            else:
                return "the crawled data is existing"
        # if the record does not exist yet, save it
        else:
            create_infected_data(data=valid_values, db=db)
Example #12
def get_one_page_account(page_count):
    account_pagination_url = "http://jigadori.fkoji.com/users"
    query_data = {"p": page_count}
    account_pagination_response = net.http_request(account_pagination_url,
                                                   method="GET",
                                                   fields=query_data)
    pagination_account_list = {}
    if account_pagination_response.status != net.HTTP_RETURN_CODE_SUCCEED:
        raise crawler.CrawlerException(
            crawler.request_failre(account_pagination_response.status))
    account_list_selector = PQ(account_pagination_response.data.decode(
        "UTF-8")).find(".users-list li")
    for account_index in range(0, account_list_selector.length):
        account_selector = account_list_selector.eq(account_index)
        # get the member's name
        account_name = account_selector.find(".profile-name").eq(0).text()
        if not account_name:
            account_name = ""
            # raise robot.CrawlerException("成员信息截取成员名字失败\n\%s" % account_selector.html().encode("UTF-8"))
        else:
            account_name = account_name.strip().encode("UTF-8")
        # get the Twitter account
        account_id = account_selector.find(".screen-name a").text()
        if not account_id:
            raise crawler.CrawlerException(
                "成员信息截取twitter账号失败\n\%s" %
                account_selector.html().encode("UTF-8"))
        account_id = account_id.strip().replace("@", "")
        pagination_account_list[account_id] = account_name
    return pagination_account_list
Example #13
def get_album_page(sub_path, page_count):
    album_pagination_url = "http://www.88mmw.com/%s/list_%s_%s.html" % (
        sub_path, SUB_PATH_LIST[sub_path], page_count)
    album_pagination_response = net.http_request(album_pagination_url,
                                                 method="GET")
    result = {
        "album_info_list": [],  # 全部图集信息
        "is_over": False,  # 是不是最后一页图集
    }
    if album_pagination_response.status != net.HTTP_RETURN_CODE_SUCCEED:
        raise crawler.CrawlerException(
            crawler.request_failre(album_pagination_response.status))
    # page encoding
    album_pagination_html = album_pagination_response.data.decode("GBK")
    # get album info; two different page layouts exist
    album_list_selector = PQ(album_pagination_html).find("div.xxx li a")
    if album_list_selector.length == 0:
        album_list_selector = PQ(album_pagination_html).find("div.yyy li a")
    if album_list_selector.length == 0:
        raise crawler.CrawlerException("页面截取图集列表失败\n%s" %
                                       album_pagination_html.encode("UTF-8"))
    for album_index in range(0, album_list_selector.length):
        result_album_info = {
            "album_title": "",  # 图集id
            "page_id": None,  # 图集页面id
        }
        album_selector = album_list_selector.eq(album_index)
        # get the album id
        album_url = album_selector.attr("href")
        if not album_url:
            raise crawler.CrawlerException(
                "图集列表截取图集地址失败\n%s" % album_selector.html().encode("UTF-8"))
        album_id = album_url.split("/")[-2]
        if not crawler.is_integer(album_id):
            raise crawler.CrawlerException("图集地址截取图集id失败\n%s" % str(album_url))
        result_album_info["page_id"] = album_id
        # get the album title
        album_title = album_selector.attr("title").encode("UTF-8")
        if len(re.findall("_共\d*张", album_title)) == 1:
            result_album_info["album_title"] = album_title[:album_title.
                                                           rfind("_共")]
        else:
            result_album_info["album_title"] = album_title
        result["album_info_list"].append(result_album_info)
    # check whether this is the last page
    max_page_info = PQ(album_pagination_html).find("div.page a").eq(-1).text()
    if not max_page_info:
        raise crawler.CrawlerException("总页数信息截取失败\n%s" %
                                       album_pagination_html.encode("UTF-8"))
    max_page_count = tool.find_sub_string(max_page_info.encode("UTF-8"), "共",
                                          "页")
    if not crawler.is_integer(max_page_count):
        raise crawler.CrawlerException("总页数截取失败\n%s" %
                                       max_page_info.encode("UTF-8"))
    result["is_over"] = page_count >= int(max_page_count)
    return result
Example #14
def takeout_and_processing_nodes(td_nodes: PyQuery,
                                 keys_name: tuple = ("number", "date",
                                                     "residence", "age",
                                                     "sex")):
    nodes_dict = {}
    for i in range(len(keys_name)):
        takeout_node = td_nodes.eq(i).text()
        processed_node = ''.join(takeout_node.split())
        nodes_dict[keys_name[i]] = processed_node
    return nodes_dict
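A small self-contained illustration of takeout_and_processing_nodes; the HTML row is made up to mirror the crawled table's column order (number, date, residence, age, sex) and only shows how the helper collapses the whitespace inside each td.

from pyquery import PyQuery

row_html = ('<table><tr><td> 12 </td><td>2021年 1月 5日</td>'
            '<td>岡山市</td><td>40代</td><td>男性</td></tr></table>')
td_nodes = PyQuery(row_html).find('td')

print(takeout_and_processing_nodes(td_nodes))
# {'number': '12', 'date': '2021年1月5日', 'residence': '岡山市',
#  'age': '40代', 'sex': '男性'}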
Example #15
	def getNearby(self,xml):
		locations={'names':[],'types':[]}
		places={'name':'','types':[]}
		placeData=PyQuery(xml.encode('utf-8'))('result')
		for place in placeData:
			place=PyQuery(place)
			name=PyQuery(place('name'))
			types=PyQuery(place('type'))
			locations['names'].append(name.text())
			locations['types'].append(types.eq(0).text())
		return locations
Example #16
def find_task(db, which):
    count = 0

    # equity pledge
    equity_pledged_info = u'equity_pledged_info'

    source_table = "online_crawl_gansu_new"
    for item in db.traverse_batch(source_table):
        data_list = item.get('datalist')
        company = item.get('_id')

        count += 1

        if not isinstance(data_list, dict):
            log.error("{which} table: 没有 datalist company = {company}".format(
                company=company, which=which))
            continue

        if equity_pledged_info not in data_list:
            continue

        value = data_list.get(equity_pledged_info)
        if value is None:
            continue

        if 'detail' in value:
            log.info("{which} table: {equity} company = {company} have detail".format(
                equity=equity_pledged_info, company=company, which=which))
            continue

        if 'list' not in value:
            continue

        list_array = value.get('list')
        if not isinstance(list_array, list) or len(list_array) <= 0:
            continue

        for item0 in list_array:
            text = item0.get('text')
            if text is None:
                continue

            tr_list = PyQuery(text, parser='html').find('#stockTab').find('tr')
            if tr_list.length > 2:
                log.info("{which} table: {equity} company = {company} have list".format(
                    equity=equity_pledged_info, company=company, which=which))
                break
            if tr_list.length == 2 and tr_list.eq(1).find('td').length > 5:
                log.info("{which} table: {equity} company = {company} have list".format(
                    equity=equity_pledged_info, company=company, which=which))
                break

    log.info("查找结束: {which} count = {count}".format(which=which, count=count))
Example #17
def pullSubmissions(subredditName):
    html = urllib2.urlopen("http://reddit.com/r/%s" % subredditName).read()
    storyObjects = PyQuery(html)(".entry")
    for storyObject in [storyObjects.eq(i) for i in range(len(storyObjects))]:
        title = storyObject.find("a.title").html()
        url = storyObject.find("a.title").attr.href
        redditURL = storyObject.find("a.comments").attr.href

        # advertisement submissions have no comments page and thus the property is None (NOT TRUE ANYMORE //FIXME)
        if redditURL:
            yield (title, url, redditURL)
Example #18
def get_one_page_photo(page_count):
    photo_pagination_url = "http://kelagirls.com/bizhi!findForIndexMore.action"
    query_data = {"page": page_count}
    photo_pagination_response = net.http_request(photo_pagination_url, method="GET", fields=query_data)
    result = {
        "image_info_list": [],  # 全部图片地址
        "is_over": False,  # 是不是最后一页壁纸
    }
    if photo_pagination_response.status != net.HTTP_RETURN_CODE_SUCCEED:
        raise crawler.CrawlerException(crawler.request_failre(photo_pagination_response.status))
    photo_list_selector = PQ(photo_pagination_response.data.decode("UTF-8")).find(".bizhinmore .bizhi")
    if photo_list_selector.length == 0:
        raise crawler.CrawlerException("页面匹配图片列失败\n%s" % photo_pagination_response.data)
    for photo_index in range(0, photo_list_selector.length):
        result_image_info = {
            "image_id": None,  # 图片id
            "image_url": None,  # 图片地址
            "model_name": "",  # 模特名字
        }
        # get the image id
        image_id = photo_list_selector.eq(photo_index).find(".bizhibigwrap").attr("id")
        if not image_id:
            raise crawler.CrawlerException("图片列表匹配图片id失败\n%s" % photo_list_selector.eq(photo_index).html().encode("UTF-8"))
        if not (image_id[0:3] == "big" and crawler.is_integer(image_id[3:])):
            raise crawler.CrawlerException("图片列表匹配的图片id格式不正确\n%s" % photo_list_selector.eq(photo_index).html().encode("UTF-8"))
        result_image_info["image_id"] = str(image_id[3:])
        # get the image URL
        image_path = photo_list_selector.eq(photo_index).find(".bizhibig img").eq(1).attr("src")
        if not image_path:
            raise crawler.CrawlerException("图片列表匹配图片地址失败\n%s" % photo_list_selector.eq(photo_index).html().encode("UTF-8"))
        result_image_info["image_url"] = "http://kelagirls.com/" + str(image_path.encode("UTF-8"))
        # get the model name
        model_name = photo_list_selector.eq(photo_index).find(".bzwdown span").eq(0).text().encode("UTF-8")
        if not model_name:
            raise crawler.CrawlerException("图片列表匹配模特名字失败\n%s" % photo_list_selector.eq(photo_index).html().encode("UTF-8"))
        result_image_info["model_name"] = str(model_name)
        result["image_info_list"].append(result_image_info)
    # check whether this is the last page
    pagination_selector = PQ(photo_pagination_response.data.decode("UTF-8")).find(".pageBottom div")
    max_page_count = page_count
    for pagination_index in range(0, pagination_selector.length):
        if crawler.is_integer(pagination_selector.eq(pagination_index).text()):
            max_page_count = max(max_page_count, int(pagination_selector.eq(pagination_index).text()))
    result["is_over"] = page_count >= max_page_count
    return result
Example #19
def get_one_page_blog(account_id, page_count):
    # http://moexia.lofter.com/?page=1
    blog_pagination_url = "http://blog.sina.com.cn/s/articlelist_%s_0_%s.html" % (account_id, page_count)
    blog_pagination_response = net.http_request(blog_pagination_url, method="GET")
    result = {
        "blog_info_list": [],  # 全部日志地址
        "is_over": False,  # 是不是最后一页
    }
    if blog_pagination_response.status == net.HTTP_RETURN_CODE_SUCCEED:
        if page_count == 1 and blog_pagination_response.data.find("抱歉,您要访问的页面不存在或被删除!") >= 0:
            raise crawler.CrawlerException("账号不存在")
        article_list_selector = PQ(blog_pagination_response.data.decode("UTF-8")).find(".articleList .articleCell")
        if article_list_selector.size() == 0:
            raise crawler.CrawlerException("页面截取日志列表失败\n%s" % blog_pagination_response.data)
        for article_index in range(article_list_selector.size()):
            result_blog_info = {
                "blog_url": None,  # 日志地址
                "blog_time": None,  # 日志时间
                "blog_title": "",  # 日志标题
            }
            article_selector = article_list_selector.eq(article_index)
            # get the blog post URL
            blog_url = article_selector.find("span.atc_title a").attr("href")
            if not blog_url:
                raise crawler.CrawlerException("日志列表解析日志地址失败\n%s" % article_selector.html().encode("UTF-8"))
            result_blog_info["blog_url"] = str(blog_url)
            # get the blog post title
            blog_title = article_selector.find("span.atc_title a").text().encode("UTF-8")
            if not blog_title:
                raise crawler.CrawlerException("日志列表解析日志标题失败\n%s" % article_selector.html().encode("UTF-8"))
            result_blog_info["blog_title"] = str(blog_title)
            # get the blog post time
            blog_time = article_selector.find("span.atc_tm").text()
            if not blog_time:
                raise crawler.CrawlerException("日志列表解析日志时间失败\n%s" % article_selector.html().encode("UTF-8"))
            try:
                result_blog_info["blog_time"] = int(time.mktime(time.strptime(blog_time, "%Y-%m-%d %H:%M")))
            except ValueError:
                raise crawler.CrawlerException("日志时间格式不正确\n%s" % blog_time)
            result["blog_info_list"].append(result_blog_info)
        # get the pagination info
        pagination_html = tool.find_sub_string(blog_pagination_response.data, '<div class="SG_page">', '</div>')
        if not pagination_html:
            result["is_over"] = True
        else:
            max_page_count = tool.find_sub_string(pagination_html, "共", "页")
            if not crawler.is_integer(max_page_count):
                raise crawler.CrawlerException("分页信息截取总页数失败\n%s" % pagination_html)
            result["is_over"] = page_count >= int(max_page_count)
    else:
        raise crawler.CrawlerException(crawler.request_failre(blog_pagination_response.status))
    return result
Example #20
def extractCourse(commentsHTML):
    c = Course()
    table = PyQuery(commentsHTML).find("table.plaintable").children()

    c.id = table.eq(0).text().split()[-1]
    c.subject_code, c.crse = table.eq(1).text().split()[0].split('-')
    c.title = ' '.join(table.eq(1).text().split()[1:])
    c.description = table.eq(2).children().eq(1).text()

    if c.description == "Description Not Found":
        c.description = None

    mtimesPQ = table.eq(3).children().eq(1).children().children()

    if mtimesPQ.eq(1).children().length > 2:
        mtime = MeetingTime()
        mtime.days      = mtimesPQ.eq(1).children().eq(1).text().split()
        mtime.begin     = mtimesPQ.eq(1).children().eq(2).text()
        mtime.end       = mtimesPQ.eq(1).children().eq(3).text()
        mtime.location  = mtimesPQ.eq(1).children().eq(4).text()
        c.exam          = mtimesPQ.eq(1).children().eq(5).text()

        if mtime.days == ['(ARR)']:
            mtime.days = None
            mtime.begin = None
            c.exam = mtime.location
            mtime.location = mtime.end
            mtime.end = None

        c.meeting_times.append(mtime)


        for i in xrange(mtimesPQ.length - 2):   # get additional times
            mtime = MeetingTime()
            mtime.days      = mtimesPQ.eq(i).children().eq(5).text().split()
            mtime.begin     = mtimesPQ.eq(i).children().eq(6).text()
            mtime.end       = mtimesPQ.eq(i).children().eq(7).text()
            mtime.location  = mtimesPQ.eq(i).children().eq(8).text()
            c.meeting_times.append(mtime)

    sectionInfoPQ = table.eq(4).find("table").children().eq(2).children()
    c.instructor = sectionInfoPQ.eq(0).text()
    c.type  = sectionInfoPQ.eq(1).text()
    c.status = sectionInfoPQ.eq(2).text()
    c.capacity = sectionInfoPQ.eq(3).text()

    comments = table.eq(5).children().eq(1).text()
    if comments == 'None':
        c.comments = None
    else:
        c.comments = comments

    return c
Example #21
	def replace_image(self, target, image_name):
		elements = self.html_obj('*').filter('[dzid="' + target + '"]')  
		location = self.location + urllib.quote_plus(image_name)

		for e in elements:
			pq = PyQuery(e)
			if pq.eq(0).is_('img'):
				pq.attr('src', location)
			else:
				pq.css('background-image', 'url("' + location + '");')

			return location

		return None
Example #22
def get_one_page_photo(page_count):
    photo_pagination_url = "http://jigadori.fkoji.com/"
    query_data = {"p": page_count}
    photo_pagination_response = net.http_request(photo_pagination_url, method="GET", fields=query_data)
    result = {
        "image_info_list": [],  # 全部图片信息
    }
    if photo_pagination_response.status != net.HTTP_RETURN_CODE_SUCCEED:
        raise crawler.CrawlerException(crawler.request_failre(photo_pagination_response.status))
    photo_list_selector = PQ(photo_pagination_response.data.decode("UTF-8")).find("#wrapper .row .photo")
    for photo_index in range(0, photo_list_selector.size()):
        photo_selector = photo_list_selector.eq(photo_index)
        photo_selector_html = photo_selector.html().encode("UTF-8")
        result_photo_info = {
            "account_name": "",  # twitter账号
            "image_url_list": [],  # 图片地址
            "tweet_id": None,  # tweet id
            "tweet_time": None,  # tweet发布时间
        }
        # get the tweet id
        tweet_url = photo_selector.find(".photo-link-outer a").eq(0).attr("href")
        if not tweet_url:
            raise crawler.CrawlerException("图片信息截取tweet地址失败\n%s" % photo_selector_html)
        tweet_id = tool.find_sub_string(tweet_url.strip(), "status/")
        if not crawler.is_integer(tweet_id):
            raise crawler.CrawlerException("tweet地址截取tweet id失败\n%s" % tweet_url)
        result_photo_info["tweet_id"] = int(tweet_id)
        # get the Twitter account
        account_name = photo_selector.find(".user-info .user-name .screen-name").text()
        if not account_name:
            raise crawler.CrawlerException("图片信息截取twitter账号失败\n%s" % photo_selector_html)
        result_photo_info["account_name"] = str(account_name).strip().replace("@", "")
        # get the tweet post time
        tweet_time = photo_selector.find(".tweet-text .tweet-created-at").text().strip()
        if not tweet_time:
            raise crawler.CrawlerException("图片信息截取tweet发布时间失败\n%s" % photo_selector_html)
        try:
            result_photo_info["tweet_time"] = int(time.mktime(time.strptime(str(tweet_time).strip(), "%Y-%m-%d %H:%M:%S")))
        except ValueError:
            raise crawler.CrawlerException("tweet发布时间文本格式不正确\n%s" % tweet_time)
        # get the image URLs
        image_list_selector = photo_selector.find(".photo-link-outer a img")
        for image_index in range(0, image_list_selector.size()):
            image_url = image_list_selector.eq(image_index).attr("src")
            if not image_url:
                raise crawler.CrawlerException("图片列表截取图片地址失败\n%s" % image_list_selector.eq(image_index).html())
            result_photo_info["image_url_list"].append(str(image_url).strip())
        result["image_info_list"].append(result_photo_info)
    return result
Example #23
def get_one_page_album(account_id, page_count):
    # http://bcy.net/u/50220/post/cos?&p=1
    album_pagination_url = "http://bcy.net/u/%s/post/cos" % account_id
    query_data = {"p": page_count}
    album_pagination_response = net.http_request(album_pagination_url, method="GET", fields=query_data)
    result = {
        "album_info_list": [],  # 全部作品信息
        "coser_id": None,  # coser id
        "is_over": False,  # 是不是最后一页作品
    }
    if album_pagination_response.status != net.HTTP_RETURN_CODE_SUCCEED:
        raise crawler.CrawlerException(crawler.request_failre(album_pagination_response.status))
    if page_count == 1 and album_pagination_response.data.find("<h2>用户不存在</h2>") >= 0:
        raise crawler.CrawlerException("账号不存在")
    # get the coser id
    coser_id_find = re.findall('<a href="/coser/detail/([\d]+)/\$\{post.rp_id\}', album_pagination_response.data)
    if len(coser_id_find) != 1:
        raise crawler.CrawlerException("页面截取coser id失败\n%s" % album_pagination_response.data)
    if not crawler.is_integer(coser_id_find[0]):
        raise crawler.CrawlerException("页面截取coser id类型不正确\n%s" % album_pagination_response.data)
    result["coser_id"] = coser_id_find[0]
    # get the post info
    album_list_selector = PQ(album_pagination_response.data.decode("UTF-8")).find("ul.l-grid__inner li.l-grid__item")
    for album_index in range(0, album_list_selector.size()):
        album_selector = album_list_selector.eq(album_index)
        result_album_info = {
            "album_id": None,  # 作品id
            "album_title": None,  # 作品标题
        }
        # get the post id
        album_url = album_selector.find(".postWorkCard__img a.postWorkCard__link").attr("href")
        if not album_url:
            raise crawler.CrawlerException("作品信息截取作品地址失败\n%s" % album_selector.html().encode("UTF-8"))
        album_id = str(album_url).split("/")[-1]
        if not crawler.is_integer(album_id):
            raise crawler.CrawlerException("作品地址 %s 截取作品id失败\n%s" % (album_url, album_selector.html().encode("UTF-8")))
        result_album_info['album_id'] = album_id
        # get the post title
        album_title = album_selector.find(".postWorkCard__img img").attr("alt")
        result_album_info["album_title"] = str(album_title.encode("UTF-8"))
        result["album_info_list"].append(result_album_info)
    # check whether this is the last page
    last_pagination_selector = PQ(album_pagination_response.data).find("#js-showPagination ul.pager li:last a")
    if last_pagination_selector.size() == 1:
        max_page_count = int(last_pagination_selector.attr("href").strip().split("&p=")[-1])
        result["is_over"] = page_count >= max_page_count
    else:
        result["is_over"] = True
    return result
Example #24
def scrape_grants_for_fy(year):
    b.open(PAST_GRANTS_URL)

    try:
        b.select_form(name="Form1")

        b["oUcStartDate$ddlDay"] = ["1"]
        b["oUcStartDate$ddlMonth"] = ["4"]
        b["oUcStartDate$ddlYear"] = [str(year)]

        b["oUcEndDate$ddlDay"] = ["31"]
        b["oUcEndDate$ddlMonth"] = ["3"]
        b["oUcEndDate$ddlYear"] = [str(year + 1)]

        resp = b.submit()
    except mechanize._form.ItemNotFoundError:
        print("ERROR: could not submit form. This usually means you're "
              "trying to scrape for a year that doesn't exist "
              "on the GOTW website.", file=sys.stderr)
        raise

    page = PyQuery(resp.read())

    for r in page("table tr:not(.GridHeader)"):
        grant = {}
        anchors = PyQuery(r).find('a')

        grant['id'] = anchors.eq(0).attr.title
        grant['title'] = anchors.eq(0).text()

        grant['pi'] = pi = {}
        pi['id'] = util.extract_id(anchors.eq(1).attr.href, 'Person')
        pi['name'] = anchors.eq(1).text()

        grant['organisation'] = org = {}
        org['id'] = util.extract_id(anchors.eq(2).attr.href, 'Organisation')
        org['name'] = anchors.eq(2).text()

        grant['department'] = dept = {}
        dept['id'] = util.extract_id(anchors.eq(3).attr.href, 'Department')
        dept['name'] = anchors.eq(3).text()

        value = PyQuery(r).find('span').eq(0).attr.title
        grant['value'] = util.extract_monetary_value(value)

        yield grant
Example #25
def query_wubi(char):

    url = 'http://www.chaiwubi.com/bmcx/'

    data = {'wz': char, 'select_value': '查单字'}

    r = requests.post(url, data=data)

    h = html.fromstring(r.text)
    tb = h.cssselect('.dw-bmcx')[0]

    # Wubi Wangma 86 edition
    # Dayitong New Century edition Wubi codes
    d = defaultdict(list)
    trs = tb.cssselect('tr')

    # take the first three rows

    tr86 = trs[0]
    tds = PyQuery(tr86).children('td')
    d['86'] = [
        tds.eq(2).text().strip() or None,
        tds.eq(3).text().strip() or None,
        tds.eq(4).text().strip() or None,
        tds.eq(5).text().strip() or None,
    ]
    for tr in trs[1:3]:
        tds = PyQuery(tr).children('td')
        d[tds.eq(0).text()] = [
            tds.eq(1).text().strip() or None,
            tds.eq(2).text().strip() or None,
            tds.eq(3).text().strip() or None,
            tds.eq(4).text().strip() or None,
        ]

    return d
Example #26
def get_album_photo(sub_path, page_id):
    page_count = 1
    result = {
        "image_url_list": [],  # 全部图片地址
    }
    while True:
        if page_count == 1:
            photo_pagination_url = "http://www.88mmw.com/%s/%s" % (sub_path,
                                                                   page_id)
        else:
            photo_pagination_url = "http://www.88mmw.com/%s/%s/index_%s.html" % (
                sub_path, page_id, page_count)
        photo_pagination_response = net.http_request(photo_pagination_url,
                                                     method="GET")
        if photo_pagination_response.status != net.HTTP_RETURN_CODE_SUCCEED:
            raise crawler.CrawlerException(
                "第%s页 " % page_count +
                crawler.request_failre(photo_pagination_response.status))
        # page encoding
        photo_pagination_html = photo_pagination_response.data.decode("GBK")
        # get the image URLs
        image_list_selector = PQ(photo_pagination_html).find("div.zzz li img")
        if image_list_selector.length == 0:
            raise crawler.CrawlerException(
                "第%s页 页面匹配图片地址失败\n%s" %
                (page_count, photo_pagination_html.encode("UTF-8")))
        for image_index in range(0, image_list_selector.length):
            result["image_url_list"].append(
                "http://www.88mmw.com" +
                str(image_list_selector.eq(image_index).attr("src")).replace(
                    "-lp", ""))
        # check whether this is the last page
        is_over = False
        max_page_selector = PQ(photo_pagination_html).find("div.page").eq(
            0).find("span strong").text()
        if not max_page_selector:
            is_over = True
        elif crawler.is_integer(max_page_selector):
            is_over = page_count >= int(max_page_selector)
        if is_over:
            break
        else:
            page_count += 1
    return result
Example #27
def get_account_talks(account_id, account_name, talk_list):
    account_index = "https://7gogo.jp/users/%s" % account_id
    account_index_response = net.http_request(account_index, method="GET")
    if account_index_response.status != net.HTTP_RETURN_CODE_SUCCEED:
        raise crawler.CrawlerException(
            crawler.request_failre(account_index_response.status))
    talk_list_selector = PQ(account_index_response.data.decode("UTF-8")).find(
        ".UserTalkWrapper .UserTalk")
    for talk_index in range(0, talk_list_selector.size()):
        talk_selector = talk_list_selector.eq(talk_index)
        # get the talk URL
        talk_url_path = talk_selector.attr("href")
        if not talk_url_path:
            raise crawler.CrawlerException("talk信息截取talk地址失败\n%s" %
                                           talk_selector.html().encode("UTF-8"))
        talk_id = str(talk_url_path.replace("/", ""))
        if not talk_id:
            raise crawler.CrawlerException("talk地址截取talk id失败\n%s" %
                                           talk_url_path)
        # get the talk name
        talk_name = talk_selector.find(".UserTalk__talkname").text()
        if not talk_name:
            raise crawler.CrawlerException("talk信息截取talk名字失败\n%s" %
                                           talk_selector.html().encode("UTF-8"))
        talk_name = crawler.filter_emoji(
            str(talk_name.encode("UTF-8")).strip())
        # get the talk description
        talk_description = crawler.filter_emoji(
            talk_selector.find(".UserTalk__description").text())
        if talk_description:
            talk_description = crawler.filter_emoji(
                str(talk_description.encode("UTF-8")).strip())
        else:
            talk_description = ""
        if talk_id in talk_list:
            talk_list[talk_id]["account_list"].append(account_name)
        else:
            talk_list[talk_id] = {
                "account_list": [account_name],
                "talk_name": talk_name,
                "talk_description": talk_description,
            }
        output.print_msg(account_id + ": " + talk_name + ", " +
                         talk_description)
Example #28
    def searchStatus(self, keyword, max_length=20):
        url = 'http://browse.renren.com/s/status?offset=0&sort=1&range=0&q=%s&l=%d' % (keyword, max_length)
        r = self.session.get(url, timeout=5)
        status_elements = PyQuery(r.text)('.list_status .status_content')
        id_pattern  = re.compile("forwardDoing\('(\d+)','(\d+)'\)")
        results = []
        for index, _ in enumerate(status_elements):
            status_element = status_elements.eq(index)

            # skip reposts
            if status_element('.status_root_msg'):
                continue

            status_element = status_element('.status_content_footer')
            status_time = status_element('span').text()
            m = id_pattern.search(status_element('.share_status').attr('onclick'))
            status_id, user_id = m.groups()
            results.append( (int(user_id), int(status_id), status_time) )
        return results
Example #29
    def search_status(self, keyword, max_length=20):
        url = 'http://browse.renren.com/s/status?offset=0&sort=1&range=0&q=%s&l=%d' % (keyword, max_length)
        r = self.session.get(url, timeout=5)
        status_elements = PyQuery(r.text)('.list_status .status_content')
        id_pattern  = re.compile("forwardDoing\('(\d+)','(\d+)'\)")
        results = []
        for index, _ in enumerate(status_elements):
            status_element = status_elements.eq(index)

            # skip reposts
            if status_element('.status_root_msg'):
                continue

            status_element = status_element('.status_content_footer')
            status_time = status_element('span').text()
            m = id_pattern.search(status_element('.share_status').attr('onclick'))
            status_id, user_id = m.groups()
            results.append( (int(user_id), int(status_id), status_time) )
        return results
Example #30
def qichacha_search_result(j: PyQuery) -> dict:
    j = j.children()
    td_row = j.eq(2)
    company_name = td_row.children('a').text()
    p_first = td_row.children('p').eq(0)
    legal_representative = p_first.children('a').text()
    span_m_l = p_first("span:first").text().split(':')
    registered_capital = span_m_l[-1].strip('-')
    span_m_ls = p_first('span:last').text().split(':')
    date_of_establishment = span_m_ls[-1]
    p_two = td_row('p').eq(-3)
    p_obj = p_two.clone()
    p_obj.children().remove()
    email = p_obj.text().split(':')[-1].strip('-')
    phone = p_two.find('span').text().split(':')[-1].strip(' ').strip('-')
    register_address = td_row.find('p').eq(2).text().split(':')[-1]
    return dict(company_name=company_name,
                legal_representative=legal_representative,
                registered_capital=registered_capital,
                date_of_establishment=date_of_establishment,
                email=email,
                phone=phone,
                register_address=register_address)
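A hedged driving loop for qichacha_search_result. The page-level selector and the html argument are assumptions made for illustration, not Qichacha's actual markup; the parser only needs each result row handed over as a PyQuery node with at least three child cells.

from pyquery import PyQuery

def parse_search_page(html: str) -> list:
    results = []
    rows = PyQuery(html)('table tr')  # selector is a guess at the result table
    for i in range(rows.length):
        row = rows.eq(i)
        if row.children().length < 3:  # skip header or malformed rows
            continue
        results.append(qichacha_search_result(row))
    return results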
Example #31
def get_account_index_page(account_name):
    account_index_url = "http://%s.pp.163.com/" % account_name
    account_index_response = net.http_request(account_index_url, method="GET")
    result = {
        "album_url_list": [],  # 全部相册地址
    }
    if account_index_response.status != net.HTTP_RETURN_CODE_SUCCEED:
        raise crawler.CrawlerException(
            crawler.request_failre(account_index_response.status))
    # page encoding
    account_index_html = account_index_response.data.decode("GBK").encode(
        "UTF-8")
    if account_index_html.find("<title>该页面不存在</title>") >= 0:
        raise crawler.CrawlerException("账号不存在")
    # get all album URLs
    album_result_selector = PQ(account_index_html).find("#p_contents li")
    if album_result_selector.size() == 0:
        raise crawler.CrawlerException("页面匹配相册列表失败\n%s" % account_index_html)
    for album_index in range(0, album_result_selector.size()):
        result["album_url_list"].append(
            str(
                album_result_selector.eq(album_index).find("a.detail").attr(
                    "href")))
    return result
Example #32
def extract(dom, param_dict):
    res = []
    # dom head
    head_list = str(param_dict['dom_head']).strip().split(',')
    d_divs = dom(head_list[0])

    if len(head_list) > 1:
        for pos in range(1, len(head_list)):
            try:
                value = int(head_list[pos])
                d_divs = d_divs.eq(value)
            except:
                d_divs = d_divs.children(head_list[pos])

    for div in d_divs:
        d_div = PyQuery(div)
        if param_dict['sandwich'] != 'None':
            sandwich_list = str(param_dict['sandwich']).strip().split(',')
            for sandwich in sandwich_list:
                try:
                    positon = int(sandwich)
                    d_div = d_div.eq(positon)
                except:
                    d_div = d_div.children(sandwich)

        header = str(param_dict['title']).strip().split(',')[0]
        if not d_div.children(header):
            continue

        # get the URL info
        url_list = str(param_dict['url']).strip().split(',')
        url = d_div.children(url_list[0])
        for pos in range(1, len(url_list)):
            try:
                n_url = int(url_list[pos])
                url = url.eq(n_url)
            except:
                if url_list[pos] == 'href':
                    url = url.attr('href')
                    break
                else:
                    url = url.children(url_list[pos])

        # join url
        if 'www' not in url and 'http' not in url:
            match = re.search('^/', url)
            if match:
                url = param_dict['domain'] + url
            else:
                url = param_dict['domain'] + '/' + url

        if 'http://' not in url:
            url = 'http://' + url

        # get the title info
        title_list = str(param_dict['title']).strip().split(',')
        title = d_div
        for item in title_list:
            try:
                n_title = int(item)
                title = title.eq(n_title)
            except:
                title = title.children(item)

        title = title.text()
        date_list = str(param_dict['date']).strip().split(',')
        date = d_div
        is_attr = False
        for item in date_list:
            try:
                n_item = int(item)
                date = date.eq(n_item)
            except:
                if 'attr' not in item:
                    date = date.children(item)
                else:
                    item = item[:item.find(':')]
                    date = date.attr(item)[:20].strip()
                    is_attr = True

        date = date if is_attr else date.text()
        if ' / ' in date: date = date.replace(' / ', '-')
        if '/' in date: date = date.replace('/', '-')

        if re.search(u'\d{4}-\d{1,2}-\d{1,2}', date):
            date = ''.join(x for x in date if ord(x) < 256).strip()
            start_index = date.rfind('201')  # position of the last occurrence of '201'
            end_index1 = date.rfind('-')
            end_index2 = date.rfind(':')
            end_index = end_index1 if end_index1 > end_index2 else end_index2
            date = date[start_index:end_index + 3]
            if len(date) == 10:
                date = '%s %s' % (
                    date, time.strftime("%H:%M", time.localtime(time.time())))
        elif re.search(u'\d{1,2}-\d{1,2}-\d{4} \d{1,2}:\d{1,2}:\d{1,2}', date):
            arr_time = date.split(' ')
            arr_date = arr_time[0].split('-')
            date = '%s-%s-%s %s' % (arr_date[2], arr_date[0], arr_date[1],
                                    arr_time[1])
        else:
            try:
                # convert the timestamp to a date
                date_stamp = int(date)
                if date_stamp > 9999999999:
                    date_stamp = int(date[:10])

                x = time.localtime(date_stamp)
                date = time.strftime('%Y-%m-%d %H:%M', x)
            except:
                date = fomate_date_output(date)

        date = format_date_time(date)
        if len(date) == 16:
            if cmp(date, str_today) >= 0 and cmp(
                    date, end_today) <= 0 and len(title) > 0:
                res.append([date, url, title])

    return res
Example #33
#!/usr/bin/env python

import requests
from pyquery import PyQuery

USER_AGENT = 'Mozilla/5.0'
CG_URL = 'http://www.cordobaguias.com.ar/cotizacion-dolar-en-cordoba.html'
PDB_URL = 'http://www.preciodolarblue.com.ar'
AMBITO_URL = 'http://www.ambito.com/economia/mercados/monedas/dolar/info/?ric=ARSB=C'

headers = {'User-Agent': USER_AGENT}

cg = PyQuery(requests.get(CG_URL, headers=headers).content).find('.cuadroPrecioD').text().replace(' pesos', '')
pdb_tds = PyQuery(requests.get(PDB_URL, headers=headers).content).find('td')
pdb = (pdb_tds.eq(3).text(), pdb_tds.eq(4).text())
ambito = PyQuery(requests.get(AMBITO_URL, headers=headers).content)

print 'cordobaguias | preciodolarblue'
print '-' * 30
print '%s | %s' % (cg, ' / '.join(pdb))
print 'Cueva (Ambito): %.2f | %.2f' % (float(ambito.find('#compra>big').text().replace(',', '.')),
                                   float(ambito.find("#venta>big").text().replace(',', '.')))

Example #34
def get_one_page_audio(account_id, page_count):
    # http://www.ximalaya.com/1014267/index_tracks?page=2
    audit_pagination_url = "http://www.ximalaya.com/%s/index_tracks" % account_id
    query_data = {"page": page_count}
    audit_pagination_response = net.http_request(audit_pagination_url,
                                                 method="GET",
                                                 fields=query_data,
                                                 json_decode=True)
    result = {
        "audio_info_list": [],  # 页面解析出的歌曲信息列表
        "is_over": False,  # 是不是最后一页
    }
    if audit_pagination_response.status == 404:
        raise crawler.CrawlerException("账号不存在")
    elif audit_pagination_response.status != net.HTTP_RETURN_CODE_SUCCEED:
        raise crawler.CrawlerException(
            crawler.request_failre(audit_pagination_response.status))
    if not crawler.check_sub_key(
        ("res", "html"), audit_pagination_response.json_data):
        raise crawler.CrawlerException("返回数据'res'或'html'字段不存在\n%s" %
                                       audit_pagination_response.json_data)
    if audit_pagination_response.json_data["res"] is not True:
        raise crawler.CrawlerException("返回数据'res'字段取值不正确\n%s" %
                                       audit_pagination_response.json_data)
    # get the audio info
    audio_list_selector = PQ(audit_pagination_response.json_data["html"]).find(
        "ul.body_list li.item")
    for audio_index in range(0, audio_list_selector.size()):
        audio_info = {
            "audio_id": None,  # 页面解析出的歌曲id
            "audio_title": "",  # 页面解析出的歌曲标题
        }
        audio_selector = audio_list_selector.eq(audio_index)
        # get the audio id
        audio_id = audio_selector.find(".content_wrap").attr("sound_id")
        if not crawler.is_integer(audio_id):
            raise crawler.CrawlerException(
                "歌曲信息匹配歌曲id失败\n%s" %
                audio_list_selector.html().encode("UTF-8"))
        audio_info["audio_id"] = str(audio_id)
        # get the audio title
        audio_title = audio_selector.find(".sound_title").attr("title")
        if not audio_title:
            raise crawler.CrawlerException(
                "歌曲信息匹配歌曲标题失败\n%s" %
                audio_list_selector.html().encode("UTF-8"))
        audio_info["audio_title"] = str(audio_title.encode("UTF-8").strip())
        result["audio_info_list"].append(audio_info)
    # check whether this is the last page
    max_page_count = 1
    pagination_list_selector = PQ(
        audit_pagination_response.json_data["html"]).find(
            ".pagingBar_wrapper a.pagingBar_page")
    for pagination_index in range(0, pagination_list_selector.size()):
        pagination_selector = pagination_list_selector.eq(pagination_index)
        data_page = pagination_selector.attr("data-page")
        if data_page is None:
            continue
        if not crawler.is_integer(data_page):
            raise crawler.CrawlerException(
                "分页信息匹配失败\n%s" % audio_list_selector.html().encode("UTF-8"))
        max_page_count = max(max_page_count, int(data_page))
    result["is_over"] = page_count >= max_page_count
    return result
Example #35
#!/usr/bin/env python

import requests
from pyquery import PyQuery

USER_AGENT = 'Mozilla/5.0'
CG_URL = 'http://www.cordobaguias.com.ar/cotizacion-dolar-en-cordoba.html'
PDB_URL = 'http://www.preciodolarblue.com.ar'
AMBITO_URL = 'http://www.ambito.com/economia/mercados/monedas/dolar/info/?ric=ARSB=C'

headers = {'User-Agent': USER_AGENT}

cg = PyQuery(requests.get(
    CG_URL, headers=headers).content).find('.cuadroPrecioD').text().replace(
        ' pesos', '')
pdb_tds = PyQuery(requests.get(PDB_URL, headers=headers).content).find('td')
pdb = (pdb_tds.eq(3).text(), pdb_tds.eq(4).text())
ambito = PyQuery(requests.get(AMBITO_URL, headers=headers).content)

print 'cordobaguias | preciodolarblue'
print '-' * 30
print '%s | %s' % (cg, ' / '.join(pdb))
print 'Cueva (Ambito): %.2f | %.2f' % (float(
    ambito.find('#compra>big').text().replace(
        ',', '.')), float(ambito.find("#venta>big").text().replace(',', '.')))
Example #36
def get_one_page_favorite(page_count):
    # http://www.weibo.com/fav?page=1
    favorite_pagination_url = "http://www.weibo.com/fav"
    query_data = {"page": page_count}
    cookies_list = {"SUB": COOKIE_INFO["SUB"]}
    favorite_pagination_response = net.http_request(favorite_pagination_url,
                                                    method="GET",
                                                    fields=query_data,
                                                    cookies_list=cookies_list)
    result = {
        "blog_info_list": [],  # 所有微博信息
        "is_error": False,  # 是不是不符合格式
        "is_over": False,  # 是不是最后一页收藏
    }
    if favorite_pagination_response.status != net.HTTP_RETURN_CODE_SUCCEED:
        raise crawler.CrawlerException(
            crawler.request_failre(favorite_pagination_response.status))
    favorite_data_html = tool.find_sub_string(
        favorite_pagination_response.data,
        '"ns":"pl.content.favoriteFeed.index"', '"})</script>', 2)
    favorite_data_html = tool.find_sub_string(favorite_data_html, '"html":"',
                                              '"})')
    if not favorite_data_html:
        raise crawler.CrawlerException("页面截取收藏信息失败\n%s" % favorite_data_html)
    # unescape escaped backslashes and drop the useless escape sequences (\n, \r, \t)
    html_data = favorite_data_html.replace("\\\\", chr(1))
    for replace_string in ["\\n", "\\r", "\\t", "\\"]:
        html_data = html_data.replace(replace_string, "")
    html_data = html_data.replace(chr(1), "\\")
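    # the chr(1) sentinel protects escaped backslashes ("\\" in the raw data)
    # while the single-character escapes are stripped, e.g. the raw fragment
    # <div class=\"WB_feed\">\n unescapes to <div class="WB_feed">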
    # parse the page
    children_selector = PQ(
        html_data.decode("UTF-8")).find('div.WB_feed').children()
    if children_selector.length == 0:
        raise crawler.CrawlerException("匹配收藏信息失败\n%s" % favorite_data_html)
    if children_selector.length == 1:
        raise crawler.CrawlerException("没有收藏了")
    # parse blog ids and image URLs
    for i in range(0, children_selector.length - 1):
        feed_selector = children_selector.eq(i)
        # skip blogs that have been deleted
        if not feed_selector.has_class("WB_feed_type"):
            continue
        result_blog_info = {
            "blog_id": None,  # 日志id(mid)
            "image_url_list": [],  # 所有图片地址
        }
        # parse the blog id
        blog_id = feed_selector.attr("mid")
        if not crawler.is_integer(blog_id):
            raise crawler.CrawlerException(
                "收藏信息解析微博id失败\n%s" % feed_selector.html().encode("UTF-8"))
        result_blog_info["blog_id"] = str(blog_id)
        # WB_text: the blog text
        # WB_media_wrap: the blog media (images)
        # .WB_feed_expand .WB_expand: a reposted blog, which contains the same WB_text / WB_media_wrap structures
        # if the feed contains a reposted blog, take the media from the repost
        if feed_selector.find(".WB_feed_expand .WB_expand").length == 0:
            media_selector = feed_selector.find(".WB_media_wrap")
        else:
            media_selector = feed_selector.find(
                ".WB_feed_expand .WB_expand .WB_media_wrap")
        # if media is present
        if media_selector.length == 1:
            thumb_image_url_list = re.findall('<img src="([^"]*)"/>',
                                              media_selector.html())
            if len(thumb_image_url_list) > 0:
                image_url_list = []
                for image_url in thumb_image_url_list:
                    temp_list = image_url.split("/")
                    temp_list[3] = "large"
                    image_url_list.append("http:" + str("/".join(temp_list)))
                result_blog_info["image_url_list"] = image_url_list
        if len(result_blog_info["image_url_list"]) > 0:
            result["blog_info_list"].append(result_blog_info)
    # the last feed element is the pagination info
    page_selector = children_selector.eq(children_selector.length - 1)
    # check whether this is the last page; the pagination links are labelled
    # 第N页 ("page N") and the largest N is the total page count
    page_count_find = re.findall("第(\d*)页",
                                 page_selector.html().encode("UTF-8"))
    if len(page_count_find) > 0:
        page_count_find = map(int, page_count_find)
        result["is_over"] = page_count >= max(page_count_find)
    else:
        result["is_over"] = True
    return result
Example #37
    def get_chattel_mortgage_info_detail(self, onclick, detail_list):

        result = dict()

        if onclick is None or onclick.strip() == '':
            return result

        temp_list = onclick.split(u'\'')
        if temp_list is None or len(temp_list) < 2:
            return result

        temp_list = temp_list[1].split(u'\'')
        if temp_list is None or len(temp_list) <= 0:
            return result

        morreg_id = temp_list[0]
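        # morreg_id is the first quoted argument of the onclick handler, e.g. an
        # illustrative onclick="showDetail('10001', ...)" yields morreg_id '10001'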

        # iterate over all detail pages
        for detail in detail_list:
            url = detail.get('url')
            if not isinstance(url, basestring):
                continue

            if morreg_id not in url:
                continue

            text = detail.get('text')
            if not isinstance(text, basestring) or text.strip() == u'':
                continue

            table_list = PyQuery(text, parser='html').find('.detailsList')
            if table_list is None or table_list.length < 5:
                raise FieldMissError
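            # the five .detailsList tables are, in order: registration info,
            # mortgagee overview, secured-claim overview, collateral overview
            # and change records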

            # chattel mortgage registration info
            td_list = table_list.eq(0).find('td')
            cm_dict = dict()
            result[GsModel.ChattelMortgageInfo.ChattelDetail.
                   CHATTEL_MORTGAGE] = cm_dict
            cm_dict[GsModel.ChattelMortgageInfo.ChattelDetail.ChattelMortgage.
                    REGISTER_NUM] = td_list.eq(0).text()
            cm_dict[GsModel.ChattelMortgageInfo.ChattelDetail.ChattelMortgage.
                    REGISTER_DATE] = td_list.eq(1).text()
            cm_dict[GsModel.ChattelMortgageInfo.ChattelDetail.ChattelMortgage.
                    REGISTER_OFFICE] = td_list.eq(2).text()

            # mortgagee overview info
            tr_list = table_list.eq(1).find('tr').items()
            mps_list = list()
            result[GsModel.ChattelMortgageInfo.ChattelDetail.
                   MORTGAGE_PERSON_STATUS] = mps_list
            for tr in tr_list:
                td_list = tr.find('td')
                if td_list is None or td_list.length < 5:
                    continue

                item = dict()
                item[GsModel.ChattelMortgageInfo.ChattelDetail.
                     MortgagePersonStatus.MORTGAGE_PERSON_NAME] = td_list.eq(
                         1).text()
                item[GsModel.ChattelMortgageInfo.ChattelDetail.
                     MortgagePersonStatus.CERTIFICATE_TYPE] = td_list.eq(
                         2).text()
                item[GsModel.ChattelMortgageInfo.ChattelDetail.
                     MortgagePersonStatus.CERTIFICATE_NUM] = td_list.eq(
                         3).text()
                item[GsModel.ChattelMortgageInfo.ChattelDetail.
                     MortgagePersonStatus.ADDRESS] = td_list.eq(4).text()
                mps_list.append(item)

            # secured-claim overview info
            td_list = table_list.eq(2).find('td')
            gps_dict = dict()
            result[GsModel.ChattelMortgageInfo.ChattelDetail.
                   GUARANTEED_PERSON_STATUS] = gps_dict
            gps_dict[GsModel.ChattelMortgageInfo.ChattelDetail.
                     GuaranteedPersonStatus.KIND] = td_list.eq(0).text()
            gps_dict[GsModel.ChattelMortgageInfo.ChattelDetail.
                     GuaranteedPersonStatus.AMOUNT] = td_list.eq(1).text()
            gps_dict[GsModel.ChattelMortgageInfo.ChattelDetail.
                     GuaranteedPersonStatus.SCOPE] = td_list.eq(2).text()
            gps_dict[GsModel.ChattelMortgageInfo.ChattelDetail.
                     GuaranteedPersonStatus.PERIOD] = td_list.eq(3).text()
            gps_dict[GsModel.ChattelMortgageInfo.ChattelDetail.
                     GuaranteedPersonStatus.REMARK] = td_list.eq(4).text()

            # collateral overview info
            tr_list = table_list.eq(3).find('tr').items()
            gs_list = list()
            result[GsModel.ChattelMortgageInfo.ChattelDetail.
                   GUARANTEE_STATUS] = gs_list
            for tr in tr_list:
                td_list = tr.find('td')
                if td_list is None or td_list.length < 5:
                    continue

                item = dict()
                item[GsModel.ChattelMortgageInfo.ChattelDetail.GuaranteeStatus.
                     NAME] = td_list.eq(1).text()
                item[GsModel.ChattelMortgageInfo.ChattelDetail.GuaranteeStatus.
                     AFFILIATION] = td_list.eq(2).text()
                item[GsModel.ChattelMortgageInfo.ChattelDetail.GuaranteeStatus.
                     SITUATION] = td_list.eq(3).text()
                item[GsModel.ChattelMortgageInfo.ChattelDetail.GuaranteeStatus.
                     REMARK] = td_list.eq(4).text()
                gs_list.append(item)

            # change info
            tr_list = table_list.eq(4).find('tr').items()
            change_list = list()
            result[GsModel.ChattelMortgageInfo.ChattelDetail.
                   CHANGE_INFO] = change_list

            for tr in tr_list:
                td_list = tr.find('td')
                if td_list is None or td_list.length < 3:
                    continue

                item = dict()
                item[GsModel.ChattelMortgageInfo.ChattelDetail.ChangeInfo.
                     CHANGE_DATE] = td_list.eq(1).text()
                item[GsModel.ChattelMortgageInfo.ChattelDetail.ChangeInfo.
                     CHANGE_CONTENT] = td_list.eq(2).text()
                change_list.append(item)

            break

        return result
Example #38
    batch_mode = True

# SMTP credentials are read from the user's config file
cfg = configparser.RawConfigParser()
cfg.read(os.path.expanduser('~/secured/myukrsib.cfg'))
smtp_host = cfg.get('default', 'smtp_host')
smtp_user = cfg.get('default', 'smtp_user')
smtp_secret = cfg.get('default', 'smtp_secret')



#fname = sys.argv[len(sys.argv)-1]
#f = open(fname,'rb')
q = PyQuery(sys.stdin.read())
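# the exported account page is read from stdin; the balance figures live in
# the tables inside form#cardAccountInfoForm, and each figure is pulled out of
# the cell text by splitting on ':'. The first table carries the available/own
# amounts, the third one overdraft, replenishment, own amount and withdrawal.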

tbls = PyQuery(q('form#cardAccountInfoForm').children('table'))
t = tbls.eq(0)('td').eq(1).text().split(':')
available_amount = t[2]
global_own_amount = t[1].split()[0]
t = tbls.eq(2)('td').eq(0).text().split(':')
overdraft = t[1]
t = tbls.eq(2)('td').eq(1).text().split(':')
replenishment = t[1]
t = tbls.eq(2)('td').eq(4).text().split(':')
own_amount = t[1]
t = tbls.eq(2)('td').eq(5).text().split(':')
withdrawal = t[1]

account_ops = []
card_ops = []
holds = []