def get_provider_info(content, element_num):
    """Count suppliers and "gold product" markers over all supplier pages.

    ``content`` is the HTML of the first supplier page (already downloaded
    by the caller).  The number of ``<font>`` elements found among the
    siblings following ``#ContentPlaceHolder1_ProductSupplier`` is used as
    the page count; pages ``1 .. page_count - 1`` are fetched here through
    ``get_provider_page_uri`` and ``get_content``.

    Returns a two-item list ``[provider_count, golden_product_count]``.
    """

    def tally(doc):
        # One ``table.ProdGN_4`` per supplier; gold products are counted as
        # occurrences of the marker text inside those tables' <font> tags.
        tables = doc('table.ProdGN_4')
        badge_text = pQuery(tables('font')).text()
        golden = badge_text.count(u'黄金产品')
        return tables.length, golden

    first_page = pQuery(content)
    pager_fonts = first_page('#ContentPlaceHolder1_ProductSupplier').nextAll()('font')
    page_count = pager_fonts.length

    provider_count, golden_count = tally(first_page)

    # range() is empty when there is at most one page, so no extra guard
    # is needed around the loop.
    for page_index in range(1, page_count):
        page_url = get_provider_page_uri(element_num, page_index)
        page_html = get_content(page_url, running_state)
        found_providers, found_golden = tally(pQuery(page_html))
        golden_count += found_golden
        provider_count += found_providers

    return [provider_count, golden_count]
예제 #2
0
 def itemGetResource(self, itemQuey):
     """Collect the image sources of the work page linked from ``itemQuey``.

     Follows the ``href`` of the item's ``._work`` element (prefixed with
     the pixiv host), then appends the ``data-src`` attribute of every
     ``.item-container .image`` element to ``self.itemData['resource']``.
     """
     workHref = itemQuey.find('._work').attr('href')
     workPage = self.sphinx.getQuey('http://www.pixiv.net/' + workHref)
     for node in workPage.find('.item-container .image'):
         self.itemData['resource'].append(pQuery(node).attr('data-src'))
예제 #3
0
def get_resource_id(course):
    """Return ``(name, value)`` pairs for the resource inputs of each course.

    ``course`` is an iterable of ``(name, url)`` pairs.  For each pair the
    page at ``base_url + url`` is downloaded and the ``value`` attribute of
    every element matching ``#vlink_1 ul li input`` is recorded together
    with the course name.
    """
    collected = []
    for name, url in course:
        logging.info('process {0}, {1}'.format(name, url))
        page = requests.get(base_url + url).content
        inputs = pQuery(page)('#vlink_1 ul li input')
        collected.extend((name, node.attrib['value']) for node in inputs)
    return collected
예제 #4
0
def find_page_resource(pattern, page):
    """Return ``(alt, href)`` pairs for course links whose ``alt`` matches.

    ``pattern`` is a compiled regular expression (anything exposing
    ``search``); ``page`` is the HTML to scan.  Only anchors matching
    ``li.gxbox ul li.l1 a`` are considered.
    """
    matches = []
    for anchor in pQuery(page)('li.gxbox ul li.l1 a'):
        title = anchor.attrib['alt']
        if not pattern.search(title):
            continue
        logging.info('find course {0}'.format(title))
        matches.append((title, anchor.attrib['href']))
    return matches
예제 #5
0
    def pieces(self):
        """Return a generator of ``Piece`` objects for the works on ``self.url``.

        The listing page is loaded immediately and the work links are
        de-duplicated up front; the ``Piece`` objects themselves are created
        lazily as the returned generator is consumed.
        """
        listing = self.sphinx.getQuey(self.url)

        # Absolute URLs of every distinct work link on the page.
        uniqueUrls = set(
            self.info['host'] + pQuery(node).attr('href')
            for node in listing.find('._image-items .image-item .work')
        )

        def buildPieces(urls):
            for url in urls:
                piece = Piece()
                piece.info = self.info
                piece.url = url
                yield piece

        return buildPieces(uniqueUrls)
예제 #6
0
파일: dict.py 프로젝트: jiahut/tools
def lookup(*words):
    """Look up ``words`` on dict.baidu.com and print the first simple meaning.

    The words are joined with single spaces to form the query.  The text of
    the ``<p>`` elements inside the first ``#en-simple-means>div`` block is
    printed; nothing is returned.
    """
    logger.info(words)
    query = ' '.join(words)
    # NOTE(review): the query is inserted into the URL without
    # percent-encoding -- confirm the opener used by pQuery copes with
    # spaces and non-ASCII input.
    url = "http://dict.baidu.com/s?wd=%(word)s" % {"word": query}
    logger.info(url)
    page = pQuery(url=url)
    first_block = page('#en-simple-means>div').eq(0)
    explain = first_block.find('p').text()
    print(explain)
예제 #7
0
def auto_checknovelupdate():
    """Check the tracked novel page for an update and send an SMS if needed.

    Downloads the book page, extracts the title, the last-update time and
    the latest chapter, and hands them to ``update_db_row``.  When that call
    reports a change, an SMS text is built, logged and sent; otherwise only
    a "not sending" line is logged.

    Returns:
        The SMS text that was sent, or an empty string when nothing was sent.

    Raises:
        ValueError: if the update-time marker is missing from the page, or
            the text after it does not match ``%Y-%m-%d %H:%M``.
    """
    import contextlib  # local import: only needed to close the response

    url = "http://www.biquge.la/book/14/"

    # Fetch the page.  The urllib2 response is not a context manager, so
    # wrap it in closing() to release the connection deterministically.
    with contextlib.closing(urllib2.urlopen(url)) as page:
        raw = page.read()
    # Decode the body (the code treats the page as GBK-encoded).
    text = unicode(raw, "gbk")

    # Wrap the markup in a jQuery-like object.
    jQuery = pQuery(text)

    # Pull the page elements of interest.
    title = jQuery("#info>h1").html()
    updatetime = jQuery("#info>p:eq(2)").text()
    chapter = jQuery("#info>p:eq(3)>a").text()

    # Extract the timestamp that follows the "last updated" marker.
    strTime = u"最后更新:"
    marker_pos = updatetime.find(strTime)
    if marker_pos < 0:
        # Without this check find() == -1 would make the slice below start
        # at an arbitrary offset and fail later with a confusing message.
        raise ValueError(
            "update-time marker not found in page text: %r" % (updatetime,))
    updatetime = updatetime[marker_pos + len(strTime):]

    # Parse the timestamp and normalise it to include seconds.
    t = datetime.datetime.strptime(updatetime, "%Y-%m-%d %H:%M")
    updatetime = t.strftime('%Y-%m-%d %H:%M:%S')

    # Update the database; a truthy result triggers the SMS.
    bUpdate = update_db_row(title, updatetime, chapter)

    smstext = ""
    if bUpdate:
        # Assemble the SMS body.
        smstext = u"【赢创天下科技】[%s]更新了,最后更新:%s,最新章节:%s" % (title, updatetime,
                                                        chapter)
        write_log(LOG_FILE_NAME, smstext)
        send_smd(smstext)
    else:
        logtext = "--------不发送短信--------"
        write_log(LOG_FILE_NAME, logtext)

    return smstext
예제 #8
0
    def analyzer(self):
        """Yield one generator of page objects per batch of listing pages.

        Starting from page 1 of ``self.url``, each iteration loads a listing
        page, collects the links shown in the first pager
        (``.column-order-menu:eq(0) .page-list``), sorts them by their ``p``
        query value and yields a generator that produces one
        ``self.__class__`` instance per link.

        While the pager shows at least 8 links, the next iteration loads the
        page 5 past the pager's last link; otherwise iteration stops.
        """
        pageNext = True
        pageUrl = self._buildUrl(self.url, {
            'p': 1
        })

        def pageListGenerator(pageSet):
            # Lazily build one page object per URL, sharing this object's info.
            for url in pageSet:
                pageObj = self.__class__(url)
                pageObj.info = self.info
                pageObj.url = url
                yield pageObj

        def pageSort(pageLink):
            # Key on the link's OWN page number.  The previous version read
            # the enclosing ``pageUrl`` here, which gave every link the same
            # key and left the batch unsorted.
            # NOTE(review): assumes every collected link carries a numeric
            # ``p`` query parameter -- confirm against the site's pager.
            return int(self._queryUrl(pageLink)['query']['p'])

        while pageNext:
            pageSet = set()
            listQuery = self.sphinx.getQuey(pageUrl)
            pageUrlData = self._queryUrl(pageUrl)
            currentPage = listQuery.find('.column-order-menu:eq(0) .page-list .current').html()
            # Include the page just loaded when the pager marks it as
            # current, or when the pager shows no current marker at all.
            if ((pageUrlData['query']['p'] == currentPage) or currentPage is None):
                pageSet.add(pageUrl)

            pageList = listQuery.find('.column-order-menu:eq(0) .page-list')
            pageListPages = pageList.find('a')
            for pageItem in pageListPages:
                pageSet.add(self.info['baseUrl'] + pQuery(pageItem).attr('href'))

            yield pageListGenerator(sorted(pageSet, key=pageSort))

            if len(pageListPages) >= 8:
                lastPage = self.info['baseUrl'] + listQuery.find(
                        '.column-order-menu:eq(0) .page-list li:last a').attr(
                        'href')
                pageUrl = self._buildUrl(lastPage, {
                    'p': int(self._queryUrl(lastPage)['query']['p']) + 5
                })
            else:
                pageNext = False
def get_main_info(content):
    """Write one worksheet row per chemical listed in ``content``.

    Every ``<tr>`` with at least three ``<td>`` cells is treated as a data
    row.  Its first three links supply the chemical name, Chinese name and
    CAS number; the first run of digits in the first link's ``href`` is the
    element number used to fetch supplier statistics through
    ``get_provider_info``.  Rows are written to ``ws`` starting at row 1,
    in columns 0-5: CAS, Chinese name, chemical name, ``<span>`` text,
    provider count, golden-product count.
    """
    # Parse the markup with PyQuery.
    document = pQuery(content)

    written_rows = 0
    for row_node in document('tr'):
        cells = pQuery(row_node)('td')
        if len(cells) < 3:
            continue
        written_rows += 1

        # Per-row element information.
        links = cells('a')
        chemical_name = links.eq(0)
        chinese_name = links.eq(1)
        cas = links.eq(2)
        href = str(pQuery(chemical_name).attr['href'])
        # NOTE(review): an href without digits raises IndexError below --
        # confirm every data row links to a numbered element page.
        element_id = re.findall(r'\d+', href)[0]

        # Supplier information (page 0 is fetched here, the rest inside
        # get_provider_info).
        provider_url = get_provider_page_uri(element_id, 0)
        provider_content = get_content(provider_url, running_state)
        provider_info = get_provider_info(provider_content, element_id)

        print('Line Count: ' + str(written_rows))
        ws.write(written_rows, 0, pQuery(cas).text())
        ws.write(written_rows, 1, pQuery(chinese_name).text())
        ws.write(written_rows, 2, pQuery(chemical_name).text())
        ws.write(written_rows, 3, pQuery(cells('span')).text())
        ws.write(written_rows, 4, int(provider_info[0]))
        ws.write(written_rows, 5, int(provider_info[1]))
예제 #10
0
 def getQuey(self, url):
     """Fetch ``url`` with ``self.get`` and return the result wrapped by ``pQuery``."""
     return pQuery(self.get(url))