예제 #1
0
def get_page_properties(html,
                        need_html=False,
                        need_data=False,
                        properties=None,
                        **kwargs):
    """Yield ``(key, value)`` pairs read from the "details" macro tables in *html*.

    Every ``<th>`` matched by the details-table selector supplies a key (its
    stripped text); the value is taken from the element(s) returned by
    ``.next()`` on that header cell.

    Args:
        html: Markup handed to ``PyQuery``.
        need_html: When true (and ``need_data`` is false) the value is the
            inner HTML of the following element.
        need_data: When true, or when ``need_html`` is false, the value is
            produced by ``extract_data`` instead.
        properties: Optional container of keys to keep; ``None`` keeps all.
        **kwargs: Accepted but not used by this function.

    Yields:
        Tuples of ``(key, value)`` in document order.
    """
    document = PyQuery(html)
    # Script elements are removed from the document before querying.
    document('script').remove()
    header_cells = document(
        "div[data-macro-name=details] > div.table-wrap > table > tbody > tr > th"
    )
    raw_html_wanted = need_html and not need_data

    for node in header_cells:
        header = PyQuery(node)
        name = header.text().strip()

        if properties is not None and name not in properties:
            continue

        value_cell = header.next()
        if raw_html_wanted:
            value = value_cell.html()
        else:
            # NOTE(review): need_data is always passed as False here,
            # regardless of the caller's argument - confirm this is intended.
            value = extract_data(value_cell, need_data=False)
        yield (name, value)
예제 #2
0
def parsea_data(raw):
    """Parse the listing cards in *raw* HTML and append one dict per card.

    Each card (``div.mb4.w-100.w-25-l.w-50-m``) is turned into a dict with
    the keys ``title``, ``image``, ``group``, ``user`` (``name``/``link``),
    ``type``, ``info`` and ``price``, which is appended to ``list``.

    NOTE(review): ``list`` is not bound inside this function, so it is
    resolved from the enclosing scope. Presumably the module defines a
    global ``list = []``; if it does not, this is the builtin and the
    ``append`` call raises ``TypeError`` - confirm against the full file.

    Args:
        raw: Markup handed to ``PyQuery``.

    Returns:
        None. Results are delivered through the ``list.append`` side effect.
    """
    d = PyQuery(raw)
    for card in d('div.mb4.w-100.w-25-l.w-50-m').items():
        # ``find`` already returns a PyQuery object, so no re-wrapping needed.
        title = card.find("h3.mt3.mb0.b").text()
        image = card.find("div[data-bg-src]").attr("data-bg-src")

        # The group name is whatever precedes the first " By ". partition()
        # keeps the whole text when the separator is missing instead of
        # raising ValueError like str.index() would.
        group = card.find("span.f7").text().partition(" By ")[0]

        user_el = card.find("span.f7>a")
        user_name = user_el.text()
        user_link = user_el.attr("href")

        # The material icon's text distinguishes the two listing kinds.
        icon = card.find("i.material-icons.v-mid")
        kind = 'sales' if icon.text() == 'timelapse' else 'subscription'

        # The enclosing span's text includes the icon name; strip it out.
        info = icon.parents('span').text()
        info = info.replace('timelapse', '')
        info = info.replace('all_inclusive', '')

        price = card.find(
            "div.w-100.absolute.bottom-0.mb3.black>div:nth-of-type(3)").text()

        list.append({
            'title': title,
            'image': image,
            'group': group,
            'user': {
                'name': user_name,
                'link': user_link
            },
            'type': kind,
            'info': info,
            'price': price
        })
예제 #3
0
파일: spider.py 프로젝트: stonenice/bee
# Build `jobs`: one dict per job row found in the `tbody tr` rows of `table`.
# Written so it runs unchanged on both Python 2 and Python 3.
pq = PyQuery(table)

# Named `rows` rather than `list` so the builtin is not shadowed.
rows = pq.find('tbody tr')

jobs = []

for row in rows:
    trDom = PyQuery(row)

    # Rows that carry a style attribute are skipped.
    if trDom.attr('style'):
        continue

    # The position id is taken from the first cell's link. Rows without a
    # link, or whose link has no positionId, are skipped instead of crashing
    # on None.
    link = trDom('td:first a').attr('href')
    m = re.search(r'positionId=(?P<id>\d+)', link or '')
    if m is None:
        continue

    # A real list (not a lazy map object) so the cells can be indexed.
    info = [PyQuery(td).text() for td in trDom.find("td")]

    job = {}
    job['id'] = m.group('id')
    job['title'] = info[0]
    job['type'] = info[1]
    job['dest'] = info[2]
    job['num'] = info[3]
    job['date'] = info[4]

    # Description and requirements are read from the first and last
    # `.pt-20` elements of the row that follows this one.
    detail = trDom.next("tr")
    job['postion_desc'] = detail.find(".pt-20:first").text()
    job['postion_require'] = detail.find(".pt-20:last").text()
    jobs.append(job)

for job in jobs:
    print(job)
예제 #4
0
파일: weblio.py 프로젝트: hrdrq/dictionary
    def search(self, word):
        """Fetch the page for *word* and parse the dictionary sections on it.

        The URL is built from ``self.URL.format(word=word)`` and requested
        with ``headers`` (a name resolved outside this method). Up to four
        kinds of section are parsed, each producing result dicts tagged with
        ``type``: ``normal`` (``div.NetDicHead``, one per head), ``Jitsu``
        (``div.Jtnhj``), ``IT`` (``div.Binit``) and ``WIKI`` (``div.Wkpja``).

        Args:
            word: The word to look up.

        Returns:
            ``{"status": "success", "results": [...]}`` when at least one
            section was parsed, otherwise
            ``{"status": "error", "error_detail": "Nothing found."}``.
        """
        # Compiled once up front instead of on every loop iteration.
        kakko_re = re.compile(r"【(.*)】")
        gairaigo_re = re.compile(r"[a-zA-Z]")
        # A bracketed accent number such as "[0]". The previous pattern left
        # the brackets unescaped, which is not a valid regex and raised
        # re.error. The full-width bracket forms are accepted as well in case
        # the page uses them.
        accent_re = re.compile(r"[\[［]([0-9]*)[\]］]")
        yomi_with_alt_re = re.compile(
            r"読み方:<!--\/AVOID_CROSSLINK-->(.*)<br/?><!--AVOID_CROSSLINK-->別表記"
        )
        yomi_re = re.compile(r"読み方:<!--\/AVOID_CROSSLINK-->(.*)<br/?>")

        response = requests.get(self.URL.format(word=word), headers=headers)
        text = response.text
        # The HTML occasionally contains the character below, which makes
        # processing fail, so it is stripped first.
        text = text.replace('𥝱', '')

        doc = PyQuery(text)
        results = []

        # --- "normal" dictionary entries: one result per head element ---
        for head_el in doc("div.NetDicHead"):
            result = {'word': word, 'type': 'normal'}
            head = PyQuery(head_el)
            # When brackets 【】 are present they contain either the kanji
            # spelling or, if it has Latin letters, the foreign source word.
            match_kakko = kakko_re.search(head.text())
            if match_kakko:
                kakko = match_kakko.group(1)
                if gairaigo_re.search(kakko):
                    result['gogen'] = kakko
                    result['kana'] = word
                else:
                    result['kanji'] = kakko
                    result['kana'] = head('b').text().replace(' ',
                                                              '').replace(
                                                                  '・', '')
            # Collect every accent number found in the head's spans and
            # store them comma-separated.
            accents = []
            for span in head('span'):
                match_accent = accent_re.search(PyQuery(span).text())
                if match_accent:
                    accents.append(match_accent.group(1))
            if accents:
                result['accent'] = ','.join(accents)

            body = head.next()
            # Replace each anchor with its inner HTML so no links remain.
            for anchor in body('a'):
                anchor = PyQuery(anchor)
                anchor.replaceWith(anchor.html())
            result['meaning'] = body.html()
            # No bracket was found: the word itself is kana-only.
            if 'kana' not in result:
                result['kana'] = word
            results.append(result)

        # --- "Jitsu" section ---
        Jitsu_dict = doc("div.Jtnhj")
        if Jitsu_dict:
            result = {'word': word, 'type': 'Jitsu'}
            jitsu_html = Jitsu_dict.html()
            # Prefer the reading that is followed by an alternative-spelling
            # marker; otherwise fall back to the plain reading pattern.
            match = (yomi_with_alt_re.search(jitsu_html)
                     or yomi_re.search(jitsu_html))
            if match:
                kana = match.group(1)
                # The captured reading may itself contain anchor markup.
                if '<a' in kana:
                    kana = PyQuery(kana).text()
                result['kana'] = kana

            if Jitsu_dict('.AM'):
                # Use only what follows the `.AM` element as the meaning.
                meaning = PyQuery('<div>')
                meaning.html(Jitsu_dict('.AM').nextAll())
            else:
                meaning = Jitsu_dict
            for anchor in meaning('a'):
                anchor = PyQuery(anchor)
                anchor.replaceWith(anchor.html())
            result['meaning'] = meaning.text()
            results.append(result)

        # --- "IT" section ---
        IT_dict = doc('div.Binit')
        if IT_dict:
            result = {'word': word, 'type': 'IT'}
            first_link = IT_dict('a').eq(0)
            if '読み方' in first_link.text():
                kana_tag = first_link.next('a').eq(0)
                result['kana'] = kana_tag.text().replace(' ', "")
            else:
                result['kana'] = word
                if '【' in IT_dict.text():
                    result['gogen'] = first_link.eq(0).text()
            # Concatenate every non-empty paragraph, links unwrapped.
            for p in IT_dict('p'):
                p = PyQuery(p)
                for anchor in p('a'):
                    anchor = PyQuery(anchor)
                    anchor.replaceWith(anchor.html())
                if not p.html():
                    continue
                result['meaning'] = result.get('meaning',
                                               '') + "<p>" + p.html() + "</p>"
            result['kanji'] = IT_dict.prev("h2.midashigo").text()
            results.append(result)

        # --- "WIKI" section ---
        WIKI = doc('div.Wkpja')
        if WIKI:
            result = {'word': word, 'type': 'WIKI'}
            p = WIKI('p').not_(".WkpjaTs")
            for anchor in p('a'):
                anchor = PyQuery(anchor)
                anchor.replaceWith(anchor.html())
            result['meaning'] = p.html()
            result['kanji'] = WIKI.prev("h2.midashigo").text()
            results.append(result)

        if results:
            return {"status": 'success', "results": results}
        return {"status": 'error', "error_detail": "Nothing found."}