def get_page_properties(html, need_html=False, need_data=False, properties=None, **kwargs):
    """Yield ``(key, value)`` pairs scraped from a Confluence-style "details" macro table.

    Parameters
    ----------
    html : str
        Raw page HTML.
    need_html : bool
        When True (and ``need_data`` is False), yield the value cell's raw
        inner HTML instead of extracted data.
    need_data : bool
        Forwarded to ``extract_data`` for the value cell.
    properties : iterable or None
        Optional whitelist; when given, only keys contained in it are yielded.

    Yields
    ------
    tuple(str, object)
        Property name paired with either raw HTML or extracted data.
    """
    doc = PyQuery(html)
    # Scripts would pollute text extraction; drop them before querying.
    doc('script').remove()
    header_cells = doc(
        "div[data-macro-name=details] > div.table-wrap > table > tbody > tr > th"
    )
    for elem in header_cells:
        th = PyQuery(elem)
        key = th.text().strip()
        # Skip keys not in the optional whitelist.
        if properties is not None and key not in properties:
            continue
        if not need_data and need_html:
            yield (key, th.next().html())
        else:
            # BUG FIX: previously hard-coded need_data=False here, silently
            # ignoring the caller's need_data argument; forward it as intended.
            yield (key, extract_data(th.next(), need_data=need_data))
def parsea_data(raw):
    """Parse product cards out of *raw* HTML and return them as a list of dicts.

    Each dict has keys: ``title``, ``image``, ``group``,
    ``user`` (``{'name', 'link'}``), ``type`` (``'sales'`` or
    ``'subscription'``), ``info`` and ``price``.

    Raises ValueError if a card's group label lacks the " By " separator
    (same as the original ``str.index`` behavior).
    """
    doc = PyQuery(raw)
    # BUG FIX: the original called ``list.append({...})`` on the *builtin*
    # ``list`` (a TypeError at runtime) and returned nothing; accumulate into
    # a local list and return it. A stray debug ``print`` was also removed.
    items = []
    for card in doc('div.mb4.w-100.w-25-l.w-50-m').items():
        title_el = PyQuery(card.find("h3.mt3.mb0.b"))
        img_el = PyQuery(card.find("div[data-bg-src]"))
        group_el = PyQuery(card.find("span.f7"))
        user_el = PyQuery(card.find("span.f7>a"))

        title = title_el.text()
        image = img_el.attr("data-bg-src")

        # The group label reads "<group> By <user>"; keep only the group part.
        group_text = group_el.text()
        group = group_text[:group_text.index(" By ")]

        user_name = user_el.text()
        user_link = user_el.attr("href")

        # A 'timelapse' icon marks a one-off sale; anything else is a
        # subscription. (Renamed from ``type`` to avoid shadowing the builtin.)
        icon = PyQuery(card.find("i.material-icons.v-mid"))
        sale_type = 'sales' if icon.text() == 'timelapse' else 'subscription'
        info = icon.parents('span').text()
        info = info.replace('timelapse', '').replace('all_inclusive', '')

        price_el = PyQuery(
            card.find("div.w-100.absolute.bottom-0.mb3.black>div:nth-of-type(3)"))
        price = price_el.text()

        items.append({
            'title': title,
            'image': image,
            'group': group,
            'user': {
                'name': user_name,
                'link': user_link
            },
            'type': sale_type,
            'info': info,
            'price': price
        })
    return items
# Scrape job postings out of the externally supplied ``table`` element and
# print each parsed job dict.
pq = PyQuery(table)
# Renamed from ``list`` to avoid shadowing the builtin.
rows = pq.find('tbody tr')
jobs = []
for row in rows:
    tr = PyQuery(row)
    # Styled rows are the description rows that follow each job row; skip
    # them here — they are read via ``tr.next("tr")`` below.
    if tr.attr('style'):
        continue
    # BUG FIX: the original subscripted the result of ``map`` (``info[0]``),
    # which fails on Python 3 where ``map`` is a lazy iterator; build a list.
    cells = [PyQuery(td).text() for td in tr.find("td")]
    link = tr('td:first a').attr('href')
    match = re.search(r'positionId=(?P<id>\d+)', link)
    job = {}
    job['id'] = match.group('id')
    job['title'] = cells[0]
    job['type'] = cells[1]
    job['dest'] = cells[2]
    job['num'] = cells[3]
    job['date'] = cells[4]
    # The following <tr> holds the free-text description and requirements.
    # (Key spellings 'postion_*' kept as-is: consumers may rely on them.)
    job['postion_desc'] = PyQuery(PyQuery(tr.next("tr")).find(".pt-20:first")).text()
    job['postion_require'] = PyQuery(PyQuery(tr.next("tr")).find(".pt-20:last")).text()
    jobs.append(job)

for job in jobs:
    # BUG FIX: ``print job`` is Python 2-only syntax; use the function form,
    # which is valid on both Python 2 and 3.
    print(job)
def search(self, word):
    """Look *word* up on the dictionary page and return the parsed entries.

    Scrapes up to four entry kinds from the fetched HTML: normal dictionary
    entries (``div.NetDicHead``), Jitsuyou entries (``div.Jtnhj``), IT
    dictionary entries (``div.Binit``) and a Wikipedia excerpt
    (``div.Wkpja``).

    Returns
    -------
    dict
        ``{"status": "success", "results": [...]}`` when anything was found,
        otherwise ``{"status": "error", "error_detail": "Nothing found."}``.
    """
    response = requests.get(self.URL.format(word=word), headers=headers)
    text = response.text
    # Occasionally the HTML contains the rare kanji 𥝱 (outside the BMP),
    # which caused a processing error downstream; strip it before parsing.
    text = text.replace('𥝱', '')
    doc = PyQuery(text)
    results = []

    # --- Normal dictionary entries ---------------------------------------
    normal_dict = doc("div.NetDicHead")
    if normal_dict:
        for head in normal_dict:
            result = {'word': word, 'type': 'normal'}
            head = PyQuery(head)
            # A 【...】 bracket holds either the kanji form or a loanword origin.
            match_kakko = re.compile(r"【(.*)】").search(head.text())
            if match_kakko:
                kakko = match_kakko.group(1)
                # Latin letters inside the bracket mean it is a loanword origin.
                match_gairaigo = re.compile(r"[a-zA-Z]").search(kakko)
                if match_gairaigo:
                    result['gogen'] = kakko
                    result['kana'] = word
                else:
                    result['kanji'] = kakko
                    result['kana'] = head('b').text().replace(' ', '').replace(
                        '・', '')
            for accent in head('span'):
                accent = PyQuery(accent)
                # BUG FIX: the original pattern r"[([0-9]*)]" left an
                # unbalanced ')' outside the character class, raising
                # re.error at compile time; escape the literal brackets so
                # it matches accent markers such as "[1]".
                match_accent = re.compile(r"\[([0-9]*)\]").search(
                    accent.text())
                if match_accent:
                    result['accent'] = result.get(
                        'accent', '') + match_accent.group(1) + ','
            if 'accent' in result:
                # Trim the trailing comma left by the loop above.
                result['accent'] = result['accent'][:-1]
            body = head.next()
            # Unwrap anchors, keeping only their inner HTML.
            for a in body('a'):
                a = PyQuery(a)
                a.replaceWith(a.html())
            result['meaning'] = body.html()
            # Word written in kana only (no bracket): fall back to the query.
            if 'kana' not in result:
                result['kana'] = word
            results.append(result)

    # --- Jitsuyou (practical usage) dictionary entries --------------------
    Jitsu_dict = doc("div.Jtnhj")
    if Jitsu_dict:
        result = {'word': word, 'type': 'Jitsu'}
        # Prefer the reading that is followed by an alternate-spelling note.
        match = re.compile(
            r"読み方:<!--\/AVOID_CROSSLINK-->(.*)<br/?><!--AVOID_CROSSLINK-->別表記"
        ).search(Jitsu_dict.html())
        if match:
            result['kana'] = match.group(1)
            # The reading may itself be wrapped in a link; strip the markup.
            if result['kana'].find('<a') != -1:
                result['kana'] = PyQuery(result['kana']).text()
        else:
            match = re.compile(
                r"読み方:<!--\/AVOID_CROSSLINK-->(.*)<br/?>").search(
                    Jitsu_dict.html())
            if match:
                result['kana'] = match.group(1)
                if result['kana'].find('<a') != -1:
                    result['kana'] = PyQuery(result['kana']).text()
        # The meaning is everything after the .AM marker when present.
        if Jitsu_dict('.AM'):
            meaning = PyQuery('<div>')
            meaning.html(Jitsu_dict('.AM').nextAll())
        else:
            meaning = Jitsu_dict
        for a in meaning('a'):
            a = PyQuery(a)
            a.replaceWith(a.html())
        result['meaning'] = meaning.text()
        results.append(result)

    # --- IT dictionary entries --------------------------------------------
    IT_dict = doc('div.Binit')
    if IT_dict:
        result = {'word': word, 'type': 'IT'}
        a = IT_dict('a').eq(0)
        # The first link labelled 読み方 ("reading") is followed by the kana.
        if a.text().find('読み方') != -1:
            kana_tag = a.next('a').eq(0)
            result['kana'] = kana_tag.text().replace(' ', "")
        else:
            result['kana'] = word
        # A 【...】 bracket in the entry text marks a loanword origin.
        if IT_dict.text().find('【') != -1:
            result['gogen'] = a.eq(0).text()
        for p in IT_dict('p'):
            p = PyQuery(p)
            for a in p('a'):
                a = PyQuery(a)
                a.replaceWith(a.html())
            if not p.html():
                continue
            result['meaning'] = result.get('meaning',
                                           '') + "<p>" + p.html() + "</p>"
        result['kanji'] = IT_dict.prev("h2.midashigo").text()
        results.append(result)

    # --- Wikipedia excerpt ------------------------------------------------
    WIKI = doc('div.Wkpja')
    if WIKI:
        result = {'word': word, 'type': 'WIKI'}
        # Skip the timestamp paragraph (.WkpjaTs).
        p = WIKI('p').not_(".WkpjaTs")
        for a in p('a'):
            a = PyQuery(a)
            a.replaceWith(a.html())
        result['meaning'] = p.html()
        result['kanji'] = WIKI.prev("h2.midashigo").text()
        results.append(result)

    if results:
        return {"status": 'success', "results": results}
    else:
        return {"status": 'error', "error_detail": "Nothing found."}