def get_url(url):
    response = requests.get(url)
    doc = PyQuery(response.text)
    for article in doc('article'):
        h = PyQuery(article)
        print(h.find('h1.entry-title').text())
        print(h.find('div.entry-content p').text())
def test_calendar_tag_rendering(self, timezone_mock):
    timezone_mock.now.return_value = tz_datetime(2015, 1, 10, 12)
    page_with_apphook = self.create_base_pages()
    other_config = EventsConfig.objects.create(namespace='other')
    self.create_event(
        title='ev1',
        start_date=tz_datetime(2015, 1, 13),
        publish_at=tz_datetime(2015, 1, 10)
    )
    self.create_event(
        title='ev2',
        start_date=tz_datetime(2015, 1, 15),
        publish_at=tz_datetime(2015, 1, 10)
    )
    self.create_event(
        de=dict(
            title='ev3',
            start_date=tz_datetime(2015, 1, 16),
            publish_at=tz_datetime(2015, 1, 10)
        )
    )
    self.create_event(
        title='ev4',
        start_date=tz_datetime(2015, 1, 18),
        publish_at=tz_datetime(2015, 1, 10),
        app_config=other_config
    )
    self.create_event(
        title='ev5',
        start_date=tz_datetime(2015, 1, 22),
        end_date=tz_datetime(2015, 1, 27),
        publish_at=tz_datetime(2015, 1, 10)
    )
    self.create_event(
        title='ev6',
        start_date=tz_datetime(2015, 1, 25),
    )
    # use the default test app_config namespace instead of hard-coding it
    template_str = """
    {%% load aldryn_events %%}
    {%% calendar 2015 1 'en' '%s' %%}
    """ % self.app_config.namespace
    t = Template(template_str)
    with override('en'):
        html = t.render(SekizaiContext({}))
    table = PyQuery(html)('table.table-calendar')
    page_url_en = page_with_apphook.get_absolute_url()
    links = table.find('td.events, td.multiday-events').find('a')
    # test that the tag rendered the important elements
    self.assertEqual('1', table.attr('data-month-numeric'))
    self.assertEqual('2015', table.attr('data-year'))
    self.assertEqual('10', table.find('td.today').text())
    self.assertEqual(8, links.length)  # 13, 15, 22, 23, 24, 25, 26, 27
    expected_days = (13, 15, 22, 23, 24, 25, 26, 27)
    for position, day in enumerate(expected_days):
        event_url = '{0}2015/1/{1}/'.format(page_url_en, day)
        rendered_url = links[position].attrib['href']
        self.assertEqual(event_url, rendered_url)
def get_phonetic_symbol(text):
    data = {
        "keyfrom": "deskdict.mini",
        "q": text,
        "doctype": "xml",
        "xmlVersion": 8.2,
        "client": "deskdict",
        "id": "cee84504d9984f1b2",
        "vendor": "unknown",
        "in": "YoudaoDict",
        "appVer": "5.4.46.5554",
        "appZengqiang": 0,
        "le": "eng",
        "LTH": 40,
    }
    ret = requests.get("http://dict.youdao.com/search", params=data).content
    pq = PyQuery(ret, parser="xml")
    # prefer the US phonetic symbol and fall back to the UK one
    phonetic_symbol = pq.find('usphone').text()
    phonetic_type = _("US")
    if not phonetic_symbol:
        phonetic_symbol = pq.find('ukphone').text()
        phonetic_type = _("UK")
    if not phonetic_symbol or phonetic_symbol.isspace():
        return ""
    return "[%s] %s" % (phonetic_type, phonetic_symbol)
def get_old_fashion_comments(answer_url):
    aid = comment_list_id(answer_url)
    comment_box_link = ('http://www.zhihu.com/node/AnswerCommentBoxV2'
                        '?params=%7B%22answer_id%22%3A%22{}%22%2C%22load_all%22%3Atrue%7D').format(aid)
    r = old_client._session.get(comment_box_link)
    doc = PyQuery(str(r.content, encoding='utf-8'))
    comments = []
    for div in doc.find('div.zm-item-comment'):
        div = PyQuery(div)
        cid = div.attr('data-id')
        vote_count = int(div.find('span.like-num').find('em').text())
        content = div.find('div.zm-comment-content').html()
        author_text = div.find('div.zm-comment-hd').text().replace('\n', ' ')
        if ' 回复 ' in author_text:
            author, reply_to = author_text.split(' 回复 ')
        else:
            author, reply_to = author_text, None
        comment = OldFashionComment(
            cid=cid,
            vote_count=vote_count,
            content=content,
            author=OldFashionAuthor(author),
            reply_to=OldFashionAuthor(reply_to) if reply_to else None
        )
        comments.append(comment)
    return comments
def update_forums(client, group, session):
    logging.info("Updating forums list for {}".format(group))
    query = Forum.get_forum_page(client, group.gid)
    reg = regex.compile(r"^forum\.php\?mod=forumdisplay&fid=(\d+)$")
    for row in query.find("table.fl_tb>tr"):
        sub_query = PyQuery(row)
        href = sub_query.find("td").eq(1).find("a").attr("href")
        if not href:
            continue
        fid = int(reg.findall(href)[0])
        name = sub_query.find("td").eq(1).find("h2>a").clone().children().remove().end().text()
        last_update = sub_query.find("td").eq(3).find("div>cite").clone().children().remove().end().text()
        last_update = dateparse(last_update)
        existence = session.query(Forum).filter(Forum.fid == fid)
        if existence.count() == 0:
            logging.info("<Forum(fid={})> not found, creating one".format(fid))
            forum = Forum(fid=fid, name=name, updated_at=last_update,
                          group=group, fresh=False)
            session.add(forum)
        else:
            forum = existence.one()
            if forum.updated_at != last_update:
                logging.info("{} found, stale: against {}".format(forum, last_update))
                forum.updated_at = last_update
                forum.fresh = False
                session.add(forum)
            else:
                logging.info("{} found, fresh".format(forum))
def _parse_table(self, table):
    # Initialize table
    parsed_rows = []
    # Parse table
    qtable = PyQuery(table)
    # Get headers
    headers = self._get_headers(qtable)
    if not headers:
        return
    # Get rows
    rows = qtable.find("tr")
    # Loop over rows
    for row in rows:
        # Get columns
        qrow = PyQuery(row)
        cols = qrow.find("td").map(self._get_text)[:]
        # Parse column values
        for colidx in range(len(cols)):
            col = reduce(lambda x, y: re.sub(y[0], y[1], x),
                         self._trans, cols[colidx])
            cols[colidx] = col
        # Append parsed columns
        if cols:
            parsed_rows.append(cols)
    return {"headers": headers, "data": parsed_rows}
def process_chapter(chapter_path, index_path, enable_stem):
    # characters stripped from the text before indexing
    remove_chars = ['.', ',', ';', '?', '!', '-', u'–', u'―', u'—', u'~', ':',
                    '"', ')', '(', '[', ']', '/', '\\', "'s", u'’s', "'",
                    u'‘', u'’', u'“', u'”', u'¿', '*', '<', '>', '&', '{', '}']
    # stop words that are never indexed
    restricted_words = ['a', 'and', 'about', 'above', 'across', 'after',
                        'against', 'along', 'among', 'around', 'at', 'before',
                        'behind', 'below', 'beneath', 'beside', 'between',
                        'beyond', 'but', 'by', 'despite', 'down', 'during',
                        'except', 'for', 'from', 'in', 'inside', 'into',
                        'like', 'near', 'of', 'off', 'on', 'onto', 'out',
                        'outside', 'over', 'past', 'since', 'the', 'through',
                        'throughout', 'till', 'to', 'toward', 'under',
                        'underneath', 'until', 'up', 'upon', 'with', 'within',
                        'without']
    # create jQuery object
    html = open(chapter_path, 'r').read()
    jquery = PyQuery(html)
    print(jquery.find('.chapter').attr('data-osis'))
    # find all verses, remove all notes and verse numbers
    verses = jquery('span.verse')
    verses.find('span.note').remove()
    verses.find('span.cf').remove()
    verses.find('.v-num').remove()
    for verse in verses:
        v = PyQuery(verse)
        osis = v.attr('data-osis')
        text = v.text()
        # remove punctuation
        for s in remove_chars:
            text = text.replace(s, '')
        words = text.split(' ')
        for word in words:
            word = word.strip().lower()
            is_restricted = word in restricted_words
            if word != '' and not is_restricted and not word.isnumeric():
                # stemmer?
                if enable_stem:
                    word = stem(word)
                word_path = index_path + word + '.json'
                # append to the word's index file, opening a new JSON array
                # if the file does not exist yet
                if os.path.exists(word_path):
                    f = open(word_path, 'a')
                    f.write(',"' + osis + '"')
                    f.close()
                else:
                    f = open(word_path, 'a')
                    f.write('["' + osis + '"')
                    f.close()
def test_device_elements(self):
    response = self.client.get(self.url)
    self.assertEqual(response.status_code, 200)
    tree = PyQuery(response.content)
    radios = tree.find('input[type="radio"]')
    self.assertEqual(len(radios), 2)
    checkboxes = tree.find('input[type="checkbox"]')
    self.assertEqual(len(checkboxes), 1)
def getFullInfo(self, interval=0.5):
    self.getBasicInfo()
    # collCount
    if self.INFO['shopType'] in ['7', '1', '3', '4']:
        regxrs = re.findall(
            r'J_SCollCount\"\s+data\-info\=\"param\=(.+?)\&', self.content)
        if regxrs:
            params = {'keys': regxrs[0],
                      't': '%.0f' % (time.time() * 1000),
                      'callback': 'TShop.setShopStat'}
            domain = 'http://count.tbcdn.cn/counter3'
            tUrl = domain + '?' + '&'.join(
                [k + '=' + v for (k, v) in params.items()])
            r = request(tUrl)
            self.INFO['collCount'] = str(
                re.findall(r'\"\S+\"\:(\d+)', r.text, re.S)[0])
        else:
            if PyQuery:
                pyjq_obj = PyQuery(self.content.decode(self.res.encoding))
                data_info = (pyjq_obj.find('dl.collect-num dt') or
                             pyjq_obj.find('.collect-num span')).attr('data-info')
                # the fourth positional argument of re.sub is `count`,
                # so the flag must be passed as a keyword
                count_url = re.sub(
                    r'param=(?P<param>.+?)\&countUrl=(?P<count>.+?)\&.+',
                    r'\g<count>?callback=jsonp357&t=%d&keys=\g<param>'
                    % (time.time() * 1000),
                    data_info, flags=re.S)
                collCount = re.sub(r'.+\:\s*(?P<coll>\d+).+', r'\g<coll>',
                                   request(count_url).content)
                if collCount:
                    self.INFO['collCount'] = int(collCount)
            else:
                coll_url = ('http://favorite.taobao.com/collect_item_relation.htm'
                            '?itemtype=0&itemNumid=%s') % self.INFO['shopId']
                try:
                    res = request(coll_url)
                    pyjq_obj = PyQuery(res.content.decode(res.encoding))
                    self.INFO['collCount'] = pyjq_obj.find(
                        "div.add-fav-msg strong").html().strip()
                except Exception:
                    pass
        time.sleep(interval)
    # itemAmount
    if self.INFO['shopType'] == '2':
        tUrl = self.INFO['shopLink'] + '?search=y&orderType=_hotsell'
        r = request(tUrl)
        try:
            self.INFO['itemAmount'] = str(re.findall(
                r'\<div\s+class\=\"search\-result\"\s*\>.+?(\d+)',
                r.text, re.S)[0])
        except Exception:
            pass
        time.sleep(interval)
    return self.INFO
def test_initial_post(self):
    mock_request = self._get_post_request(type='generic')
    response, MockDevice = self._post_device_addition(mock_request)
    self.assertEqual(response.status_code, 200)
    self.assertFalse(MockDevice.objects.create.called)
    tree = PyQuery(response.content)
    self.assertEqual(len(tree.find('.error')), 0)
    device_name = tree.find('input[type=text][name=name]')
    self.assertEqual(len(device_name), 1)
    self.assertEqual(device_name.val(), 'Authentication device')
def parse(self, response):
    html = Pq(response.body)
    job = items.OnetJob()
    job['url'] = response.url
    job['alt_title'] = html.find('[class="titleb"]').text()
    job['job_sample'] = html.find(
        'p:contains("Sample of reported job titles:")').text()
    job['summary'] = html.find('#realcontent').find('p:eq(0)').text()
    job['job_sample'] = job['job_sample'].replace(
        'Sample of reported job titles:', '').split(', ')
    job['tasks'] = self._list(html, '.section_Tasks .moreinfo')
    job['tools'] = self._list(
        html, '.section_ToolsTechnology .moreinfo:first')
    job['technology'] = self._list(
        html, '.section_ToolsTechnology .moreinfo:last')
    job['knowledge'] = self._list(html, '.section_Knowledge .moreinfo')
    job['skills'] = self._list(html, '.section_Skills .moreinfo')
    job['abilities'] = self._list(html, '.section_Abilities .moreinfo')
    job['work_activities'] = {
        'basic': self._list(html, '.section_WorkActivities .moreinfo'),
        'detailed': self._list(
            html, '.section_DetailedWorkActivities .moreinfo'),
    }
    job['work_context'] = self._list(html, '.section_WorkContext .moreinfo')
    job['job_zone'] = self._table(html, '#content table:first')
    job['education'] = self._table(html, '#content table:eq(1)')
    job['interests'] = self._list(html, None, custom=html.find(
        '[name="Interests"]').siblings('.moreinfo:first'))
    job['work_styles'] = self._list(html, '.section_WorkStyles .moreinfo')
    # this selector targets WorkValues, so store it under its own key
    # instead of overwriting job['interests']
    job['work_values'] = self._list(html, None, custom=html.find(
        '[name="WorkValues"]').siblings('.moreinfo:eq(1)'))
    job['related_occupations'] = self._table(
        html, '.section_RelatedOccupations table')
    job['wages_employment'] = self._table(
        html, '[summary="Wages & Employment Trends information'
        ' for this occupation"]')
    job['job_openings'] = ''
    job['additional_info'] = ''
    return job
def parse(self, response):
    category = items.ONetCategory()
    html = Pq(response.body)
    category['url'] = response.url
    category['name'] = html.find('.reportdesc:eq(0)').text().replace(
        'Save Table ( XLS / CSV )', '')
    category['id'] = response.url.replace(
        '{}?i'.format(self.root_url), '').replace(
        '&g=Go', '').replace('=', '').replace('.', '')
    category['bls_url'] = html.find('div.reportdesc a:first').attr('href')
    category['occupation_data'] = self._extract_occupations(html)
    return category
def _fetch_mdn_page(url):
    data = bleach.clean(_get_page(url), attributes=ALLOWED_ATTRIBUTES,
                        tags=ALLOWED_TAGS, strip_comments=False)
    root = PyQuery(data)
    toc = root.find('#article-nav div.page-toc ol')[0]
    content = root.find('#pageText')[0]
    toc.set('id', 'mdn-toc')
    content.set('id', 'mdn-content')
    return (etree.tostring(toc, pretty_print=True),
            etree.tostring(content, pretty_print=True))
def feed(self, data, sentence):
    d = PyQuery(data)
    sets = d(".sentences_set")
    for s in sets:
        s = PyQuery(s)
        if s.find(".mainSentence .sentenceContent a").text().strip() == sentence:
            structure = s.find(
                ".mainSentence .sentenceContent .romanization.furigana").text()
            translations = s.find(".translations:first") \
                .find(".sentence > img[title='English']") \
                .parent().find(".sentenceContent > a") \
                .map(lambda i, o: o.text)
            return (structure, translations)
    return (None, None)
def _get_invoices(self):
    randomnum = str(int(math.floor((random.random() * 99999) + 1)))
    response = self.browser.get(
        'https://www.endesaclientes.com/ss/Satellite?c=Page'
        '&pagename=SiteEntry_IB_ES%2FBill_Search%2FSearch_List'
        '&rand={}'.format(randomnum))
    pq = PyQuery(response.content)
    invoices = []

    def getParam(name, rowid):
        return pq.find('input[id={}_{}]'.format(name, rowid))[0].value

    for row in pq.find('.invoices_body_row'):
        invoice = {}
        row_id = row.attrib['id'].replace('trBill', '')
        invoice['billNumber'] = getParam('numBill', row_id)
        invoice['secBill'] = getParam('secBill', row_id)
        invoice['contractNumber'] = getParam('contractNumber', row_id)
        invoice['holderCompanyCode'] = getParam('holderCompanyCode', row_id)
        invoice['businessLine'] = getParam('businessLine', row_id)
        invoice['numscct'] = ''
        invoice['refBill'] = getParam('refBill', row_id)
        date = pq(row).find('td')[3].text.strip()
        invoice['date'] = datetime.datetime.strptime(
            date, '%a %b %d %H:%M:%S %Z %Y').strftime('%Y-%m-%d')
        if not self._invoice_exists(invoice):
            invoice['method'] = 'get'
            invoice['url'] = self._pdf_download_url(invoice)
            invoice['name'] = self._invoice_name(invoice)
            invoices.append(invoice)
    return invoices
def station_parse(content):
    '''Parse bus stations and the per-station bus status.'''
    OFF = '0-0'
    stations = []
    bus_status = []
    content = json.loads(content[3:].decode('utf-8'))
    status = content['status']
    info = content['info']
    if status == 1 and info != '':
        pqContent = PyQuery(info)('#upInfo li')
        for station in pqContent:
            pqStation = PyQuery(station)
            station_name = pqStation('.station').text()
            stations.append(station_name)
            buses = pqStation.find('.bus')
            if buses.size() > 0:
                left_count = 0
                on_station_count = 0
                for bus in buses:
                    if PyQuery(bus).attr('style'):
                        left_count += 1
                    else:
                        on_station_count += 1
                bus_status.append('{0}-{1}'.format(on_station_count, left_count))
            else:
                bus_status.append(OFF)
    if not stations:
        return None
    return (tuple(bus_status), tuple(stations))
def fetch_urls(self, queue, quantity):
    while not queue.empty():
        url = queue.get()
        html = self.s.get(url, headers=self.headers).text
        pq = PyQuery(html)
        size = pq.find('tbody tr').size()
        for index in range(size):
            item = pq.find('tbody tr').eq(index)
            ip = item.find('td').eq(0).text()
            port = item.find('td').eq(1).text()
            _type = item.find('td').eq(3).text()
            self.result_arr.append({
                str(_type).lower(): '{0}://{1}:{2}'.format(
                    str(_type).lower(), ip, port)
            })
            if len(self.result_arr) >= quantity:
                break
def get_meme_url(meme):
    gen = GENERATORS.get(meme)
    if gen:
        pq = PyQuery(url="http://memegenerator.net/%s" % gen[2])
        return pq.find('a img.large').attr('src')
    else:
        return None
def getPageLinkIfValid(element, currentPageNumber):
    pyElement = PyQuery(element)
    pageNumberText = pyElement.find('span').text()
    if pageNumberText.isdigit() and int(pageNumberText) > currentPageNumber:
        return 'https://www.youtube.com' + pyElement.attr('href')
    return None
def scrape_growler_guys(location):
    taps = {}
    if LOCAL_TEST:
        page = PQ(filename='test.html')
    else:
        page = PQ(location['url'], headers=HEADERS)
    beer_list = page('.tap-list li')
    for item in beer_list:
        beer_obj = PQ(item)
        tap_number = beer_obj.find('.tap_number').text().strip()
        beer = build_beer_record(
            location=location['name'],
            name=beer_obj('.beerName .title').text().strip(),
            style=beer_obj('.beerName .style').text().strip().strip('- ').lower(),
            brewery=beer_obj('.brewery').text().strip(),
            city=beer_obj('.breweryInfo .txt').text().strip().strip('- ').replace(' ,', ','),
        )
        # make a hash value for the key; format the string first, then
        # encode it (bytes objects have no .format method)
        h = hashlib.md5('{0} {1}'.format(beer['location'], tap_number).encode('utf-8'))
        beer_key = h.hexdigest()
        taps[beer_key] = beer
    return taps
def html_to_records(html):
    pq = PyQuery(html)
    rows = pq.find('table tr')
    # build a list (not a map iterator) so the header row can be reused
    # for every data row
    get_row = lambda r: [cell.text for cell in r]
    headers = get_row(rows[0])
    for row in rows[1:]:
        yield dict(zip(headers, get_row(row)))
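# A hypothetical usage sketch (sample_html and the printed value below are
# assumptions, not from the source): html_to_records is a generator that
# yields one dict per data row, keyed by the header row.
sample_html = """
<table>
  <tr><th>host</th><th>port</th></tr>
  <tr><td>10.0.0.1</td><td>8080</td></tr>
</table>
"""
for record in html_to_records(sample_html):
    print(record)  # {'host': '10.0.0.1', 'port': '8080'}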
def get_forums():
    logging.info('Fetching forums')
    if config.debug:
        progress = progressbar.NoProgressBar()
    else:
        progress = progressbar.ProgressBar(widgets=[
            progressbar.SimpleProgress('/'), ' ',
            progressbar.Bar("#", "[", "]"),
            progressbar.Percentage()])
    d = PyQuery(url=config.rooturl + '/a-f1/', opener=fa_opener)
    save.forums = []
    levels = {}
    n = 1
    for i in progress([i for i in d.find("select option")
                       if i.get("value", "-1") != "-1"]):
        id = i.get("value", "-1")
        logging.debug('Fetching: forum %s', id)
        title = re.search('(((\||\xa0)(\xa0\xa0\xa0))*)\|--([^<]+)', i.text).group(5)
        level = len(re.findall('(\||\xa0)\xa0\xa0\xa0', i.text))
        if level <= 0:
            parent = 0
        else:
            parent = levels[level - 1]
        levels[level] = n
        d = PyQuery(url=config.rooturl + '/admin/index.forum?part=general'
                    '&sub=general&mode=edit&fid=' + id +
                    '&extended_admin=1&' + tid, opener=fa_opener)
        try:
            description = d("textarea").text()
        except Exception:
            description = ""
        save.forums.append({'id': int(id[1:]), 'newid': n, 'type': id[0],
                            'parent': parent, 'title': title,
                            'description': description, 'parsed': False})
        n += 1
class Shows:
    def __init__(self, crawler):
        self.crawler = crawler
        self.categories = PyQuery(self.crawler.url)
        self.categories_iter = self.categories.find("li.svtoa-anchor-list-item a")
        self.i = 0

    def __iter__(self):
        return self

    def next(self):
        if self.i == self.categories_iter.length:
            raise StopIteration
        link = self.categories_iter[self.i]
        py_link = PyQuery(link)
        href = py_link.attr('href')
        html_class = href.split('/')[-1:][0]
        title = py_link.text()
        url = href
        show = Show(title, url, html_class)
        show.clips = Episodes(self.crawler, url)
        self.i += 1
        return show
def test_robots_are_inactive(self):
    page_extension = MetaTagPageExtension(
        extended_object=self.page,
        robots_indexing=False,
        robots_following=False)
    page_extension.save()
    self.page.publish('en')
    response = self.client.get('/')
    content = PyQuery(response.content)
    self.assertEqual(content.find('meta[name="robots"]').attr('content'),
                     'noindex, nofollow')
def get_saml_response(response):
    tree = PyQuery(response.content)
    inputtag = tree.find('input[name="SAMLResponse"]')
    assert len(inputtag) == 1
    encoded_response = inputtag[0].get('value')
    samlresponse = base64.b64decode(encoded_response)
    return samlresponse
def get_img_urls(content):
    if not content:
        return []
    url_list = []
    doc = PyQuery(content)
    nodeList = doc('li.tab-trigger > div.vertical-img > a.box-img > img')
    for node in nodeList:
        url = PyQuery(node).attr('src')
        if not url:
            continue
        if url.find('60x60') > 0:
            url = url.replace('60x60', '400x400')
        url_list.append(url)
    needDescImg = True
    if needDescImg:
        link_url = doc('div#desc-lazyload-container').attr('data-tfs-url')
        if not link_url:
            return url_list
        desc_content = fetchPageWithUrl(link_url)
        # non-greedy match for the embedded <img> tags
        imgNodes = re.findall('<img[^<>]*>.*?', desc_content)
        for node in imgNodes:
            nodeQ = PyQuery(node)
            desc_url = nodeQ('img').attr('src')
            if desc_url:
                desc_url = desc_url.replace('\\"', '')
            if not desc_url:
                continue
            if 'gif' in desc_url:
                # skip gif images
                continue
            url_list.append(desc_url)
    return url_list
def _export_(self):
    self.logger.debug("Fetching posts of topic %d (page %d)",
                      self.topic.topic_id, self.page)
    response = self.session.get("/t{}p{}-a".format(self.topic.topic_id, self.page))
    document = PyQuery(response.text)
    pattern = re.compile(r"/u(\d+)")
    for element in document.find("tr.post"):
        e = PyQuery(element)
        post_id = int(e("td span.name a").attr("name"))
        self.logger.info("Fetching post %d (topic %d)",
                         post_id, self.topic.topic_id)
        match = pattern.fullmatch(
            clean_url(e("td span.name strong a").eq(0).attr("href") or ""))
        if match:
            poster = self.users[int(match.group(1))]
        else:
            poster = AnonymousUser()
        post = e("td div.postbody div").eq(0).html()
        if not post:
            self.logger.warning("Post %d (topic %d) appears to be empty",
                                post_id, self.topic.topic_id)
            post = ""
        # Get title
        title = e("table td span.postdetails").contents()[1]
        # Remove "Sujet :" before the title and trailing spaces
        title = title[7:].rstrip()
        # Get the date and time of the post
        timestamp = parse_date(e("table td span.postdetails").contents()[3])
        self.add_child(Post(post_id, post, title, timestamp, poster))
def parseProductsAndCategoriesByCategory(self, category_page_content, category_info):
    doc = PyQuery(category_page_content)
    productList, categoryList = [], []
    if category_info.parent_categories and len(category_info.parent_categories) == 2:
        productList = self.parseProductsByCategory(category_page_content, category_info)
        return productList, categoryList
    if category_info.name == 'New Arrivals':
        # special-case handling for this category
        for level2Node in doc.find('div#js_catelist_sec > div.item'):
            level2NodeQ = PyQuery(level2Node)
            level2CateName = level2NodeQ.children('p > a').text()
            for level3Node in level2NodeQ.children('ul > li > a'):
                categoryInfo = self.createCategory(PyQuery(level3Node))
                categoryInfo.parent_categories = [category_info.name, level2CateName]
                categoryList.append(categoryInfo.formalize())
        return productList, categoryList
    if category_info.name == 'Clearance':
        level2NodeList = doc('div.catelist > ul.cataUl_list > li > a')
        for level2Node in level2NodeList:
            categoryInfo = self.createCategory(PyQuery(level2Node))
            categoryInfo.parent_categories = ['Clearance']
            categoryList.append(categoryInfo.formalize())
        return productList, categoryList
    if doc.find('div#js_catelist_sec > div.cur > ul > li'):
        nodeList = doc.find('div#js_catelist_sec > div.cur > ul > li > a')
        for node in nodeList:
            nodeQ = PyQuery(node)
            categoryInfo = self.newCategory()
            categoryInfo.name = nodeQ.text()
            categoryInfo.url = nodeQ.attr('href')
            categoryInfo.set_categories(category_info)
            categoryList.append(categoryInfo.formalize())
    elif doc.find('div.catelist > ul > li.cur > div.menuList > p'):
        nodeList = doc.find('div.catelist > ul > li.cur > div.menuList > p > a')
        for node in nodeList:
            nodeQ = PyQuery(node)
            categoryInfo = self.newCategory()
            categoryInfo.name = nodeQ.text()
            categoryInfo.url = nodeQ.attr('href')
            if category_info.parent_categories:
                result = category_info.parent_categories + [category_info.name]
            else:
                result = [category_info.name]
            categoryInfo.parent_categories = result
            categoryList.append(categoryInfo.formalize())
    else:
        productList = self.parseProductsByCategory(category_page_content, category_info)
    return productList, categoryList
def create_meme(title, lines):
    url = "%s/%s" % (GENURL, title)
    pq = PyQuery(url=url)
    form = pq.find('div.instance_form_create_small form')
    if len(form) == 0:
        return "Error: something changed or something weird happened."
    else:
        url = "%s%s" % (GENURL, form[0].attrib['action'])
        data = {
            'languageCode': 'en',
            'generatorID': form.find('#generatorID').val(),
            'imageID': form.find('#imageID').val(),
            'text0': lines[0],
            'text1': len(lines) > 1 and lines[1] or '',
        }
        postq = PyQuery(url=url, data=data, method='post')
        return GENURL + postq.find('div.instance_large img')[0].attrib['src']
def test_hreflang_basic(base_url):
    """Ensure that we're specifying the correct value for lang and hreflang."""
    url = base_url + '/en-US/docs/Web/HTTP'
    resp = requests.get(url)
    assert resp.status_code == 200
    html = PyQuery(resp.text)
    assert html.attr('lang') == 'en'
    assert html.find('head > link[hreflang="en"][href="{}"]'.format(url))
def getFullInfo(self, interval=0.5):
    self.getBasicInfo()
    # collCount
    if self.INFO['shopType'] in ['7', '1', '3', '4']:
        regxrs = re.findall(
            r'J_SCollCount\"\s+data\-info\=\"param\=(.+?)\&', self.content)
        if regxrs:
            params = {
                'keys': regxrs[0],
                't': '%.0f' % (time.time() * 1000),
                'callback': 'TShop.setShopStat'
            }
            domain = 'http://count.tbcdn.cn/counter3'
            tUrl = domain + '?' + '&'.join(
                [k + '=' + v for (k, v) in params.items()])
            r = request(tUrl)
            self.INFO['collCount'] = str(
                re.findall(r'\"\S+\"\:(\d+)', r.text, re.S)[0])
        else:
            if PyQuery:
                pyjq_obj = PyQuery(self.content.decode(self.res.encoding))
                data_info = (pyjq_obj.find('dl.collect-num dt') or
                             pyjq_obj.find('.collect-num span')).attr('data-info')
                if data_info:
                    # pass the flag as a keyword: the fourth positional
                    # argument of re.sub is `count`, not `flags`
                    count_url = re.sub(
                        r'param=(?P<param>.+?)\&countUrl=(?P<count>.+?)\&.+',
                        r'\g<count>?callback=jsonp357&t=%d&keys=\g<param>'
                        % (time.time() * 1000),
                        data_info, flags=re.S)
                    collCount = re.sub(r'.+\:\s*(?P<coll>\d+).+', r'\g<coll>',
                                       request(count_url).content)
                    if collCount:
                        self.INFO['collCount'] = int(collCount)
            else:
                coll_url = ('http://favorite.taobao.com/collect_item_relation.htm'
                            '?itemtype=0&itemNumid=%s') % self.INFO['shopId']
                try:
                    res = request(coll_url)
                    pyjq_obj = PyQuery(res.content.decode(res.encoding))
                    self.INFO['collCount'] = pyjq_obj.find(
                        "div.add-fav-msg strong").html().strip()
                except Exception:
                    pass
        time.sleep(interval)
    # itemAmount
    if self.INFO['shopType'] == '2':
        tUrl = self.INFO['shopLink'] + '?search=y&orderType=_hotsell'
        r = request(tUrl)
        try:
            self.INFO['itemAmount'] = str(re.findall(
                r'\<div\s+class\=\"search\-result\"\s*\>.+?(\d+)',
                r.text, re.S)[0])
        except Exception:
            pass
        time.sleep(interval)
    self.getRateInfo()
    return self.INFO
import pymysql
import requests
from pyquery import PyQuery as pq

db = pymysql.connect('localhost', 'root', '123456', 'zxshop')
cursor = db.cursor()
url = 'http://www.tcmap.com.cn/list/daima_list.html'


def downpage(url):
    """Fetch the page HTML."""
    r = requests.get(
        url,
        headers={
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                          'AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/56.0.2924.87 Safari/537.36'
        })
    r.encoding = 'gbk'
    return r.text


html_doc = downpage(url)
doc = pq(html_doc)
html = doc.find("#list360")
data = html.text()
# each entry is expected to be "<6-digit code><region name>", one per line
for line in data.splitlines():
    print(line[:6], '=', line[6:])
from pyquery import PyQuery

html = '''
<div class="wrap">
    <div id="container">
        <ul class="list">
            sdf
            <a>a</a>
            <li class="item-0">first item</li>
            <li class="item-1"><a href="link2.html">second item</a></li>
            <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
            <li class="item-1 active"><a href="link4.html">fourth item</a></li>
            <li class="item-0"><a href="link5.html">fifth item</a></li>
        </ul>
    </div>
</div>
'''
query = PyQuery(html)
# find() searches all descendants, so every <a> in the tree is matched
print(query.find('a'))
print('_______')
# children() only inspects direct children of the matched element,
# so no <a> is returned here
print(query.children('a'))
def get_company_name(text):
    if not text:
        return None
    jq = PyQuery(text, parser='html')
    company = jq.find('.info-dl').eq(1).find('dd').text()
    return company.strip()
def _query_selector(pq, args):
    selector = args.get('selector')
    if not selector:
        return pq
    return pq.find(selector)
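# A hypothetical usage example for _query_selector (the document and args
# values are assumptions, not from the source): it narrows the PyQuery object
# only when args carries a 'selector' key, and returns the input unchanged
# otherwise.
doc = PyQuery('<ul><li class="a">x</li><li>y</li></ul>')
assert _query_selector(doc, {'selector': 'li.a'}).text() == 'x'
assert _query_selector(doc, {}) is doc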
def _slice(res: str, index: int = 1) -> GoogleResponse:
    utf8_parser = HTMLParser(encoding="utf-8")
    d = PyQuery(fromstring(res, parser=utf8_parser))
    data = d.find(".g")
    pages = list(d.find("td").items())[1:-1]
    return GoogleResponse(data, pages, index)
def get_posts():
    global month
    logging.info('Fetching posts')
    if config.debug:
        progress = progressbar.NoProgressBar()
    else:
        progress = progressbar.ProgressBar(widgets=[
            progressbar.SimpleProgress('/'), ' ',
            progressbar.Bar("#", "[", "]"),
            progressbar.Percentage()], maxval=save.nbposts)
    progress.start()
    n = len(save.posts)
    ids = [i["id"] for i in save.posts]
    for topic in [i for i in save.topics if i["parsed"] == False]:
        logging.debug('Fetching: posts of topic %d', topic["id"])
        subposts = []
        subids = []
        d = PyQuery(url=config.rooturl + '/t' + str(topic['id']) + '-a',
                    opener=fa_opener)
        result = re.search(
            r'function do_pagination_start\(\)[^\}]*'
            r'start = \(start > \d+\) \? (\d+) : start;[^\}]*'
            r'start = \(start - 1\) \* (\d+);[^\}]*\}', d.text())
        try:
            pages = int(result.group(1))
            topicsperpages = int(result.group(2))
        except Exception:
            pages = 1
            topicsperpages = 0
        for page in range(0, pages):
            if page >= 1:
                d = PyQuery(url=config.rooturl + '/a-t' + str(topic['id']) +
                            '-' + str(page * topicsperpages) + '.htm',
                            opener=fa_opener)
            for i in d.find('tr.post'):
                e = PyQuery(i)
                id = int(e("td span.name a").attr("name"))
                if id not in ids and id not in subids:
                    logging.debug('Fetching: post %d (topic %d)', id, topic["id"])
                    author = e("td span.name").text()
                    post = htmltobbcode.htmltobbcode(
                        e("td div.postbody div").eq(0).html(), save.smileys)
                    result = e("table td span.postdetails").text().split(" ")
                    if result[-3] == "Aujourd'hui":
                        # posted today
                        title = " ".join(result[1:-3])
                        date = result[-3:]
                        timestamp = time.mktime(datetime.datetime.combine(
                            datetime.date.today(),
                            datetime.time(int(date[2].split(":")[0]),
                                          int(date[2].split(":")[1]))).timetuple())
                    elif result[-3] == "Hier":
                        # posted yesterday
                        title = " ".join(result[1:-3])
                        date = result[-3:]
                        timestamp = time.mktime(datetime.datetime.combine(
                            datetime.date.today() - datetime.timedelta(1),
                            datetime.time(int(date[2].split(":")[0]),
                                          int(date[2].split(":")[1]))).timetuple())
                    else:
                        title = " ".join(result[1:-6])
                        date = result[-6:]
                        timestamp = time.mktime(datetime.datetime(
                            int(date[3]), month[date[2]], int(date[1]),
                            int(date[5].split(":")[0]),
                            int(date[5].split(":")[1])).timetuple())
                    subposts.append({'id': id, 'post': post, 'title': title,
                                     'topic': topic["id"],
                                     'timestamp': int(timestamp),
                                     'author': author})
                    subids.append(id)
                    n += 1
                    progress.update(n)
                else:
                    logging.warning('Post %d has already been fetched.', id)
        save.posts.extend(subposts)
        ids.extend(subids)
        [i for i in save.topics if i == topic][0]["parsed"] = True
    progress.end()
def wrap(self, invoked_from_wrapper=False):
    # Handle being called from another wrapper.
    md_section_list = None
    if isinstance(self.section.contents, list):
        md_section_list = self.section.contents
    elif invoked_from_wrapper and \
            isinstance(self.section.contents.contents, str):
        md_section_list = [self.section.contents]
    if not isinstance(md_section_list, list):
        raise ValueError('Markdown section does not have valid contents '
                         '(must be a list)')
    for section in md_section_list:
        # === Start wrappers ===
        if section.type == MD_TYPE_DIV:
            temp_section = MarkdownSection('markdown', section.contents, {}, {})
            invoke(self.cell_object, temp_section)
            continue
        if section.type == MD_TYPE_CODE:
            md_code.invoke(self.cell_object, section)
            self.cell_object.update_paragraph()
            continue
        if section.type == MD_TYPE_QUOTE:
            md_blockquote.invoke(self.cell_object, section)
            self.cell_object.update_paragraph()
            continue
        if section.type == MD_TYPE_UNORDERED_LIST:
            md_ul.invoke(self.cell_object, section)
            self.cell_object.update_paragraph()
            continue
        if section.type == MD_TYPE_ORDERED_LIST:
            md_ol.invoke(self.cell_object, section)
            self.cell_object.update_paragraph()
            continue
        if section.type == MD_TYPE_LIST_ITEM:
            md_li.invoke(self.cell_object, section)
            continue
        if section.type == MD_TYPE_TABLE:
            table_html = section.extra['original_html']
            t = PyQuery(table_html)
            headers = [i.find('th') for i in t.find('tr').items()][0]
            headers = [c.text() for c in headers.items()]
            rows = [i.find('td') for i in t.find('tr').items() if i.find('td')]
            data = []
            for row in rows:
                r = {headers[i]: c.text() for i, c in enumerate(row.items())}
                data.append(r)
            s = Section("table", data, {"tableColumns": headers}, {})
            table.invoke(self.cell_object, s)
            continue
        # Fix wrapped:
        # (Sometimes there are elements which contain other elements,
        # but are not considered one of the declared wrappers.)
        # They are in MD_ETC_WRAPPERS.
        if isinstance(section.contents, list) and section.type in MD_ETC_WRAPPERS:
            is_inside_wrapper = False
            if 'inline' in section.extra:
                is_inside_wrapper = True
            if section.type == 'span':
                section.propagate_extra('check_newline', True,
                                        only_multiple_children=False)
            # TODO: Fix problem with H1 no newline even if in span.
            temp_section = MarkdownSection('markdown', section.contents, {},
                                           section.extra, section.attrs)
            invoke(self.cell_object, temp_section,
                   invoked_from_wrapper=is_inside_wrapper)
            continue
        # === Elements ===
        if section.type in SHOULD_NEW_LINE and section.get_extra('check_newline'):
            self.cell_object.add_paragraph()
        if section.type == MD_TYPE_HORIZONTAL_LINE:
            md_hr.invoke(self.cell_object, section)
            continue
        # Add a block (newline) if not called from a wrapper
        # (should come after hr)
        if not invoked_from_wrapper:
            self.cell_object.add_paragraph()
        if section.type in MD_TYPES_HEADERS:
            # We want to keep the h{1...6} for styling
            insert_header(self.cell_object, section.contents,
                          header=section.type, style=section.get_style())
            continue
        if section.type in [MD_TYPE_TEXT, MD_TYPE_INLINE_TEXT]:
            if invoked_from_wrapper:
                self.cell_object.add_run()
            if not section.contents:
                continue
            if '{date}' in section.contents:
                try:
                    formatted_date = get_formatted_date('', section.layout)
                except ParserError:
                    formatted_date = 'n/a'
                section.contents = section.contents.replace('{date}', formatted_date)
            insert_text(self.cell_object, section)
            continue
        if section.type == MD_TYPE_LINK:
            md_link.invoke(self.cell_object, section)
            continue
        if section.type == MD_TYPE_IMAGE:
            md_image.invoke(self.cell_object, section)
            continue
        if DEBUG:
            raise ValueError(f'Section type is not defined: {section.type}')
def __get_company_name(text):
    jq = PyQuery(text, parser='html')
    return jq.find('#zhizhao').find('.xinxi').find('tr').eq(0).find(
        'td').eq(1).find('span').text().strip()
    elif tag == 'span' and ('class', 'event-location') in attrs:
        self.key['event-location'] = True
    elif tag == 'h3' and ('class', 'event-title') in attrs:
        self.key['event-title'] = True

    def handle_data(self, data):
        if self.key['time']:
            print('Time:%s\t|' % data, end=' ')
            self.key['time'] = None
        elif self.key['event-title']:
            print('Title:%s\t|' % data, end=' ')
            self.key['event-title'] = None
        elif self.key['event-location']:
            print('Location:%s\t|' % data)
            self.key['event-location'] = None


parser = MyHTMLParser()
html = urllib.request.urlopen('http://www.python.org/events/python-events/').read().decode('utf-8')
parser.feed(html)

from pyquery import PyQuery

doc = PyQuery(url='https://www.python.org/events/python-events/')
for event in doc('.list-recent-events li'):
    event = PyQuery(event)
    loc = event.find('.event-location').text()
    time = event.find('time').text()
    name = event.find('.event-title').text()
    print('event:%s' % name)
    print('\ttime:%s' % time)
    print('\tlocation:%s' % loc)
def next(self):
    if self.i == self.episodes_iter.length:
        raise StopIteration
    # Index all episodes
    link = self.episodes_iter[self.i]
    # Parse the current episode from the long list of episodes
    article = PyQuery(link)
    episode = article.find('a.playLink')
    full_url = self.crawler.baseurl + episode.attr('href')
    broadcasted = article.find('time').attr('datetime')
    episode_date = parse(broadcasted).replace(tzinfo=None)
    published = article.attr('data-published')
    if self.crawler.skip_urls:
        if full_url in self.crawler.skip_urls:
            self.i += 1
            return self.next()
    if published.find('idag') != -1:
        published = '%s' % datetime.today()
    if published.find(u'ikväll') != -1:
        self.i += 1
        return self.next()
    if published.find(u'igår') != -1:
        published = '%s' % (datetime.today() - timedelta(days=1))
    try:
        published_date = parse(published, parserinfo=sverje()).replace(tzinfo=None)
    except ValueError as err:
        print(err)
        print(published)
    if self.crawler.min is not None:
        if published_date < self.crawler.min:
            self.i += 1
            return self.next()
    if self.crawler.max is not None:
        if published_date > self.crawler.max:
            self.i += 1
            return self.next()
    if len(broadcasted) < 1:
        broadcasted = '1970-01-01 00:00:00'
    # Check if the url contains an extra /Random-Title; if so, remove it
    if len(full_url.split('/')) == 6:
        url = full_url.rpartition('/')[0]
    else:
        url = full_url
    if (url.find('video') != -1 or url.find('klipp') != -1) and len(broadcasted) > 1:
        available = parse_date(article.attr('data-available'), '+')
        length = article.attr('data-length')
        if not episode.attr('href').startswith('http'):
            try:
                # Get the episode from url
                article_full = PyQuery(url)
                thumbnail = article_full.find('img.svtHide-No-Js').eq(0).attr('data-imagename')
                meta = article_full.find('.playBoxConnectedToVideoMain div')
                episode = Episode()
                desc = article_full.find('.playBoxConnectedToVideoMain p').text()
                if desc is not None:
                    if len(desc) == 0:
                        desc = article_full.find('.playBoxConnectedToVideoMain span')
                    desc = sanitize_description(unicode(desc))
                if str(meta).find('Kan endast ses i Sverige') == -1:
                    rights = 1
                else:
                    rights = 2
                if str(meta).find('Kan ses i mobilen') > -1:
                    on_device = 1
                else:
                    on_device = 2
                try:
                    episodeTitle = article_full.find('title').eq(0).text().replace('| SVT Play', '')
                    episode.url = url
                    episode.title = episodeTitle
                    episode.published = published
                    episode.published_date = published_date
                    episode.title_slug = shellquote(episodeTitle)
                    episode.http_status = 200
                    episode.http_status_checked_date = datetime.utcnow().replace(tzinfo=utc)
                    episode.date_available_until = available
                    episode.date_broadcasted = broadcasted
                    episode.length = length
                    episode.description = desc
                    episode.viewable_on_device = on_device
                    episode.viewable_in = rights
                    episode.kind_of = self.kind_of
                    episode.thumbnail_url = thumbnail
                except AttributeError:
                    self.i += 1
                    return self.next()
                self.i += 1
                return episode
            except HTTPError:
                self.i += 1
                return self.next()
def get_brand(code):
    # build the URL from the requested code instead of hard-coding 7203
    q = PyQuery("https://kabutan.jp/stock/?code={}".format(code))
    sector = q.find('#stockinfo_i2 > div > a')[0].text
    print(sector)
    print(code)
import requests
from pyquery import PyQuery
import pickle
from helper.move import *

moves = []
for x in range(1, 8):
    data = requests.get("https://pokemondb.net/move/generation/" + str(x))
    src = PyQuery(data.text)
    trs = src.find('.ent-name')
    length = len(moves)
    i = length
    for tr in trs:
        moves.append([])
        moves[i].append(tr.text)
        i += 1
    trs = src.find('.type-icon')
    i = length
    for tr in trs:
        moves[i].append(tr.text)
        i += 1
    trs = src.find('td:nth-child(3)')
    i = length
    for tr in trs:
        if tr.attrib["data-sort-value"] == "special":
            moves[i].append(1)
        elif tr.attrib["data-sort-value"] == "physical":
def getBasicInfo(self):
    self.INFO['shopId'] = re.findall(REGX['shopId'], self.content)
    self.INFO['userId'] = re.findall(REGX['userId'], self.content)
    self.INFO['shopName'] = re.findall(REGX['shopname'], self.content)
    self.INFO['shopLink'] = re.findall(REGX['shopLink'], self.content)
    self.INFO['wangwangNick'] = re.findall(REGX['wangwang'], self.content)
    self.INFO['shopRank'] = re.findall(REGX['shopRank'], self.content)
    self.INFO['shopGrade'] = re.findall(REGX['shopGrade'], self.content, re.S)
    self.INFO['shopRate'] = re.findall(REGX['shopRate'], self.content)
    self.INFO['shopKeeper'] = re.findall(REGX['shopKeeper'], self.content, re.S)
    self.INFO['company'] = re.findall(
        REGX['company'], self.content.decode(self.res.encoding, 'ignore'), re.S)
    self.INFO['location'] = re.findall(REGX['location'], self.content, re.S)
    self.INFO['goodsRate'] = re.findall(
        REGX['goodsRate'], self.content.decode(self.res.encoding, 'ignore'), re.S)
    self.INFO['itemAmount'] = re.findall(
        REGX['itemAmount'], self.content.decode(self.res.encoding, 'ignore'), re.S)
    self.INFO['setupTime'] = re.findall(
        REGX['setupTime'], self.content.decode(self.res.encoding, 'ignore'), re.S)
    self.INFO['shopType'] = re.findall(
        REGX['shopType'], self.content) or re.findall(
        r'"*siteId"*:\s*[\'\"](\d+)[\'\"]', self.content)
    if self.INFO['shopType'][0] in ['4']:
        self.INFO['shopId'] = re.findall(REGX_4['shopId'], self.content)
        self.INFO['userId'] = re.findall(REGX_4['userId'], self.content)
    for (k, v) in self.INFO.items():
        if v:
            if isinstance(v, list):
                if k == 'shopRate':
                    self.INFO[k] = ','.join(self.INFO[k])
                elif k == 'shopLink':
                    if len(re.findall(r'http\:\/\/store\.taobao\.com',
                                      self.INFO[k][0])) == 1:
                        try:
                            self.INFO[k] = (re.findall(
                                r'\<a\s+class\=\"hCard\sfn\"\s+href\=\"(.+?)\"',
                                self.content)[0])[:-1]
                        except Exception:
                            self.INFO[k] = self.INFO[k][0]
                    else:
                        self.INFO[k] = self.INFO[k][0]
                else:
                    self.INFO[k] = _trim_html(v[0])
                    if k in ['company', 'goodsRate', 'itemAmount', 'setupTime']:
                        self.INFO[k] = self.INFO[k].encode('utf-8', 'ignore')
                    else:
                        self.INFO[k] = self.INFO[k].\
                            decode(self.res.encoding, 'ignore').\
                            encode('utf-8', 'ignore')
            else:
                self.INFO[k] = None
        else:
            self.INFO[k] = None
    if not self.INFO['userId']:
        for regx in [r'userId\s*\=\s*(\d+)']:
            regxrs = re.findall(regx, self.content, re.S)
            if len(regxrs) > 0:
                self.INFO['userId'] = regxrs[0]
                break
    if not self.INFO['shopLink']:
        for regx in \
                [r'\<h3\s+class\=\"shop\-title\"\s*\>\s*\<a.+?href\=\"(.+?)\"',
                 r'\<a\sclass\=\"hCard\sfn\s*\"\shref\s*\=\s*\"(.+?)\"',
                 r'\<a\s+class\=\"shop-name\s*\"\s+href\s*\=\s*\"(.+?)\"']:
            regxrs = re.findall(regx, self.content, re.S)
            if len(regxrs) > 0:
                self.INFO['shopLink'] = regxrs[0]
                break
    if not self.INFO['shopName']:
        for regx in [
                r'<div\s+class\=\"name\"\s*\>\s*\<span\>\s*(.+?)\<',
                r'\<h3\s+class\=\"shop\-title\"\s*\>\s*\<a.+?.+?\>\s*(.+?)\<',
                r'\<a\s+class\=\"shop-name\s*\"\s*href\=.+?\>(.+?)\<\/a\>',
                r'<title>(.+?)</title>']:
            shopName = re.findall(regx, self.content.decode(self.res.encoding), re.S)
            if len(shopName) > 0:
                self.INFO['shopName'] = shopName[0]
                break
    self.INFO['shopName'] = re.sub(r'(\<span.*?\>).+?(\<\/span\>)', r'',
                                   self.INFO['shopName'] or '', flags=re.S)
    self.INFO['shopName'] = self.INFO['shopName'].encode('utf-8').replace(
        '首页-', '').replace('-淘宝网', '') if self.INFO['shopName'] else None
    if not self.INFO['wangwangNick']:
        for regx in [r'data-nick\s*=\s*"(.+?)"']:
            wangwangNick = re.findall(regx, self.content, re.S)
            if len(wangwangNick) > 0:
                self.INFO['wangwangNick'] = unquote(wangwangNick[0])
                break
    if not self.INFO['wangwangNick']:
        self.INFO['wangwangNick'] = self.INFO['shopName']
    if not self.INFO['shopKeeper']:
        self.INFO['shopKeeper'] = self.INFO['wangwangNick']
    if not self.INFO['shopName']:
        self.INFO['shopName'] = self.INFO['wangwangNick']
    if not self.INFO['shopRank']:
        self.INFO['shopRank'] = re.findall(r'/newrank/(.+?)\.gif', self.content)
        self.INFO['shopRank'] = self.INFO['shopRank'][0] \
            if self.INFO['shopRank'] else None
    pyjq_obj = PyQuery(self.content.decode(self.res.encoding))
    if not self.INFO['itemAmount'] and self.INFO['shopType'] != '2':
        shop_intro = pyjq_obj.find("div.shop-intro")
        if shop_intro:
            self.INFO['itemAmount'] = \
                int(list(shop_intro.items("dl"))[2].find("dd span").html())
    if self.INFO['shopType'] == '2':
        if not self.INFO['shopLink']:
            shopLink = re.search(r'href="(http://\S+?.tmall.com)"',
                                 self.content, re.S)
            if shopLink:
                self.INFO['shopLink'] = shopLink.group(1)
        if not self.INFO['company']:
            company = pyjq_obj.find('div.extend ul li')
            if len(company) >= 3:
                company = company[2].find('div').text.encode(
                    'utf-8').strip('\n\r\t ')
                self.INFO['company'] = company if len(company) < 200 else None
        if not self.INFO['location']:
            i = pyjq_obj.text().find('所在地区'.decode('utf-8'))
            j = pyjq_obj.text().find('宝贝'.decode('utf-8'), i)
            self.INFO['location'] = pyjq_obj.text()[i + 5:j].encode('utf-8')
            self.INFO['location'] = self.INFO['location'].strip('-').strip() \
                if self.INFO['location'] and len(self.INFO['location']) < 20 \
                else None
    if not self.INFO['location']:
        loca_html = pyjq_obj.find('.locus')
        if loca_html:
            loca = loca_html.text().lstrip(
                '所 在 地:'.decode('utf-8')).lstrip('所 在 地:'.decode('utf-8'))
            self.INFO['location'] = loca.encode('utf-8')
    if self.INFO['location']:
        self.INFO['location'] = self.INFO['location'].strip('-')
    self.INFO['location2'] = _parse_loca(self.INFO['location'])
    return self.INFO
def __get_data(self):
    resp = self.session.get(reportURL)
    doc = PyQuery(resp.text)
    html = doc.html()
    # random body temperature in the normal range, rounded to one decimal
    tiwen = 36.5 + random.uniform(0, 0.3)
    tiwen = round(tiwen, 1)
    zxMatch = re.findall(r'f8_state={.*?"SelectedValue":"(.+?)"', html)[0]
    gnMatch = re.findall(r'f14_state={.*?"SelectedValue":"(.+?)"', html)[0]
    shengMatch = re.findall(r'f16_state={.+?"SelectedValueArray":\["(.+?)"]', html)[0]
    shiMatch = re.findall(r'f17_state={.*?"F_Items":(.+?),"SelectedValueArray":\["(.+?)"]', html)[0]
    xianMatch = re.findall(r'f18_state={.*?"F_Items":(.+?),"SelectedValueArray":\["(.+?)"]', html)[0]
    xxMatch = re.findall(r'f20_state={.*?"Text":"(.+?)"', html)[0]
    F_State = template % (self.date, zxMatch, gnMatch, shengMatch,
                          shiMatch[0], shiMatch[1], xianMatch[0],
                          xianMatch[1], xxMatch, "否")
    return {
        'F_State': base64.b64encode(F_State.encode()),
        '__VIEWSTATE': doc.find('#__VIEWSTATE').attr('value'),
        '__EVENTTARGET': 'p1$ctl00$btnSubmit',
        '__EVENTARGUMENT': '',
        '__VIEWSTATEGENERATOR': doc.find('#__VIEWSTATEGENERATOR').attr('value'),
        'p1$ChengNuo': 'p1_ChengNuo',
        'p1$BaoSRQ': self.date,
        'p1$DangQSTZK': '良好',
        'p1$TiWen': str(tiwen),
        'F_TARGET': 'p1_ctl00_btnSubmit',
        'p1_Collapsed': 'false',
        'p1$CengFWH_RiQi': '',
        'p1$CengFWH_BeiZhu': '',
        'p1$JieChu_RiQi': '',
        'p1$JieChu_BeiZhu': '',
        'p1$TuJWH_RiQi': '',
        'p1$TuJWH_BeiZhu': '',
        'p1$JiaRen_BeiZhu': '',
        'p1$ZaiXiao': zxMatch,
        'p1$MingTDX': '不到校',
        'p1$MingTJC': '否',
        'p1$BanChe_1$Value': '0',
        'p1$BanChe_1': '不需要乘班车',
        'p1$BanChe_2$Value': '0',
        'p1$BanChe_2': '不需要乘班车',
        'p1$GuoNei': '国内',
        'p1$ddlGuoJia$Value': '-1',
        'p1$ddlGuoJia': '选择国家',
        'p1$ddlSheng$Value': shengMatch,
        'p1$ddlSheng': shengMatch,
        'p1$ddlShi$Value': shiMatch[1],
        'p1$ddlShi': shiMatch[1],
        'p1$ddlXian$Value': xianMatch[1],
        'p1$ddlXian': xianMatch[1],
        'p1$XiangXDZ': xxMatch,
        'p1$FanXRQ': '',
        'p1$WeiFHYY': '',
        'p1$ShangHJZD': '',
        'p1$QueZHZJC$Value': '否',
        'p1$QueZHZJC': '否',
        'p1$DangRGL': '否',  # whether quarantined today
        'p1$DaoXQLYGJ': '',  # countries traveled to
        'p1$DaoXQLYCS': '',  # cities traveled to
        'p1$Address2': '中国',
        'p1$SuiSM': '绿色',  # Suishen health-code color
        'p1$LvMa14Days': '是',  # green health code for 14 consecutive days
        'p1$GeLDZ': '',
        'p1_GeLSM_Collapsed': 'false',
        'p1_SuiSMSM_Collapsed': 'false',
    }
def test_ongoing_events_in_event_list(self, managers_timezone_mock,
                                      tag_timezone_mock):
    managers_timezone_mock.now.return_value = tz_datetime(2014, 4, 7, 9, 30)
    tag_timezone_mock.now.return_value = tz_datetime(2014, 4, 7, 9, 30)
    root_page = self.create_root_page(
        publication_date=tz_datetime(2014, 4, 1))
    root_page.publish('en')
    page = api.create_page(
        title='Events en', template=self.template, language='en',
        published=True, parent=root_page,
        apphook='EventListAppHook',
        apphook_namespace=self.app_config.namespace,
        publication_date=tz_datetime(2014, 4, 1))
    page.publish('en')
    # happens on Apr 5
    ev1 = self.create_event(title='ev1',
                            start_date=tz_datetime(2014, 4, 5),
                            publish_at=tz_datetime(2014, 4, 1))
    # Apr 6 12:00 to Apr 7 9:00
    ev2 = self.create_event(title='ev2',
                            start_date=tz_datetime(2014, 4, 6),
                            end_date=tz_datetime(2014, 4, 7),
                            start_time='12:00', end_time='09:00',
                            publish_at=tz_datetime(2014, 4, 2))
    # happens on Apr 7
    ev3 = self.create_event(title='ev3',
                            start_date=tz_datetime(2014, 4, 7),
                            publish_at=tz_datetime(2014, 4, 3))
    # happens on Apr 8
    ev4 = self.create_event(title='ev4',
                            start_date=tz_datetime(2014, 4, 8),
                            publish_at=tz_datetime(2014, 4, 4))
    # set up app config
    original_app_data = self.app_config.app_data.copy()
    self.app_config.app_data = {'config': {'show_ongoing_first': True}}
    self.app_config.save()
    with force_language('en'):
        response = self.client.get(page.get_absolute_url('en'))
    context = response.context_data
    # tear down app config
    self.app_config.app_data = original_app_data
    self.app_config.save()
    actual_ongoing = [event.pk for event in context['ongoing_objects']]
    expected_ongoing = [event.pk for event in [ev2, ev3]]
    self.assertEqual(actual_ongoing, expected_ongoing)
    actual_object_list = [event.pk for event in context['object_list']]
    expected_object_list = [event.pk for event in [ev4, ev1]]
    self.assertEqual(actual_object_list, expected_object_list)
    ongoing_list = PyQuery(response.content)('.events-upcoming')
    links = ongoing_list.find('h2 a')
    self.assertEqual(2, links.length)
    self.assertEqual(ev4.get_absolute_url(), links[0].attrib['href'])
    self.assertEqual(ev1.get_absolute_url(), links[1].attrib['href'])
# -*- coding: utf-8 -*-
from pyquery import PyQuery

q = PyQuery('https://kabutan.jp/stock/?code=7203')
sector = q.find('#stockinfo_i2 > div > a')[0].text
print(sector)
def isMetaRefresh(self, this):
    # use the call form of .attr(); it returns None when the attribute is absent
    httpEquiv = PyQuery(this).attr('http-equiv')
    return (httpEquiv and httpEquiv.find('refresh') > -1)
def get_search_list_html(self, keyword, session):
    param_list = []
    try:
        session.headers = {
            "Host": "gsxt.zjaic.gov.cn",
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:50.0) Gecko/20100101 Firefox/50.0",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
            "Accept-Encoding": "gzip, deflate",
            "Connection": "keep-alive",
            "Referer": "http://zj.gsxt.gov.cn/client/entsearch/list?isOpanomaly=&pubType=1&searchKeyWord=0B46FE9E9DBAF27F&currentPage=2",
        }
        # first obtain the encrypted keyword
        script = "strEnc('{keyword}','a','b','c')".format(keyword=keyword)
        search_key_word = self.get_encry_pripid_detail(
            encry_zj_conf['url'], script)
        if search_key_word is None:
            return param_list, self.SEARCH_ERROR
        search_url = ('http://{host}/client/entsearch/list'
                      '?isOpanomaly=&pubType=1&searchKeyWord={searchkey}').format(
            host=self.host, searchkey=search_key_word)
        r = self.task_request(session, session.get, url=search_url)
        if r is None:
            return param_list, self.SEARCH_ERROR
        content = r.text
        if content is None:
            return param_list, self.SEARCH_ERROR
        # this IP has been banned
        if util.judge_feature(content):
            self.report_session_proxy(session)
            return param_list, self.SEARCH_ERROR
        jq = PyQuery(content, parser='html')
        # first check how many results there are
        if jq.find('h3.title').find('span.light').text() == '0':
            return param_list, self.SEARCH_NOTHING_FIND
        item_list = jq.find('div.mod.enterprise-info').find(
            '.enterprise-info-list').find('li').items()
        for item in item_list:
            a_info = item.find('a')
            if a_info is None or len(a_info) <= 0:
                continue
            href = a_info.attr('href')
            if href is None or href == '':
                continue
            a_info.find('span[class=tip]').remove()
            a_info.find('i').remove()
            company = a_info.text()
            search_name = company.replace(' ', '')
            if search_name == '':
                # skip entries without a usable company name
                continue
            param = {
                'Referer': search_url,
                'href': href,
                'search_name': search_name,
            }
            seed_code = None
            code_text = item.find('.item-text').find('.code').text()
            if code_text is not None and code_text.strip() != '':
                part = code_text.split(':')
                if len(part) >= 2:
                    seed_code = part[1]
            if seed_code is not None and seed_code.strip() != '':
                param['unified_social_credit_code'] = seed_code
            param_list.append(param)
    except Exception as e:
        self.log.exception(e)
        return param_list, self.SEARCH_ERROR
    return param_list, self.SEARCH_SUCCESS if len(param_list) > 0 else self.SEARCH_ERROR