def test_calendar_tag_rendering(self, timezone_mock): timezone_mock.now.return_value = tz_datetime(2015, 1, 10, 12) page_with_apphook = self.create_base_pages() other_config = EventsConfig.objects.create(namespace='other') self.create_event( title='ev1', start_date=tz_datetime(2015, 1, 13), publish_at=tz_datetime(2015, 1, 10) ) self.create_event( title='ev2', start_date=tz_datetime(2015, 1, 15), publish_at=tz_datetime(2015, 1, 10) ) self.create_event( de=dict( title='ev3', start_date=tz_datetime(2015, 1, 16), publish_at=tz_datetime(2015, 1, 10) ) ) self.create_event( title='ev4', start_date=tz_datetime(2015, 1, 18), publish_at=tz_datetime(2015, 1, 10), app_config=other_config ) self.create_event( title='ev5', start_date=tz_datetime(2015, 1, 22), end_date=tz_datetime(2015, 1, 27), publish_at=tz_datetime(2015, 1, 10) ) self.create_event( title='ev6', start_date=tz_datetime(2015, 1, 25), ) # make use of default tests self.app_config namespace, instead of # hard coding it template_str = """ {%% load aldryn_events %%} {%% calendar 2015 1 'en' '%s' %%} """ % self.app_config.namespace t = Template(template_str) with override('en'): html = t.render(SekizaiContext({})) table = PyQuery(html)('table.table-calendar') page_url_en = page_with_apphook.get_absolute_url() links = table.find('td.events, td.multiday-events').find('a') # test if tag rendered important elements self.assertEqual('1', table.attr('data-month-numeric'), ) self.assertEqual('2015', table.attr('data-year')) self.assertEqual('10', table.find('td.today').text()) self.assertEqual(8, links.length) # 13, 15, 22, 23, 24, 25, 26, 27 expected_days = (13, 15, 22, 23, 24, 25, 26, 27) for position, day in enumerate(expected_days): event_url = '{0}2015/1/{1}/'.format(page_url_en, day) rendered_url = links[position].attrib['href'] self.assertEqual(event_url, rendered_url)
def find_external_links(url):
    '''Look for links to files in a web page and return them as a set.'''
    links = set()
    try:
        response = get(url)
        if response.status_code != 200:
            app.logger.warning('Error while getting proxy info for: %s. '
                               'Error details: %s', url, response.text)
        elif response.content:
            p = PyQuery(response.content)
            for anchor in p("a"):
                panchor = PyQuery(anchor)
                href = panchor.attr("href")
                if url_is_egg_file(href):
                    # href points to a filename
                    href = get_absolute_url(href, url)
                    links.add('<a href="%s">%s</a>' % (href, panchor.text()))
    except Exception:
        # Something went wrong while looking for external links
        # (timeout, HTML parser error, etc.). We must not fail here,
        # only log the error.
        app.logger.exception('')
    return links
def scrape_homepage(self, **kwargs): """ Scrape! """ logger.info('Scraping homepage (start time: %s)' % self.run_time) if not kwargs: response = requests.get(self.url) page = PyQuery(response.content) else: page = PyQuery(**kwargs) article_elements = page('.stories-wrap article') slot = 0 articles = [] for el in article_elements: element = PyQuery(el) article = Article(element, self.run_time) if not article.story_id and not article.is_apps_project: continue if not element.hasClass('attachment'): slot += 1 article.slot = slot articles.append(article) logger.info('Scraped %s from homepage (%s)' % (article.story_id, article.headline)) return articles
def test_render_attribute_to_document(): document = PyQuery('<a attribute="{ value }" data-riot-id="0"></a>') expression = {'expression': '{ value }', 'attribute': 'attribute', 'type': 'attribute', 'node': document} render_document([expression], {'value': 'value'}) assert document.outer_html() == '<a attribute="value" data-riot-id="0" data-riot-dirty="true"></a>' render_document([expression], {'value': 1}) assert document.outer_html() == '<a attribute="1" data-riot-id="0" data-riot-dirty="true"></a>'
def html_to_records(html):
    pq = PyQuery(html)
    rows = pq.find('table tr')
    # Build each row as a list so the headers can be reused for every data row
    # (a bare map() would be a one-shot iterator under Python 3).
    get_row = lambda r: [cell.text for cell in r]
    headers = get_row(rows[0])
    for row in rows[1:]:
        yield dict(zip(headers, get_row(row)))
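# A minimal usage sketch for html_to_records above; the sample markup is
# made up for illustration and is not taken from the original source.
sample_html = (
    '<div><table>'
    '<tr><th>name</th><th>age</th></tr>'
    '<tr><td>alice</td><td>30</td></tr>'
    '<tr><td>bob</td><td>25</td></tr>'
    '</table></div>'
)
for record in html_to_records(sample_html):
    print(record)  # {'name': 'alice', 'age': '30'}, then {'name': 'bob', 'age': '25'}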
def extract(self): self.html = re.sub("<!--.*?-->", "", self.html) doc = PyQuery(self.html) content_node = doc("div#blog_article_content") content = content_node.outerHtml() cpl = re.compile('<img.*?src=".*?"', re.I) content = re.sub("%", "%%", content) content_doc = PyQuery(content) content_doc("img").attr("src", "%s") item = ContentItem() item["title"] = self.title = doc("div.blog_main_left_content").find("h3").text() item["author"] = self.author = doc("div#common_person_blogtitle")("div#title01")("a").text() item["content"] = self.content = content_doc.outerHtml() self.release_time = doc("div.blog_main_time").find("p").text().strip() item["release_time"] = self.release_time item["source"] = u"凤凰网" item["pic_url"] = "" item["image_urls"] = [img.get("src") for img in content_node("img")] return item
def get_saml_response(response): tree = PyQuery(response.content) inputtag = tree.find('input[name="SAMLResponse"]') assert len(inputtag) == 1 encoded_response = inputtag[0].get('value') samlresponse = base64.b64decode(encoded_response) return samlresponse
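# A self-contained sketch of get_saml_response above, driven by a stub
# response object instead of a live HTTP response (all values are made up).
import base64
from collections import namedtuple

FakeResponse = namedtuple('FakeResponse', 'content')
encoded = base64.b64encode(b'<samlp:Response>demo</samlp:Response>').decode()
fake = FakeResponse(
    content='<form><input name="SAMLResponse" value="%s"/></form>' % encoded)
print(get_saml_response(fake))  # b'<samlp:Response>demo</samlp:Response>'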
def next(self): if self.i == self.categories_iter.length: raise StopIteration link = self.categories_iter[self.i] py_link = PyQuery(link) href = py_link.attr('href') html_class = href.split('/')[-1:][0] title = py_link.text() thumbnail_url = self.crawler.baseurl + PyQuery(link).find('img').attr('src') url = self.crawler.category_url % href category = Category(title, url, html_class, thumbnail_url) shows = Shows(self.crawler, url) tmp = list() tmp.append(shows) if title == 'Nyheter': news_url = self.crawler.news_url % href news_shows = Shows(self.crawler, news_url) tmp.append(news_shows) category.shows = itertools.chain(*tmp) self.i += 1 return category
def get_urls(): doc = PyQuery('https://www.peterbe.com/plog/') doc.make_links_absolute(base_url='https://www.peterbe.com') urls = [] for a in doc('dd a'): urls.append(a.attrib['href']) return urls
def extract_sample_and_codes_using_css(doc, sample_css_class):
    sample_columns = {}
    code_column = None
    label_column = None
    table = doc('#dataTable > .variablesList')
    for i, th in enumerate(table('tr.fullHeader:first > th')):
        th = PyQuery(th)
        if code_column is None and th.hasClass('codesColumn'):
            code_column = i
        elif label_column is None and th.hasClass('labelColumn'):
            label_column = i
        elif th.hasClass(sample_css_class):
            sample_columns[i] = th.text()
    # Extract actual values for the variables.
    variable_info = []
    for row in table('tr.variables'):
        columns = PyQuery(row)('td')
        code = PyQuery(columns[code_column]).text().strip()
        label = PyQuery(columns[label_column]).text().strip()
        availability = [smpl.strip() for i, smpl in sample_columns.items()
                        if PyQuery(columns[i]).text().strip() != '.']
        variable_info.append({'code': code, 'label': label,
                              'availability': availability})
    return variable_info, len(sample_columns)
def getResTb():
    html = fileworker.getHTML()
    pq = PyQuery(html)
    result = dict()
    blocks = list()
    for i in pq.items('.row.result'):
        blocks.append(i)
def crawl_vvic_category_tree(wb): h = httplib2.Http() response, content = h.request("http://www.vvic.com/") # fw = open("C:users/chenweiqiang/desktop/vvic2.html", "w") # fw.write(content) # fw.close() ws = wb.add_sheet("vvic品类树") ws.write(0,0,"一级品类") ws.write(0,1,"二级品类") ws.write(0,2,"三级品类") row = 0 doc = PyQuery(content) level1NodeList = doc("div.dd-inner > div.item") anotherLevel1NodeList = [doc('div.sub-items')[0], doc('div.sub-items')[1], doc('div.sub-items')[2], doc('div.sub-items')[5]] for index, level1Node in enumerate(level1NodeList): level1_category = PyQuery(level1Node)('h3 > a').text() level2NodeList = PyQuery(anotherLevel1NodeList[index]).children('dl') for level2Node in level2NodeList: level2NodeQ = PyQuery(level2Node) level2_category = level2NodeQ.children('dt > a').text() level3NodeList = level2NodeQ.children('dd > a') for level3Node in level3NodeList: level3_category = PyQuery(level3Node).text() row += 1 ws.write(row, 0, level1_category) ws.write(row, 1, level2_category) ws.write(row, 2, level3_category)
def crawl_1688_category_tree(wb):
    # fr = open("C:users/chenweiqiang/desktop/ye.html", "r")  # elements could not be extracted after parsing that saved file with PyQuery
    h = httplib2.Http()
    response, content = h.request("https://ye.1688.com/")
    # fw = open("C:users/chenweiqiang/desktop/ye2.html", "w")
    # fw.write(content)
    # fw.close()
    ws = wb.add_sheet("ye.1688品类树")  # sheet name: "ye.1688 category tree"
    # column headers: level-1 / level-2 / level-3 category
    ws.write(0, 0, "一级品类")
    ws.write(0, 1, "二级品类")
    ws.write(0, 2, "三级品类")
    row = 0
    doc = PyQuery(content)
    level1NodeList = doc("li.cat-box")
    for level1Node in level1NodeList:
        level1NodeQ = PyQuery(level1Node)
        level1_category = level1NodeQ('div.cat-main').text().replace(' ', '')
        level2NodeList = level1NodeQ('div.cat-sub-col > dl')  # the extra div[class="cat-sub "] > level in the selector is unnecessary
        for level2Node in level2NodeList:
            level2NodeQ = PyQuery(level2Node)
            level2_category = level2NodeQ('dt > a').text()
            level3NodeList = level2NodeQ('dd.cat-list > ul > li > a')
            for level3Node in level3NodeList:
                level3NodeQ = PyQuery(level3Node)
                level3_category = level3NodeQ.text()
                row += 1
                ws.write(row, 0, level1_category)
                ws.write(row, 1, level2_category)
                ws.write(row, 2, level3_category)
def station_parse(content):
    '''Parse the bus stations on a route and the bus status at each station.'''
    OFF = '0-0'
    stations = []
    bus_status = []
    content = json.loads(content[3:].decode('utf-8'))
    status = content['status']
    info = content['info']
    if status == 1 and info != '':
        pqContent = PyQuery(info)('#upInfo li')
        for station in pqContent:
            pqStation = PyQuery(station)
            station_name = pqStation('.station').text()
            stations.append(station_name)
            buses = pqStation.find('.bus')
            if buses.size() > 0:
                left_count = 0
                on_station_count = 0
                for bus in buses:
                    # buses with an inline style are counted as having left the
                    # station, the rest as currently at it
                    if PyQuery(bus).attr('style'):
                        left_count += 1
                    else:
                        on_station_count += 1
                bus_status.append('{0}-{1}'.format(on_station_count, left_count))
            else:
                bus_status.append(OFF)
    if not stations:
        return None
    return (tuple(bus_status), tuple(stations))
def parseProductPage(product, need_img_urls=False):
    """Visit the product detail page and scrape four extra fields:
    delivery, reviews, star and total_sales."""
    if product['product_url']:
        content = fetchContent(product['product_url'], False)
        doc = PyQuery(content)
        # product['delivery'] = doc("div.cost-entries-type > p > em.value").text()
        # (the delivery fee is rendered dynamically by JS and could not be scraped)
        product['reviews'] = doc('p.satisfaction-number > a > em.value').text()
        product['star'] = doc('p.star-level > i').attr("class")
        product['total_sales'] = doc('p.bargain-number > a > em.value').text()
        if need_img_urls:
            url_list = get_img_urls(content)
            product['img_urls'] = ', '.join(url_list)
        else:
            product['img_urls'] = ''
        product['color'], product['size'] = '', ''
        # 'de-feature' cells labelled 颜色 (color) / 尺寸 (size) are followed by the value cell
        for index, td in enumerate(doc('div.obj-content > table > tbody > tr > td')):
            tdQ = PyQuery(td)
            if tdQ.attr('class') == 'de-feature' and tdQ.text().strip() == u'颜色':
                product['color'] = PyQuery(doc('div.obj-content > table > tbody > tr > td')[index+1]).text()
            if tdQ.attr('class') == 'de-feature' and tdQ.text().strip() == u'尺寸':
                product['size'] = PyQuery(doc('div.obj-content > table > tbody > tr > td')[index+1]).text()
        product['MOQ'] = extractNum(doc('tr.amount > td.ladder-1-1 > span.value').text().replace(u"≥", ""))
        if not product['MOQ'] or product['MOQ'] == 0:
            product['MOQ'] = extractNum(PyQuery(doc('tr.amount').remove('td.amount-title').children('td').eq(0))('span.value').text())
        if product['MOQ'] == 1:
            # print product['product_url']
            product['sku_size'] = PyQuery(doc('div.unit-detail-spec-operator').eq(0))('span.text').text()
            product['sku_color'] = PyQuery(doc('table.table-sku > tr').eq(0))('td.name').text()
            product['sku_price'] = PyQuery(doc('table.table-sku > tr').eq(0))('td.price').text()
            product['sku_amount'] = PyQuery(doc('table.table-sku > tr').eq(0))('td.count > span > em.value').text()
            print product['sku_id'], '\t', product['sku_size'], "\t", product['sku_color'], "\t", product['sku_price'], "\t", product['sku_amount']
    return product
def parse(html):
    '''return a list of dictionaries describing the stories on the front page'''
    elements = []
    p = PyQuery(html)
    # 90s markup woohoo!
    anchors = p('.title:nth-child(3) a:nth-child(1)')
    for a in anchors:
        # have to re-wrap here, because PyQuery just exposes internal lxml objects upon getting iterated
        a = PyQuery(a)
        subtext = a.closest('tr').next().find('.subtext')
        if not subtext:
            # More link
            continue
        # materialise the map so the cells can be indexed (a bare map object
        # is not subscriptable under Python 3)
        children = list(map(PyQuery, subtext.children()))
        try:
            span, submitted, comments = children[0], children[1], children[-1]
        except IndexError:
            # filter out ads
            continue
        comments = comments.text().rpartition(' ')[0]
        comments = int(comments) if comments else 0
        url = a.attr('href')
        elements.append({
            'pos': len(elements) + 1,
            'title': a.text(),
            'url': url,
            'domain': urlparse(url).netloc.rpartition('www.')[2],
            'comments': comments,
            'submitter': submitted.text(),
            'points': int(span.text().split()[0]),
            'id': int(span.attr('id').split('_', 1)[1]),
            'ago': submitted[0].tail.split('ago')[0].strip(),
        })
    logging.warning('parsed %s elements', len(elements))
    return elements
def _split(inputfile, outputdir): source = open(inputfile, 'r') html = source.read() source.close() if not os.path.isdir(outputdir): os.mkdir(outputdir) idx_slide=0 idx_section=0 parsed = PyQuery(html) for section in parsed('section'): slide = PyQuery(section) if slide.has_class('stack'): idx_section+=1 stack_path = os.path.join(outputdir,'%02d' % idx_section ) os.mkdir(stack_path) for sub_slide in PyQuery(slide.html())('section'): idx_slide+=1 _dump_slide(sub_slide, idx_slide, stack_path) else: if not slide.parent().has_class('stack'): idx_slide+=1 _dump_slide(slide, idx_slide, outputdir)
def build_tree(self, doc, xpath, parent=None, prefix=""): tree = [] nodes = doc.xpath(xpath) for (i, node) in enumerate(nodes): link = node.xpath(self.link_xpath) if not link: continue title, url = link[0].text, link[0].get("href") if not title: # when <code> inside it, link[0].text dose not work properly html = etree.tostring(link[0]).decode() pq = PyQuery(html) title = pq.text() title = re.sub(r"\s+", " ", title).strip() tree_node = TreeNode(title, url) tree_node.children = self.build_tree(node, self.children_xpath, tree_node, "{}{}.".format(prefix, i + 1)) if url: tree_node.abs_url = urllib.parse.urljoin(self.index_url, tree_node.url) # tree_node.save_to = "{}{}_{}.html".format(prefix, i + 1, filter_illegal_path_chars(tree_node.title)) tree_node.save_to = filter_illegal_path_chars(url_path_to_filename(tree_node.abs_url)) elif tree_node.children: # if parent doesn't have url, then set it to it's first child's url tree_node.abs_url = tree_node.children[0].abs_url tree_node.save_to = tree_node.children[0].save_to else: self.logger.warning("no children and no link?") continue tree.append(tree_node) return tree or None
def getPageLinkIfValid(element, currentPageNumber): pyElement = PyQuery(element) pageNumberText = pyElement.find('span').text() if pageNumberText.isdigit() and int(pageNumberText) > currentPageNumber: return 'https://www.youtube.com' + pyElement.attr('href') return None
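# A quick illustrative call of getPageLinkIfValid above; the anchor markup
# mimics a pagination link and the values are made up.
from pyquery import PyQuery

anchor = PyQuery('<a href="/results?page=3"><span>3</span></a>')[0]
print(getPageLinkIfValid(anchor, 2))  # https://www.youtube.com/results?page=3
print(getPageLinkIfValid(anchor, 5))  # None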
def get_smileys(): global n logging.info('Récupération des émoticones') n = 0 d = PyQuery(url=config.rooturl+'/admin/index.forum?part=themes&sub=avatars&mode=smilies&extended_admin=1&' + tid, opener=fa_opener) result = re.search('function do_pagination_start\(\)[^\}]*start = \(start > \d+\) \? (\d+) : start;[^\}]*start = \(start - 1\) \* (\d+);[^\}]*\}', d.text()) try: pages = int(result.group(1)) usersperpages = int(result.group(2)) except: pages = 1 usersperpages = 0 if config.debug: progress = progressbar.NoProgressBar() else: progress = progressbar.ProgressBar(widgets=[BarVar(), ' ', progressbar.Bar("#","[","]"), progressbar.Percentage()], maxval=pages-1) progress.start() for page in range(0,pages): if page >= 1: d = PyQuery(url=config.rooturl + '/admin/index.forum?part=themes&sub=avatars&mode=smilies&extended_admin=1&start=' + str(page*usersperpages) + '&' + tid, opener=fa_opener) for i in d('table tr'): e = PyQuery(i) if e("td").eq(0).text() != None and e("td").eq(0).attr("colspan") == None: save.smileys[e("td").eq(0).text()] = e("td").eq(1).text() n += 1 progress.update(page) progress.end()
def process_file(filename): """Read a file from disk and parse it into a structured dict.""" try: with codecs.open(filename, encoding='utf-8', mode='r') as f: file_contents = f.read() except IOError as e: log.info('Unable to index file: %s, error :%s', filename, e) return data = json.loads(file_contents) sections = [] title = '' body_content = '' if 'current_page_name' in data: path = data['current_page_name'] else: log.info('Unable to index file due to no name %s', filename) return None if 'body' in data and data['body']: body = PyQuery(data['body']) body_content = body.text().replace(u'¶', '') sections.extend(generate_sections_from_pyquery(body)) else: log.info('Unable to index content for: %s', filename) if 'title' in data: title = data['title'] if title.startswith('<'): title = PyQuery(data['title']).text() else: log.info('Unable to index title for: %s', filename) return {'headers': process_headers(data, filename), 'content': body_content, 'path': path, 'title': title, 'sections': sections}
def _parse_table(self, table): # Initialize table parsed_rows = [] # Parse table qtable = PyQuery(table) # Get headers headers = self._get_headers(qtable) if not headers: return # Get rows rows = qtable.find("tr") # Loop over rows for row in rows: # Get columns qrow = PyQuery(row) cols = qrow.find("td").map(self._get_text)[:] # Parse column values for colidx in range(len(cols)): col = reduce(lambda x, y: re.sub(y[0], y[1], x), self._trans, cols[colidx]) cols[colidx] = col # Append parsed columns if cols: parsed_rows.append(cols) return {"headers": headers, "data": parsed_rows}
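# A stripped-down, standalone sketch of the row-extraction pattern used by
# _parse_table above (header handling and regex cleanup omitted); the table
# markup is made up.
from pyquery import PyQuery

qtable = PyQuery(
    '<table>'
    '<tr><td>1.0</td><td>2.0</td></tr>'
    '<tr><td>3.0</td><td>4.0</td></tr>'
    '</table>')
parsed_rows = []
for row in qtable.find('tr'):
    cols = list(PyQuery(row).find('td').map(lambda i, el: PyQuery(el).text()))
    if cols:
        parsed_rows.append(cols)
print(parsed_rows)  # [['1.0', '2.0'], ['3.0', '4.0']]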
def list_page(self, response): result_content = {} content_iter = re.finditer(r"STK && STK.pageletM && STK.pageletM.view\((?P<content>\{.*?\})\)", response.content) for iter in content_iter: ok, content = safe_loads(iter.groupdict()['content']) if ok and "pl_weibo_direct" == content.get("pid"): result_content = content break else: return {} pyquery_doc = PyQuery(result_content["html"]) pyquery_doc.make_links_absolute(response.url) items = [] for item in pyquery_doc("DIV.feed_lists>DIV.WB_cardwrap>DIV").items(): weibo_href = item("DIV.content>DIV.feed_from>A").attr.href if weibo_href: weibo_pics = [] for pic in item("DIV.feed_content DIV.media_box IMG").items(): weibo_pics.append(pic.attr.src) data = { "content": item("DIV.feed_content P.comment_txt").text(), "nickname": item("DIV.feed_content A.W_texta").attr.title, "href": weibo_href, "quote_nickname": item("DIV.feed_content DIV.comment DIV.comment_info A.W_texta").attr.title, "quote_content": item("DIV.feed_content DIV.comment DIV.comment_info P.comment_txt").text(), "pics": ''.join(weibo_pics) } self.crawl("data:,%s" % weibo_href, callback = self.detail_page, data_fetch_content=data)
def update_forums(client, group, session): logging.info("Updating forums list for {}".format(group)) query = Forum.get_forum_page(client, group.gid) reg = regex.compile(r"^forum\.php\?mod=forumdisplay&fid=(\d+)$") for row in query.find("table.fl_tb>tr"): sub_query = PyQuery(row) href = sub_query.find("td").eq(1).find("a").attr("href") if not href: continue fid = int(reg.findall(href)[0]) name = sub_query.find("td").eq(1).find("h2>a").clone().children().remove().end().text() last_update = sub_query.find("td").eq(3).find("div>cite").clone().children().remove().end().text() last_update = dateparse(last_update) existence = session.query(Forum).filter(Forum.fid == fid) if existence.count() == 0: logging.info("<Forum(fid={})> not found, creating one".format(fid)) forum = Forum(fid=fid, name=name, updated_at=last_update, group=group, fresh=False) session.add(forum) else: forum = existence.one() if forum.updated_at != last_update: logging.info("{} found, stale: against {} ".format(forum, last_update)) forum.updated_at = last_update forum.fresh = False session.add(forum) else: logging.info("{} found, fresh".format(forum))
def _enhance_text(self): """ Transforms a simplified text into a valid mail.template text. :return: mail.template text """ self.ensure_one() # Parse and set back the keywords into raw template code html_text = PyQuery(self.simplified_text.replace('\n', '')) def sort_keywords(kw): # Replace first if/for-clauses, then var, then code index = kw.position if kw.type == 'if' or 'for' in kw.type: index += 2*len(self.body_html) * kw.nested_position # Take if and for in the appearing order in the text index -= kw.position elif kw.type == 'var': index += len(self.body_html) return index keywords = self.keyword_ids.sorted(sort_keywords, reverse=True) # Replace automatic-generated keywords for keyword in keywords: keyword_text = html_text('#' + keyword.html_id) keyword_text.replace_with(keyword.final_text) # Replace user added keywords template_text = html_text.html() for keyword in keywords.filtered(lambda k: k.type == 'code'): to_replace = u"[{}]".format(keyword.short_code) template_text = template_text.replace(to_replace, keyword.raw_code) final_text = PyQuery(BeautifulSoup(template_text).prettify()) return final_text('body').html()
def get_meme_url(meme): gen = GENERATORS.get(meme) if gen: pq = PyQuery(url="http://memegenerator.net/%s" % gen[2]) return pq.find('a img.large').attr('src') else: return None
def __extract(self, html): pq = PyQuery(html).find("main#main #mainArea table") selector_ = "thead tr:eq(0) th" date_order = [PyQuery(v).text().split('\n')[0] for v in PyQuery(pq).find(selector_)][3:] result = {d: {} for d in date_order} index = 0 total = len(PyQuery(pq).find("tbody tr")) while index < total: td = PyQuery(pq).find("tbody tr:eq(%d) td:eq(0)" % index) room_type = td.text().split()[0] rowspan = int(td.attr('rowspan')) for i in xrange(index, index + rowspan): row = PyQuery(pq).find("tbody tr:eq(%d)" % i) # smoking or not smoking = PyQuery(row).find("td.alC.alM > img").attr("alt") room = "%s (%s)" % (room_type, smoking) if row.hasClass('clubCardCell'): member_type = 'member' else: member_type = 'guest' for i, v in enumerate(self.__extract_price_remain(row)): if room not in result[date_order[i]]: result[date_order[i]][room] = {} result[date_order[i]][room][member_type] = v index += rowspan return result
def get_bounds(scene_name): """Use Earth Explorer metadata to get bounds of a Scene""" url_code = get_metadata_code(scene_name) metadata = PyQuery( 'http://earthexplorer.usgs.gov/fgdc/%s/%s/' % (url_code, scene_name) ) metadata = metadata.text()[ metadata.text().find('G-Ring_Latitude:'): metadata.text().find('\n Keywords:') ] coords = ( metadata.replace(' ', '') .replace('G-Ring_Latitude:', '') .replace('G-Ring_Longitude:', '') .split('\n') ) coords = [float(coord) for coord in coords if coord != ''] # create a list of lists with the coordinates coords = [coords[i:i + 2] for i in range(0, len(coords), 2)] # use reverse() to change [lat, lon] to [lon, lat] [coord.reverse() for coord in coords] # repeat the first coordinate on the end of the list if coords[0] != coords[-1]: coords.append(coords[0]) return coords
def download(threadUrl):
    """Download every attachment linked from a thread page and queue it in IDM."""
    d = PyQuery(url=threadUrl, parser='soup')
    links = d('a[href^="job.php?action=download&aid="]')
    # extract the value of verifyhash
    tmp = d('script:contains("var verifyhash =")').text()
    verify = re.search(r"var verifyhash = '(.*?)'", tmp).group(1)
    total = len(links)
    d.make_links_absolute()
    for i, e in enumerate(links.items(), start=1):
        filename = e.text()
        print('%s/%s %s' % (i, total, filename))
        if not os.path.exists(os.path.join(SAVE_PATH, filename)):
            params = urlencode(
                {'check': 1, 'verify': verify, 'nowtime': int(time.time() * 1000)})
            url = '%s?%s' % (e.attr['href'], params)
            print(' fetch: ' + url)
            downDoc = PyQuery(url, headers=headers)
            # link 0 is the China Telecom mirror, link 1 is the China Mobile mirror
            downUrl = BASE_URL + downDoc('a[href^="remotedown.php"]').eq(1).attr('href')
            addToIDM(downUrl, SAVE_PATH, filename)
            time.sleep(1.5)
    wefiler_urls = checkWefiler(d)
    if wefiler_urls:
        print(wefiler_urls)
def get_forums(): logging.info('Récupération des forums') if config.debug: progress = progressbar.NoProgressBar() else: progress = progressbar.ProgressBar(widgets=[progressbar.SimpleProgress('/'), ' ', progressbar.Bar("#","[","]"), progressbar.Percentage()]) d = PyQuery(url=config.rooturl + '/a-f1/', opener=fa_opener) save.forums = [] levels = {} n = 1 for i in progress([i for i in d.find("select option") if i.get("value", "-1") != "-1"]): id = i.get("value", "-1") logging.debug('Récupération: forum %s', id) title = re.search('(((\||\xa0)(\xa0\xa0\xa0))*)\|--([^<]+)', i.text).group(5) level = len(re.findall('(\||\xa0)\xa0\xa0\xa0', i.text)) if level <= 0: parent = 0 else: parent = levels[level-1] levels[level] = n d = PyQuery(url=config.rooturl+'/admin/index.forum?part=general&sub=general&mode=edit&fid=' + id + '&extended_admin=1&' + tid, opener=fa_opener) try: description = d("textarea").text() except: description = "" save.forums.append({'id': int(id[1:]), 'newid': n, 'type': id[0], 'parent': parent, 'title': title, 'description': description, 'parsed': False}) n += 1
def get_content(self, beta=False): url = self.url_list_betas if beta else self.url_list response = self.client.get(url) assert response.status_code == 200 return PyQuery(response.content)
def getTweets(tweetCriteria, receiveBuffer=None, bufferLength=100, user_agent=None): refreshCursor = '' results = [] resultsAux = [] cookieJar = http.cookiejar.CookieJar() active = True while active: json = TweetManager.getJsonReponse(tweetCriteria, refreshCursor, cookieJar, user_agent) if len(json['items_html'].strip()) == 0: break refreshCursor = json['min_position'] tweets = PyQuery(json['items_html'])('div.js-stream-tweet') if len(tweets) == 0: break for tweetHTML in tweets: tweetPQ = PyQuery(tweetHTML) tweet = models.Tweet() usernameTweet = tweetPQ( "span.username.js-action-profile-name b").text() txt = re.sub( r"\s+", " ", tweetPQ("p.js-tweet-text").text().replace('# ', '#').replace( '@ ', '@')) retweets = int( tweetPQ( "span.ProfileTweet-action--retweet span.ProfileTweet-actionCount" ).attr("data-tweet-stat-count").replace(",", "")) favorites = int( tweetPQ( "span.ProfileTweet-action--favorite span.ProfileTweet-actionCount" ).attr("data-tweet-stat-count").replace(",", "")) dateSec = int( tweetPQ("small.time span.js-short-timestamp").attr( "data-time")) id = tweetPQ.attr("data-tweet-id") permalink = tweetPQ.attr("data-permalink-path") user_id = int( tweetPQ("a.js-user-profile-link").attr("data-user-id")) geo = '' geoSpan = tweetPQ('span.Tweet-geo') if len(geoSpan) > 0: geo = geoSpan.attr('title') urls = [] for link in tweetPQ("a"): try: urls.append((link.attrib["data-expanded-url"])) except KeyError: pass tweet.id = id tweet.permalink = 'https://twitter.com' + permalink tweet.username = usernameTweet tweet.text = txt tweet.date = datetime.datetime.fromtimestamp(dateSec) tweet.formatted_date = datetime.datetime.fromtimestamp( dateSec).strftime("%a %b %d %X +0000 %Y") tweet.retweets = retweets tweet.favorites = favorites tweet.mentions = " ".join( re.compile('(@\\w*)').findall(tweet.text)) tweet.hashtags = " ".join( re.compile('(#\\w*)').findall(tweet.text)) tweet.geo = geo tweet.urls = ",".join(urls) tweet.author_id = user_id results.append(tweet) resultsAux.append(tweet) if receiveBuffer and len(resultsAux) >= bufferLength: receiveBuffer(resultsAux) resultsAux = [] if tweetCriteria.maxTweets > 0 and len( results) >= tweetCriteria.maxTweets: print("active = False") active = False break if receiveBuffer and len(resultsAux) > 0: receiveBuffer(resultsAux) return results
result = browser.page_source
if companyname not in result:
    for cs in range(10):
        if companyname not in result:
            print("refreshing the page")
            browser.refresh()
            browser.switch_to_alert().accept()
            time.sleep(4)
            html = browser.page_source
        else:
            break
    if companyname not in result:
        print("no information found for this company")
        browser.quit()
        break
rq = PyQuery(result)
alist = rq('.search_list_item')
for i in alist:
    ad = PyQuery(i)
    t = ad('.f20').text().replace(" ", "")
    if ad('.f20').text() in companyname:
        xqurl = PyQuery(i).attr('href')
        break
# with open('gjxy.html', 'r', encoding='utf') as f:
#     html = f.read()
#     f.close()
headers = {
    "Accept-Language": "zh-CN,zh;q=0.9",
    "Accept-Encoding": "gzip, deflate",
option = webdriver.ChromeOptions() option.add_argument('headless') browser = webdriver.Chrome() base_url1 = r'https://cise.jsu.edu.cn/' base_url = r'https://cise.jsu.edu.cn/xygk/ggjs' all_page = ['.htm', '/1.htm', '/2.htm'] for i in all_page: url = base_url + i browser.get(url) page = browser.page_source h = PyQuery(page) all_pe = h( 'body > div.ntp > div > div > div.cla2 > div.cla22 > div.cla222 > div.gugan > div.gugan1 >a' ) for i in all_pe.items(): s = str(i.attr('href')) url1 = base_url1 + '/'.join(re.findall( r'\w+', s)[:-1]) + '.' + re.findall(r'\w+', s)[-1] r = browser.get(url1) k = PyQuery(browser.page_source) s = re.sub(r'<.+>', '', k('#vsb_content > div').text()) if s == '': s = '暂无' print(s[0:4]) print('====================') print(s)
def get_content(self): response = self.client.get(self.url_list) assert response.status_code == 200 return PyQuery(response.content)
def _zope_testbrowser_pyquery(self): from pyquery import PyQuery return PyQuery( self.contents.replace('xmlns="http://www.w3.org/1999/xhtml', ''))
def main(): global output_dir, generate_pdf, use_cache, max_retries, error_timeout, interactive parser = argparse.ArgumentParser( description='A downloader for the digi4school open library') parser.add_argument('-s', '--start', type=float, action='store', dest='start', default=0, required=False, help='Start the download at START percent') parser.add_argument('-e', '--end', type=float, action='store', dest='end', default=100, required=False, help='Stop the download at END percent') parser.add_argument( '-o', '--output-directory', type=str, action='store', dest='output_dir', default=output_dir, required=False, help='The directory into which the books should be downloaded') parser.add_argument('-g', '--generate-pdf', action='store_true', dest='generate_pdf', required=False, default=None, help='Generate a pdf when all files are downloaded') parser.add_argument( '-ng', '--no-generate-pdf', action='store_false', dest='generate_pdf', required=False, default=None, help='Do NOT generate a pdf when all files are downloaded') parser.add_argument('-u', '--use-cache', action='store_true', dest='use_cache', required=False, default=None, help='Use already downloaded (cached) extra files') parser.add_argument('-nu', '--no-use-cache', action='store_false', dest='use_cache', required=False, default=None, help='Download extra files again') parser.add_argument('-i', '--interactive', action='store_true', dest='interactive', required=False, default=None, help='Prompt before starting download') parser.add_argument( '-ni', '--no-interactive', action='store_false', dest='interactive', required=False, default=None, help='Do not prompt before starting download, start right away') parser.add_argument( '-m', '--max-retries', type=int, action='store', dest='max_retries', default=max_retries, required=False, help='Retry downloading MAX_RETRIES times before skipping the book') parser.add_argument( '-t', '--error-timeout', type=float, action='store', dest='error_timeout', default=error_timeout, required=False, help='Wait ERROR_TIMEOUT seconds before retrying the download') args = parser.parse_args() start = args.start end_percent = args.end output_dir = args.output_dir max_retries = args.max_retries error_timeout = args.error_timeout if args.generate_pdf is not None: generate_pdf = args.generate_pdf if args.use_cache is not None: use_cache = args.use_cache if args.interactive is not None: interactive = args.interactive if len(output_dir) < 1: output_dir = input("Output directory: ") Path(output_dir).mkdir(parents=True, exist_ok=True) signal.signal(signal.SIGUSR1, handle_usr1) r = requests.post(base_url + openshelf_path, get_all_books_payload, headers=headers) r.encoding = encoding books = PyQuery(r.text)('#shelf').find('a') book_count = len(books) print() print(str(book_count) + ' books') print("Output directory: " + output_dir) if interactive: input('Press [ENTER] to start the download') if start > 0: print("\nSkipping first %.2f %%..." % start) i = 0 for book in books.items(): if stop: stop_program() i += 1 percent = (i * 100 / book_count) if percent <= start: continue if percent > end_percent: print("Stopping at %.2f %% ..." 
% end_percent) break book_id = book.attr['data-id'] title = book('h1').text().replace('/', '-') current_path = os.path.join(output_dir, book_id) Path(current_path).mkdir(parents=True, exist_ok=True) print('\n\nDownloading book "' + book_id + "\" (%.2f %%)" % percent) if generate_pdf: if os.path.isfile(os.path.join(current_path, title + '.pdf')): print('Found existing PDF skipping...') continue else: if os.path.isfile(os.path.join(current_path, 'generate-pdf.sh')): print('Found PDF generation script, skipping...') continue # Writing info about book with open(os.path.join(current_path, 'info.txt'), 'w', encoding=encoding) as f: f.writelines( os.linesep.join([ u"Thumbnail: %s" % str(book('img').attr['src']), u"Title: %s" % str(book('h1').text()), u"Publisher: %s" % str(book('h2').text()) ])) if stop: stop_program() count = 0 end = 1 orig_book_id = book_id while count < end: book_id = orig_book_id count += 1 try: cookie_request = send_form( send_form( requests.get(base_url + token_path + book_id, headers=headers))) cookie_str = '' for cookie in cookie_request.cookies: if cookie.name == target_cookie_name: cookie_str = cookie.name + '=' + cookie.value + '; ' if len(cookie_str) < 1: end = handle_error('ERROR: Cookie not found!', count, end) continue if stop: stop_program() location = cookie_request.headers['Location'] if len(location) < 1: location = book_base_url + book_path + book_id + '/' print( 'WARNING: Can\'t find book location in header, assuming ' + location) r = requests.get(location, headers=headers, cookies=cookie_request.cookies, allow_redirects=False) r.encoding = encoding if r.status_code == 200: if 'IDRViewer' not in r.text and '<div id="content">' in r.text: print('Found extra content!') print('Downloading extra content...') book_content = PyQuery(r.text)("#content") extra_path = os.path.join(current_path, 'extra') extra_books = [] Path(extra_path).mkdir(parents=True, exist_ok=True) # Download root files for node in book_content( 'a:not(.sub):not(.directory)').items(): if not str(node.attr['href']).startswith('1/'): thumbnail_location = str( node('img').attr['src']) if thumbnail_location.endswith( 'thumbnails/1.jpg' ) and not thumbnail_location.startswith( 'http'): extra_books.append([ str(node.attr['href']).replace( '/index.html', ''), node('h1').text().replace('/', '-') ]) else: download_content(node, location, cookie_request.cookies, extra_path) if stop: stop_program() # Download content of all root directories for root_dir_node in book_content( 'a:not(.sub).directory').items(): root_dir = os.path.join( extra_path, root_dir_node('h1').text().replace('/', '-')) Path(root_dir).mkdir(parents=True, exist_ok=True) download_content_from_directory( book_content, root_dir_node.attr['id'], location, cookie_request.cookies, root_dir) print('Checking book_id ' + str(book_id) + '/1...') r = requests.get(location + "1/", headers=headers, cookies=cookie_request.cookies) if 'IDRViewer' not in r.text: print( 'WARNING: Book "' + book_id + '/1" looks weird (contains no "IDRViewer"! Checking extra books...' ) if len(extra_books) == 1: # COMPATIBILITY if os.path.exists( os.path.join(extra_path, extra_books[0][1])): print( 'INFO: For compatibility reasons, normal book download will be skipped!' 
) book_id = None else: book_id = orig_book_id + '/' + extra_books[ 0][0] print( 'Found one extra book, setting book_id to ' + str(book_id)) r = requests.get( location + extra_books[0][0] + '/', headers=headers, cookies=cookie_request.cookies) extra_books.pop(0) elif len(extra_books) > 1: print( 'WARNING: Found more than one extra book, skipping normal book download!' ) book_id = None else: print( 'WARNING: Found no extra book, skipping normal book download!' ) book_id = None else: book_id += '/1' print('Setting book_id to ' + str(book_id)) r = requests.get(location + '1/', headers=headers, cookies=cookie_request.cookies) for extra_book in extra_books: if stop: stop_program() print('Downloading extra book "' + extra_book[0] + '"...') er = requests.get(location + extra_book[0] + '/', headers=headers, cookies=cookie_request.cookies) if 'IDRViewer' not in er.text: print( 'WARNING: Extra book "' + extra_book[0] + '" looks weird (contains no "IDRViewer"! Skipping...' ) continue if download_book( extra_book[1], orig_book_id + '/' + extra_book[0], cookie_str, os.path.join(extra_path, extra_book[1]), er).returncode != 0: end = handle_error( 'ERROR: Error running digiRipper!', count, end) raise ConnectionError( 'Error downloading extra book "' + extra_book[0] + '"!') print('Downloaded extra content to "' + extra_path + '"') else: print( 'WARNING: Got wrong response code from book page, skipping check for extra material!' ) if stop: stop_program() if book_id is not None: if download_book(title, book_id, cookie_str, current_path, r).returncode != 0: end = handle_error('ERROR: Error running digiRipper!', count, end) continue else: Path(os.path.join(current_path, 'generate-pdf.sh')).touch() except Exception as e: end = handle_error('ERROR: An exception was thrown: ' + str(e), count, end) continue
def detail(self, url): try: resp = self.session.get(url, verify=False) status_code = resp.status_code pqhtml = PyQuery(resp.text or 'nothing') #下架 if status_code == 404: log_info = json.dumps( dict(time=time.time(), title=pqhtml('title').text(), url=url)) self.logger.info(log_info) data = tool.get_off_shelf(code=status_code, message=self.cfg.SOLD_OUT, backUrl=resp.url, html=pqhtml.outerHtml()) return tool.return_data(successful=False, data=data) #其他错误 if status_code != 200: log_info = json.dumps( dict(time=time.time(), title=pqhtml('title').text(), url=url)) self.logger.info(log_info) data = tool.get_error(code=status_code, message=self.cfg.GET_ERR.get( 'SCERR', 'ERROR'), backUrl=resp.url, html=pqhtml.outerHtml()) return tool.return_data(successful=False, data=data) #前期准备 area = pqhtml('.product-details') pdata = self.get_pdata(pqhtml) # print json.dumps(pdata) print area.outerHtml().encode('utf-8') # exit() #下架 # isNoSize,官网配置无size选项,shippingRestrictionsVisible,官网配置限制配送商品. # pdata['shippingRestrictions']['shippingRestrictionsVisible']: # 从pdata中读取数据,下架了即都是库存为0 detail = dict() #品牌 brand = pdata['brandName'] detail['brand'] = brand #名称 detail['name'] = pdata['name'] #货币 currency = pdata['price']['currency'] detail['currency'] = currency detail['currencySymbol'] = tool.get_unit(currency) #价格 price, listPrice = pdata['price']['current'], ( pdata['price']['rrp'] or pdata['price']['previous']) detail['price'] = price detail['listPrice'] = listPrice or price #颜色 color, colorId, img = self.get_color(pdata) detail['color'] = color detail['colorId'] = colorId #图片集,每个加参数,宽度1000(大图) imgs = map( lambda x: x + '?wid=1000', filter(lambda x: x, [Dic['url'] for Dic in pdata['images']])) detail['img'] = img detail['imgs'] = imgs #产品ID productId = pdata['id'] detail['productId'] = productId #规格 detail['sizes'] = self.get_sizes(pdata) #描述 detail['descr'] = area('.product-description').text() #详细 detail['detail'] = area('.product-details').text() #品牌描述 detail['brandDescr'] = area('.brand-description').text() #HTTP状态码 detail['status_code'] = status_code #状态 detail['status'] = self.cfg.STATUS_SALE #返回链接 detail['backUrl'] = resp.url #返回的IP和端口 if resp.raw._original_response.peer: detail['ip_port'] = ':'.join( map(lambda x: str(x), resp.raw._original_response.peer)) log_info = json.dumps( dict(time=time.time(), productId=detail['productId'], name=detail['name'], currency=detail['currency'], price=detail['price'], listPrice=detail['listPrice'], url=url)) self.logger.info(log_info) return tool.return_data(successful=True, data=detail) except Exception, e: try: return self.detail_old(url) except Exception, e: raise
def downLoad(href): r = requests.get(href, getHeaders()) r.encoding = 'utf8' q = PyQuery(r.text) #该链接下的图片总数 total = int(q('#page > a:nth-child(9)').text()) #该链接下的标题,用来创建文件夹 title = q('body > div.main > div.article > h2').text() href = q('div.content > a> img').attr.src[0:-5] dirName = u"【{}P】{}".format(total, title) # 定义要创建的目录 mkpath = "c:\\mymeizi\\" + dirName + "\\" # print mkpath # 调用函数 # print href if makedir(mkpath): print mkpath + "目录已创建" for x in range(1, total + 1): try: imgurl = href + str(x) + ".jpg" # urllib.urlretrieve(imgrul, mkpath + "/%s.jpg" % x) # opener = urllib2.build_opener() # opener.addheaders = [('Host','img.mmjpg.com'), # ('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) ' # 'Chrome/53.0.2785.104 Safari/537.36 Core/1.53.3427.400 QQBrowser/9.6.12513.400'), # ('Referer','http://www.mmjpg.com/mm')] # urllib2.install_opener(opener) # urllib2.urlretrieve(imgurl, mkpath + "/%s.jpg" % x) # os.chdir(r"D:") # header = { # 'Host': 'img.mmjpg.com', # 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) AppleWebKit/537.36 (KHTML, like Gecko) ' # 'Chrome/59.0.3071.115 Safari/537.36', # 'Referer': 'http://www.mmjpg.com/mm/1188/3' # } # request = urllib2.Request(imgurl, None, header) # response = urllib2.urlopen(request) # f = open(name, 'wb') # f.write(response.read()) # f.close() # print(imgurl) # print href + str(x) + ".jpg" # print imgurl headers = { 'Accept': 'text / html, application / xhtml + xml, application / xml;q = 0.9, image / webp, * / *;q = 0.8', 'Accept - Encoding': 'gzip, deflate, sdch', 'Accept - Language': 'zh - CN, zh;q = 0.8', 'Cache - Control': 'max - age = 0', 'Connection': 'keep - alive', 'DNT': '1', 'Host': 'img.mmjpg.com', 'Referer': 'http: // www.mmjpg.com', 'Upgrade - Insecure - Requests': '1', 'User - Agent': 'Mozilla / 5.0(Windows NT 10.0;WOW64) AppleWebKit / 537.36(KHTML, like Gecko) Chrome / 53.0.2785.104 Safari / 537.36 Core / 1.53.3427.400 QQBrowser / 9.6.12513.400', } print imgurl r = requests.get(imgurl, headers=headers) print r.status_code with open(mkpath + "/%s.jpg" % x, "wb") as code: code.write(r.content) # data = urllib.urlopen(imgurl).read() # f = file(mkpath + "/%s.jpg" % x, "wb") # f.write(data) # f.close() except Exception, e: print "出了一点小小的错误" continue
def detail_old(self, url): try: resp = self.session.get(url, verify=False) status_code = resp.status_code pqhtml = PyQuery(resp.text or 'nothing') #错误 if status_code != 200: log_info = json.dumps( dict(time=time.time(), title=pqhtml('title').text(), url=url)) self.logger.info(log_info) data = tool.get_error(code=status_code, message=self.cfg.GET_ERR.get( 'SCERR', 'ERROR'), backUrl=resp.url, html=pqhtml.outerHtml()) return tool.return_data(successful=False, data=data) product = self.get_product_cfg_old(pqhtml) #下架 if product is None or product['AvailableSkus'] == 0: log_info = json.dumps( dict(time=time.time(), title=pqhtml('title').text(), url=url)) self.logger.info(log_info) data = tool.get_off_shelf(code=status_code, message=self.cfg.SOLD_OUT, backUrl=resp.url, html=pqhtml.outerHtml()) return tool.return_data(successful=False, data=data) price, oldPrice = self.get_all_price_old(pqhtml, product) colors_tmp = self.get_colors_old(pqhtml) detail = dict() detail['name'] = product['ProductName'] detail['brand'] = product['ProductBrand'] detail['currencySymbol'] = tool.get_unit( product['ProductCurrency']) detail['currency'] = product['ProductCurrency'] detail['descr'] = product['ProductDescription'].replace( ' ', '') detail['productId'] = product['ProductCode'] detail['price'] = price detail['listPrice'] = oldPrice detail['keys'] = [color['key'] for color in colors_tmp] detail['color'] = dict([(color['key'], color['name']) for color in colors_tmp]) detail['colorId'] = dict([(color['key'], color['value']) for color in colors_tmp]) #图片信息 imgs_tmp = self.get_imgs_old(pqhtml) detail['imgs'] = imgs_tmp detail['img'] = dict([(name, links[0]) for name, links in imgs_tmp.items()]) detail['sizes'] = self.get_size_old(pqhtml) detail['url'] = url detail['status_code'] = status_code #状态 detail['status'] = self.cfg.STATUS_SALE detail['backUrl'] = resp.url log_info = json.dumps( dict(time=time.time(), productId=detail['productId'], name=detail['name'], currency=detail['currency'], price=detail['price'], listPrice=detail['listPrice'], url=url)) self.logger.info(log_info) return tool.return_data(successful=True, data=detail) except Exception, e: raise
def getTweets(tweetCriteria, receiveBuffer=None, bufferLength=100, proxy=None): refreshCursor = '' results = [] resultsAux = [] cookieJar = cookielib.CookieJar() if hasattr(tweetCriteria, 'username') and ( tweetCriteria.username.startswith("\'") or tweetCriteria.username.startswith("\"")) and ( tweetCriteria.username.endswith("\'") or tweetCriteria.username.endswith("\"")): tweetCriteria.username = tweetCriteria.username[1:-1] active = True cnt, i = 0, 0 while active: json = TweetManager.getJsonReponse(tweetCriteria, refreshCursor, cookieJar, proxy) if len(json['items_html'].strip()) == 0: break refreshCursor = json['min_position'] scrapedTweets = PyQuery(json['items_html']) #Remove incomplete tweets withheld by Twitter Guidelines scrapedTweets.remove('div.withheld-tweet') tweets = scrapedTweets('div.js-stream-tweet') if len(tweets) == 0: break for tweetHTML in tweets: i += 1 tweetPQ = PyQuery(tweetHTML) tweet = models.Tweet() print "tweeeeeeeeeet", tweetHTML, "----------", tweetPQ( "p.js-tweet-text"), "-----------", tweetPQ( "p.js-tweet-text").attr("class") a = [img for img in tweetPQ("p.js-tweet-text").items('img')] emojis = [] for img in a: if img.attr("class") == "Emoji Emoji--forText": emojis.append(img.attr("aria-label")) if len(a) != 0: cnt += 1 print a, "---------", cnt, i, len(emojis) usernameTweet = tweetPQ("span:first.username.u-dir b").text() txt = re.sub( r"\s+", " ", tweetPQ("p.js-tweet-text").text().replace('# ', '#').replace( '@ ', '@')) retweets = int( tweetPQ( "span.ProfileTweet-action--retweet span.ProfileTweet-actionCount" ).attr("data-tweet-stat-count").replace(",", "")) favorites = int( tweetPQ( "span.ProfileTweet-action--favorite span.ProfileTweet-actionCount" ).attr("data-tweet-stat-count").replace(",", "")) dateSec = int( tweetPQ("small.time span.js-short-timestamp").attr( "data-time")) id = tweetPQ.attr("data-tweet-id") permalink = tweetPQ.attr("data-permalink-path") geo = '' geoSpan = tweetPQ('span.Tweet-geo') if len(geoSpan) > 0: geo = geoSpan.attr('title') tweet.id = id tweet.permalink = 'https://twitter.com' + permalink tweet.username = usernameTweet tweet.text = txt tweet.date = datetime.datetime.fromtimestamp(dateSec) tweet.retweets = retweets tweet.favorites = favorites tweet.mentions = " ".join( re.compile('(@\\w*)').findall(tweet.text)) tweet.hashtags = " ".join( re.compile('(#\\w*)').findall(tweet.text)) tweet.geo = geo tweet.emoji = emojis if (len(emojis) > 0): results.append(tweet) resultsAux.append(tweet) if receiveBuffer and len(resultsAux) >= bufferLength: receiveBuffer(resultsAux) resultsAux = [] if tweetCriteria.maxTweets > 0 and len( results) >= tweetCriteria.maxTweets: active = False break if receiveBuffer and len(resultsAux) > 0: receiveBuffer(resultsAux) return results
def getTweets(tweetCriteria, receiveBuffer = None, bufferLength = 100): refreshCursor = '' results = [] resultsAux = [] cookieJar = http.cookiejar.CookieJar() active = True while active: json = TweetManager.getJsonReponse(tweetCriteria, refreshCursor, cookieJar) if len(json['items_html'].strip()) == 0: break refreshCursor = json['min_position'] tweets = PyQuery(json['items_html'])('div.js-stream-tweet') if len(tweets) == 0: break for tweetHTML in tweets: tweetPQ = PyQuery(tweetHTML) tweet = Tweet.Tweet() usernameTweet = tweetPQ("span.username.js-action-profile-name b").text(); txt = re.sub(r"\s+", " ", tweetPQ("p.js-tweet-text").text().replace('# ', '#').replace('@ ', '@')); retweets = int(tweetPQ("span.ProfileTweet-action--retweet span.ProfileTweet-actionCount").attr("data-tweet-stat-count").replace(",", "")); favorites = int(tweetPQ("span.ProfileTweet-action--favorite span.ProfileTweet-actionCount").attr("data-tweet-stat-count").replace(",", "")); dateSec = int(tweetPQ("small.time span.js-short-timestamp").attr("data-time")); id = tweetPQ.attr("data-tweet-id"); permalink = tweetPQ.attr("data-permalink-path"); geo = '' geoSpan = tweetPQ('span.Tweet-geo') if len(geoSpan) > 0: geo = geoSpan.attr('title') tweet.id = id tweet.permalink = 'https://twitter.com' + permalink tweet.username = usernameTweet tweet.text = txt tweet.date = datetime.datetime.fromtimestamp(dateSec) tweet.retweets = retweets tweet.favorites = favorites tweet.mentions = " ".join(re.compile('(@\\w*)').findall(tweet.text)) tweet.hashtags = " ".join(re.compile('(#\\w*)').findall(tweet.text)) tweet.geo = geo results.append(tweet) resultsAux.append(tweet) if receiveBuffer and len(resultsAux) >= bufferLength: receiveBuffer(resultsAux) resultsAux = [] if tweetCriteria.maxTweets > 0 and len(results) >= tweetCriteria.maxTweets: active = False break if receiveBuffer and len(resultsAux) > 0: receiveBuffer(resultsAux) return results
def dispatch(self): if self.annual_item_list is None: raise IndexError("未抓取到相关网页,或者抓取网页失败") if len(self.annual_item_list) <= 0: return {} dict_annual = {} for lst_annual in self.annual_item_list: url = lst_annual.get('url') if 'entinfo' in url: dict_annual['baseinfo'] = lst_annual.get('text') elif 'WebsiteInfo' in url: dict_annual['websiteinfo'] = util.json_loads( lst_annual.get('text')) elif 'subcapitalInfo' in url: dict_annual['subcapital'] = util.json_loads( lst_annual.get('text')) elif 'forinvestMentInfo' in url: dict_annual['forinvestment'] = util.json_loads( lst_annual.get('text')) elif 'GuaranteeInfo' in url: dict_annual['forguaranteeinfo'] = util.json_loads( lst_annual.get('text')) elif 'alterStockInfo' in url: dict_annual['alterstockinfo'] = util.json_loads( lst_annual.get('text')) elif 'updateinfo' in url: dict_annual['updateinfo'] = util.json_loads( lst_annual.get('text')) # 基本信息 base_info = dict_annual.get('baseinfo') if base_info is not None: info = PyQuery(base_info, parser='html').find('.encounter-info') annual_base_info = self.zj_get_annual_base_info(info) self.annual_info_dict.update(annual_base_info) # 网站或网店信息 web_info = dict_annual.get('websiteinfo') if web_info is not None: lst_websites = self.zj_get_annual_web_site_info(web_info) self.annual_info_dict[AnnualReports.WEBSITES] = lst_websites # 股东出资信息 share_hold_info = dict_annual.get('subcapital') if share_hold_info is not None: lst_share_hold = self.zj_get_annual_share_hold_info( share_hold_info) self.annual_info_dict[ AnnualReports.SHAREHOLDER_INFORMATION] = lst_share_hold # 对外投资 inv_info = dict_annual.get('forinvestment') if inv_info is not None: lst_inv = self.zj_get_annual_inv_info(inv_info) self.annual_info_dict[AnnualReports.INVESTED_COMPANIES] = lst_inv # 年报 企业资产状况信息 base_info = dict_annual.get('baseinfo') if base_info is not None: tds = PyQuery(base_info, parser='html').find('.table-zichan').not_( '.table-td-pd').find('td') asset_model = self.zj_get_annual_asset_info(tds) self.annual_info_dict[ AnnualReports. ENTERPRISE_ASSET_STATUS_INFORMATION] = asset_model # 对外担保 out_guaranty_info = dict_annual.get('forguaranteeinfo') if out_guaranty_info is not None: lst_out_guaranty = self.zj_get_annual_out_guarantee_info( out_guaranty_info) self.annual_info_dict[ AnnualReports.OUT_GUARANTEE_INFO] = lst_out_guaranty # 股权变更 edit_shareholding_change_info = dict_annual.get('alterstockinfo') if edit_shareholding_change_info is not None: lst_edit_shareholding_change = self.zj_get_annual_edit_shareholding_change( edit_shareholding_change_info) self.annual_info_dict[ AnnualReports. EDIT_SHAREHOLDING_CHANGE_INFOS] = lst_edit_shareholding_change # 修改记录 edit_change_info = dict_annual.get('updateinfo') if edit_change_info is not None: lst_edit_change = self.zj_get_annual_edit_change(edit_change_info) self.annual_info_dict[ AnnualReports.EDIT_CHANGE_INFOS] = lst_edit_change
def collectTweets(tweetCriteria, receiveBuffer=None, bufferLength=100, proxy=None, debug=False): results = [] resultsAux = [] cookieJar = http.cookiejar.CookieJar() user_agent = random.choice(TweetManager.user_agents) all_usernames = [] usernames_per_batch = 20 if hasattr(tweetCriteria, 'username'): if type(tweetCriteria.username) == str or not hasattr( tweetCriteria.username, '__iter__'): tweetCriteria.username = [tweetCriteria.username] usernames_ = [u.lstrip('@') for u in tweetCriteria.username if u] all_usernames = sorted({u.lower() for u in usernames_ if u}) n_usernames = len(all_usernames) n_batches = n_usernames // usernames_per_batch + ( n_usernames % usernames_per_batch > 0) else: n_batches = 1 for batch in range(n_batches): refreshCursor = '' batch_count_results = 0 if all_usernames: tweetCriteria.username = all_usernames[ batch * usernames_per_batch:batch * usernames_per_batch + usernames_per_batch] active = True while active: json = TweetManager.getJsonResponse(tweetCriteria, refreshCursor, cookieJar, proxy, user_agent, debug=debug) if len(json['items_html'].strip()) == 0: break refreshCursor = json['min_position'] scrapedTweets = PyQuery(json['items_html']) scrapedTweets.remove('div.withheld-tweet') tweets = scrapedTweets('div.js-stream-tweet') if len(tweets) == 0: break for tweetHTML in tweets: tweetPQ = PyQuery(tweetHTML) tweet = models.Tweet() usernames = tweetPQ("span.username.u-dir b").text().split() if not len(usernames): continue tweet.username = usernames[0] tweet.to = usernames[1] if len(usernames) >= 2 else None tweet.text = re.sub(r"\s+", " ", tweetPQ("p.js-tweet-text").text()) \ .replace('# ', '#').replace('@ ', '@').replace('$ ', '$') tweet.retweets = int( tweetPQ( "span.ProfileTweet-action--retweet span.ProfileTweet-actionCount" ).attr("data-tweet-stat-count").replace(",", "")) tweet.favorites = int( tweetPQ( "span.ProfileTweet-action--favorite span.ProfileTweet-actionCount" ).attr("data-tweet-stat-count").replace(",", "")) tweet.replies = int( tweetPQ( "span.ProfileTweet-action--reply span.ProfileTweet-actionCount" ).attr("data-tweet-stat-count").replace(",", "")) tweet.id = tweetPQ.attr("data-tweet-id") tweet.permalink = 'https://twitter.com' + tweetPQ.attr( "data-permalink-path") tweet.author_id = int( tweetPQ("a.js-user-profile-link").attr("data-user-id")) dateSec = int( tweetPQ("small.time span.js-short-timestamp").attr( "data-time")) tweet.date = datetime.datetime.fromtimestamp( dateSec, tz=datetime.timezone.utc) tweet.formatted_date = datetime.datetime.fromtimestamp(dateSec, tz=datetime.timezone.utc) \ .strftime("%a %b %d %X +0000 %Y") tweet.mentions = " ".join( re.compile('(@\\w*)').findall(tweet.text)) tweet.hashtags = " ".join( re.compile('(#\\w*)').findall(tweet.text)) urls = [] for link in tweetPQ("a"): try: urls.append((link.attrib["data-expanded-url"])) except KeyError: pass tweet.urls = ",".join(urls) results.append(tweet) resultsAux.append(tweet) if receiveBuffer and len(resultsAux) >= bufferLength: receiveBuffer(resultsAux) resultsAux = [] batch_count_results += 1 if tweetCriteria.maxTweets > 0 and batch_count_results >= tweetCriteria.maxTweets: active = False break if receiveBuffer and len(resultsAux) > 0: receiveBuffer(resultsAux) resultsAux = [] return results
# Use requests to fetch the article URL and inspect the response.
import requests
response = requests.get('http://mp.weixin.qq.com/s/pLmuGoc4bZrMNl7MSoWgiA')
print(response)

# Extract the article body from the response with PyQuery.
from pyquery import PyQuery
document = PyQuery(response.text)
content = document('#js_content').text()

# Use the stats_word tokenizer to take the 100 most frequent words and join them into a single string.
import stats_word as sw
list_a = sw.stats_text(content, 100)
a = ''
for i in range(len(list_a)):
    a = a + str(list_a[i][0]) + ' ' + str(list_a[i][1]) + ', '
print(a)

# Send the result to the given mailbox with yagmail; getpass keeps the password hidden.
import getpass
import yagmail
sender = input('Sender email address: ')
password = getpass.getpass('Sender password: ')
recipients = input('Recipient email address: ')
# yagmail.register(sender, password)
yagmail = yagmail.SMTP(sender, password, host='smtp.163.com')
yagmail.send(recipients, '19100205 lihaotian007', a)
def test_theme_license_link(self): s = render('{{ license_link(lic) }}', {'lic': amo.LICENSE_COPYRIGHT}) ul = PyQuery(s)('.license') assert ul.find('.icon').length == 1 assert ul.find('.icon.copyr').length == 1 text = ul.find('.text') assert text.find('a').length == 0 assert text.text() == 'All Rights Reserved' s = render('{{ license_link(lic) }}', {'lic': amo.LICENSE_CC_BY_NC_SA}) ul = PyQuery(s)('.license') assert ul.find('.icon').length == 3 assert ul.find('.icon.cc-attrib').length == 1 assert ul.find('.icon.cc-noncom').length == 1 assert ul.find('.icon.cc-share').length == 1 link = ul.find('.text a') assert link.find('a').length == 0 assert link.text() == 'Some rights reserved' assert link.attr('href') == amo.LICENSE_CC_BY_NC_SA.url
def _slice(res: str, index: int = 1) -> GoogleResponse: utf8_parser = HTMLParser(encoding="utf-8") d = PyQuery(fromstring(res, parser=utf8_parser)) data = d.find(".g") pages = list(d.find("td").items())[1:-1] return GoogleResponse(data, pages, index)
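# An isolated sketch of the explicit-encoding parse used by _slice above:
# build the lxml tree yourself, then wrap it in PyQuery. The markup and URL
# below are made up.
from lxml.html import HTMLParser, fromstring
from pyquery import PyQuery

raw = '<html><body><div class="g"><a href="https://example.com">résultat</a></div></body></html>'
utf8_parser = HTMLParser(encoding="utf-8")
doc = PyQuery(fromstring(raw.encode("utf-8"), parser=utf8_parser))
print(doc.find(".g a").attr("href"))  # https://example.com
print(doc.find(".g a").text())        # résultat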
def get_xml_shareholder_info(self, share_xml):
    shareholder_info_dict = {}
    str_index = share_xml.find('<data>')
    if str_index < 0:
        return shareholder_info_dict

    shareholder_data = PyQuery(share_xml, parser='xml').find('data').find('data').items()
    lst_shareholder = []
    for data in shareholder_data:
        share_model = {
            GsModel.ShareholderInformation.SHAREHOLDER_NAME:
                data.find('inv').text(),
            GsModel.ShareholderInformation.SUBSCRIPTION_AMOUNT:
                util.get_amount_with_unit(data.find('liSubConAm').text()),
            GsModel.ShareholderInformation.PAIED_AMOUNT:
                util.get_amount_with_unit(data.find('liAcConAm').text()),
        }

        lst_sub = []
        sub_data = data.find('imInvprodetailList').find('imInvprodetailList').items()
        for sub_detail in sub_data:
            sub_dict = {
                GsModel.ShareholderInformation.SubscriptionDetail.SUBSCRIPTION_TYPE:
                    sub_detail.find('conFormCN').text(),
                GsModel.ShareholderInformation.SubscriptionDetail.SUBSCRIPTION_AMOUNT:
                    util.get_amount_with_unit(sub_detail.find('subConAm').text()),
                GsModel.ShareholderInformation.SubscriptionDetail.SUBSCRIPTION_TIME:
                    sub_detail.find('conDate').text(),
                GsModel.ShareholderInformation.SubscriptionDetail.SUBSCRIPTION_PUBLISH_TIME:
                    sub_detail.find('publicDate').text()
            }
            sub_dict = self.replace_none(sub_dict)
            lst_sub.append(sub_dict)
        share_model[GsModel.ShareholderInformation.SUBSCRIPTION_DETAIL] = lst_sub

        lst_paid = []
        paid_data = data.find('imInvactdetailList').find('imInvactdetailList').items()
        for paid_detail in paid_data:
            paid_dict = {
                GsModel.ShareholderInformation.PaiedDetail.PAIED_TYPE:
                    paid_detail.find('acConFormCn').text(),
                GsModel.ShareholderInformation.PaiedDetail.PAIED_AMOUNT:
                    util.get_amount_with_unit(paid_detail.find('acConAm').text()),
                GsModel.ShareholderInformation.PaiedDetail.PAIED_TIME:
                    paid_detail.find('conDate').text(),
                GsModel.ShareholderInformation.PaiedDetail.PAIED_PUBLISH_TIME:
                    paid_detail.find('publicDate').text(),
            }
            paid_dict = self.replace_none(paid_dict)
            lst_paid.append(paid_dict)
        share_model[GsModel.ShareholderInformation.PAIED_DETAIL] = lst_paid

        share_model = self.replace_none(share_model)
        lst_shareholder.append(share_model)

    shareholder_info_dict[GsModel.SHAREHOLDER_INFORMATION] = lst_shareholder
    return shareholder_info_dict
import requests
from pyquery import PyQuery
import pickle
from helper.move import *

moves = []
for x in range(1, 8):
    data = requests.get("https://pokemondb.net/move/generation/" + str(x))
    src = PyQuery(data.text)

    # move names
    trs = src.find('.ent-name')
    length = len(moves)
    i = length
    for tr in trs:
        moves.append([])
        moves[i].append(tr.text)
        i += 1

    # move types
    trs = src.find('.type-icon')
    i = length
    for tr in trs:
        moves[i].append(tr.text)
        i += 1

    # damage category (snippet is truncated here in the source)
    trs = src.find('td:nth-child(3)')
    i = length
    for tr in trs:
        if tr.attrib["data-sort-value"] == "special":
            moves[i].append(1)
        elif tr.attrib["data-sort-value"] == "physical":
'accept': "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8", 'accept-language': "en-US,en;q=0.9", 'cache-control': "no-cache", 'content-type': "application/x-www-form-urlencoded", 'origin': "https://www.sgpbusiness.com", #'cookie': "__cfduid=dbd33060d5ca8c09500f853a22ccafcc11526030630; _ga=GA1.2.1937662851.1526030632; _gid=GA1.2.412763831.1526710953; sgpbizsess=a2504243407a7eb0a29ef82e447a5b0a43f37d1a; _gat=1", 'referer': "https://www.sgpbusiness.com/", 'upgrade-insecure-requests': "1", 'user-agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36" } f = open('sgp_out.txt','a') li = ['a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u','v','w','x','y','z'] for i in ['b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u','v','w','x','y','z']: for j in li: for k in li: payload = "search_val={}".format(i+j+k) response = requests.request("POST", url, data=payload, headers=headers) pq = PyQuery(response.text) links = pq('.list-group a') print (i+j+k+' - '+str(len(links))) for l in links: #print (l.attrib['href']) f.write(l.attrib['href']+'\n') f.flush() #time.sleep(2)
def get_brand(code):
    # build the URL from the given stock code instead of a hard-coded one
    q = PyQuery("https://kabutan.jp/stock/?code={}".format(code))
    sector = q.find('#stockinfo_i2 > div > a')[0].text
    print(sector)
    print(code)
def fixLinks(text, parser):
    d = PyQuery(bytes(bytearray(text, encoding='utf-8')), parser=parser)
    for element in d('link'):
        e = PyQuery(element)
        href = e.attr('href')
        if href:
            if href.find(domain) > -1:
                new_href = href.split(domain)[-1]
                new_href = '{}{}'.format(target_domain, new_href)
                e.attr('href', new_href)
                print "\t", "fixed link ", href, "=> ", new_href
    for element in d('a'):
        e = PyQuery(element)
        href = e.attr('href')
        if href:
            if href.find(domain) > -1:
                new_href = href.split(domain)[-1]
                e.attr('href', new_href)
                print "\t", "Fixed ", href, "=> ", new_href
        if href and not abs_url_regex.search(href):
            new_href = re.sub(r'rss/index\.html$', 'rss/index.rss', href)
            new_href = re.sub(r'/index\.html$', '/', new_href)
            e.attr('href', new_href)
            print "\t", href, "=>", new_href
    if parser == 'html':
        return d.html(method='html').encode('utf8')
    return d.__unicode__().encode('utf8')
from pyquery import PyQuery

doc = PyQuery('<html>Hello</html>')
res = doc('html').text()
print(doc)
print(res)
def get_content(self):
    url = reverse('addons.versions', args=[self.addon.slug])
    return PyQuery(self.client.get(url).content)
def test_eula_with_contrib_roadblock(self):
    url = reverse('addons.eula', args=[11730, 53612])
    response = self.client.get(url, follow=True)

    doc = PyQuery(response.content)
    assert doc('[data-search]').attr('class') == 'install '
# coding=utf8
from pyquery import PyQuery
import requests
import sys
import yagmail
import getpass

sys.path.append(r"C:\Study\Programming\Python\Python_Data_Analysis")
from d11.stats_word import stats_text

content_url = "https://mp.weixin.qq.com/s/pLmuGoc4bZrMNl7MSoWgiA"
html_code = requests.get(content_url).text
document = PyQuery(html_code)
content = document("#js_content").text().replace("\n", "")

try:
    en_result, cn_result = stats_text("", content)
    # print(cn_result)
    smtp_host = "smtp.sina.com"
    sender = input("Please enter the sender's email address: ")
    password = getpass.getpass("Please enter the sender's email password: ")
    recipient = input("Please enter the recipient's email address: ")
    yagmail.SMTP(user=sender, password=password, host=smtp_host).send(
        recipient, "Cutted words", str(cn_result))
except ValueError as e:
    print("Exception caught.")
    print(e)
def index_page(self):
    r = self.client.get("/")
    pq = PyQuery(r.content)
    link_elements = pq(".toctree-wrapper a.internal")
    self.toc_urls = [l.attrib["href"] for l in link_elements]
def render(self, **kwargs):
    return PyQuery(install_button(self.context, self.addon, **kwargs))
#!/usr/bin/env python3

# date: 2019.08.12
# https://stackoverflow.com/questions/57454154/pyquery-wont-return-elements-on-a-page
# https://github.com/gawel/pyquery/issues/199

import requests
from pyquery import PyQuery

url = "http://www.floridaleagueofcities.com/widgets/cityofficials?CityID=101"
page = requests.get(url)

pq = PyQuery(page.text, parser="html")
for item in pq('li p'):
    print(item.text)
def load_page(self, url=None):
    url = random.choice(self.toc_urls)
    r = self.client.get(url)
    pq = PyQuery(r.content)
    link_elements = pq("a.internal")
    self.urls_on_current_page = [l.attrib["href"] for l in link_elements]