def test_get_list_identities(self, client):
    res = client.get("/identity")
    html = document_fromstring(res.get_data())
    header_exists = html.xpath("boolean(//tr/th)")
    self.assertTrue(header_exists)
    rows = html.xpath("boolean(//tr/td)")
    self.assertFalse(rows)
    joe = Identity()
    joe.save()
    john = Identity()
    john.save()
    jim = Identity()
    jim.save()
    res = client.get("/identity")
    html = document_fromstring(res.get_data())
    rows = html.xpath("boolean(//tr/td)")
    self.assertTrue(rows)
    ids = html.xpath("//tr/td[1]//text()")
    self.assertIn(str(joe.id), ids)

def get_url_html(url, cache=True, cookiejar=None):
    print("GET :: ", url)
    if cache:
        data = get_from_cache(url)
        if data:
            return lh.document_fromstring(data)
    attempts = 0
    while True:
        try:
            # fetch once, not twice (the original called urlopen(url) twice)
            html = urlopen(url).read().decode('utf8')
            save_to_cache(url, html)
            return lh.document_fromstring(html)
        except Exception:
            attempts += 1
            if attempts > MAX_ATTEMPTS:
                raise
            print("HTTP error, retrying in 1 second")
            time.sleep(1)  # pause 1 second between attempts

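# Hypothetical sketch of the cache helpers get_url_html() assumes; the real
# get_from_cache/save_to_cache may differ (this version keys files by URL hash).
import hashlib
import os

CACHE_DIR = "cache"  # assumed location

def _cache_path(url):
    return os.path.join(CACHE_DIR, hashlib.md5(url.encode("utf8")).hexdigest())

def get_from_cache(url):
    path = _cache_path(url)
    if os.path.exists(path):
        with open(path, encoding="utf8") as f:
            return f.read()
    return None

def save_to_cache(url, html_text):
    os.makedirs(CACHE_DIR, exist_ok=True)
    with open(_cache_path(url), "w", encoding="utf8") as f:
        f.write(html_text)
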
def parse_detail(self, page, list_record):
    t = document_fromstring(page)
    record = {}
    for tr in t.xpath("//td[@class='displayvalue']/parent::*"):
        key = tr[1].text_content() or ''
        value = tr[2].text_content() or ''
        record[key.strip()] = value.strip()
    # If there's no filing date, this detail page is related to another
    # license. Go get the dates from that page.
    if 'Filing Date:' not in record:  # dict.has_key() is Python 2 only
        a = t.xpath("//div[@class='instructions']//a")[0]
        page = self.get_html('http://www.trans.abc.state.ny.us' + a.get('href'))
        t = document_fromstring(page)
        parent_record = {}
        for tr in t.xpath("//td[@class='displayvalue']/parent::*"):
            key = tr[1].text_content() or ''
            value = tr[2].text_content() or ''
            parent_record[key.strip()] = value.strip()
        dates = {
            'Filing Date:': parent_record['Filing Date:'],
            'Effective Date:': parent_record['Effective Date:'],
            'Expiration Date:': parent_record['Expiration Date:'],
        }
        record.update(dates)
    return record

def getCollectionFics(url):
    try:
        wrlog('Starting to download fics from collection %s' % url)
        # All fics from a single collection
        fics = []     # the resulting list of fics
        counter = 1   # page counter
        r = req.post(url, cookies=cookies)
        doc = html.document_fromstring(r.text)
        if checkColecttionIsOpen(doc):
            while True:
                r = req.get(url + "?sort=author&p=" + str(counter), cookies=cookies)
                doc = html.document_fromstring(r.text)
                # Stop when the page has no fic block. Ficbook does not
                # return 404 for a page with a non-existent number --
                # the page is simply empty.
                if not doc.xpath('/html/body/table[2]/tr[2]/td[1]/table/tr[2]/td[2]/div/table/tr/td[1]/table/tr[1]/td/*'):
                    break
                # Links to the fics on the current page
                cur_page_fics = doc.xpath('/html/body/table[2]/tr[2]/td[1]/table/tr[2]/td[2]/div/table/tr/td[1]/table/tr[1]/td/a/@href')
                for fic in cur_page_fics:
                    fics.append("http://ficbook.net" + fic)
                counter += 1
        else:
            wrlog("Private collection. URL: %s" % url)
            QtGui.QMessageBox.information(None, 'Error', 'This collection is private. If it is yours, or you have access to it, please log in.')
            return
        return fics
    except Exception as e:
        showErrorMessage(e)

def data_retrieval(self, datasets):
    """
    Retrieve a list of datasets from the ESO archive.

    Parameters
    ----------
    datasets : list of strings
        List of dataset strings to retrieve from the archive.

    Returns
    -------
    files : list of strings
        List of files that have been locally downloaded from the archive.
    """
    from lxml import html
    datasets_to_download = []
    files = []
    # First: detect datasets already downloaded
    for dataset in datasets:
        local_filename = dataset + ".fits"
        if self.cache_location is not None:
            local_filename = os.path.join(self.cache_location, local_filename)
        if os.path.exists(local_filename):
            print("Found {0}.fits...".format(dataset))
            files += [local_filename]
        elif os.path.exists(local_filename + ".Z"):
            print("Found {0}.fits.Z...".format(dataset))
            files += [local_filename + ".Z"]
        else:
            datasets_to_download += [dataset]
    # Second: download the remaining datasets
    if datasets_to_download:
        data_retrieval_form = self.request("GET", "http://archive.eso.org/cms/eso-data/eso-data-direct-retrieval.html")
        print("Staging request...")
        with suspend_cache(self):  # never cache staging operations
            data_confirmation_form = self._activate_form(
                data_retrieval_form, form_index=-1,
                inputs={"list_of_datasets": "\n".join(datasets_to_download)})
            root = html.document_fromstring(data_confirmation_form.content)
            login_button = root.xpath('//input[@value="LOGIN"]')
            if login_button:
                raise LoginError("Not logged in. You must be logged in to download data.")
            # TODO: there may be another screen for Not Authorized; that should be handled too
            data_download_form = self._activate_form(data_confirmation_form, form_index=-1)
            root = html.document_fromstring(data_download_form.content)
            state = root.xpath("//span[@id='requestState']")[0].text
            while state != 'COMPLETE':
                time.sleep(2.0)
                data_download_form = self.request("GET", data_download_form.url)
                root = html.document_fromstring(data_download_form.content)
                state = root.xpath("//span[@id='requestState']")[0].text
        print("Downloading files...")
        for fileId in root.xpath("//input[@name='fileId']"):
            fileLink = fileId.attrib['value'].split()[1]
            fileLink = fileLink.replace("/api", "").replace("https://", "http://")
            filename = self.request("GET", fileLink, save=True)
            files += [system_tools.gunzip(filename)]
    print("Done!")
    return files

def run_parse():
    page = urllib.request.urlopen(base_url)
    doc = html.document_fromstring(page.read())
    doc.make_links_absolute(base_url=base_url)
    for link in html.iterlinks(doc):
        if ("forumdisplay.php" in link[2]) and ("f=43" in link[2]):
            v_chapter_name = link[0].text_content()
            v_path = folder_prefix + v_chapter_name
            v_link = link[2]
            # Create the folder
            if not os.path.exists(v_path):
                os.makedirs(v_path)
            page = urllib.request.urlopen(v_link)
            doc = html.document_fromstring(page.read())
            doc.make_links_absolute(base_url=base_url)
            for link_topics in html.iterlinks(doc):
                parsed_url = urllib.parse.urlparse(link_topics[2])
                # print(parsed_url)
                parsed_q = urllib.parse.parse_qs(parsed_url.query)
                # print(parsed_q)
                # Find the link to the first page of the topic
                if (parsed_url.path == "/showthread.php") and ("t" in parsed_q) \
                        and not ("page" in parsed_q) \
                        and (link_topics[0].text_content() != "1") \
                        and (parsed_q["t"][0] == "1537"):
                    parse_topic(link_topics, v_path)
                    # print(parsed_q)

def parse_topic(page_link, v_base_path):
    v_topic_name = page_link[0].text_content()
    log(v_topic_name + "---" + page_link[2])
    logging.log(logging.INFO, v_topic_name + "---" + page_link[2])
    v_full_path = v_base_path + "\\" + slugify(v_topic_name)
    v_url = page_link[2]
    # Create the folder
    #if not os.path.exists(v_full_path):
    #    os.makedirs(v_full_path)
    # TODO: temporary
    if True:
        # Find all posts
        page = urllib.request.urlopen(v_url)
        doc = html.document_fromstring(page.read())
        menu_controls = doc.cssselect('td[class="vbmenu_control"]')
        # TODO: temporary!
        #parse_topic_page(doc, v_full_path)
        for menu_control in menu_controls:
            v_page_text = menu_control.text_content()
            # The control reads "Страница X из Y" ("Page X of Y")
            if re.match(r'Страница \d из \d', v_page_text):
                page_count = int(re.sub(r'(Страница \d из )(\d)', r'\2', v_page_text))
                for i in range(page_count - 1):
                    # TODO: temporary!
                    if (i + 2 >= 32):
                        url = "{}&page={}".format(page_link[2], i + 2)
                        log(v_topic_name + "---" + url)
                        page = urllib.request.urlopen(url)
                        doc = html.document_fromstring(page.read())
                        parse_topic_page(doc, v_full_path)
                break
    else:
        log("{} already exists. Skipped".format(v_full_path))

def to_doc(text, parser=scraper.LXML_HTML, whole_doc=True):
    """Parse an HTML text. Return value: lxml.html.HtmlElement document.

    parser: which parser to use.
    whole_doc: parse to a complete HTML document (with <html> around),
    or parse just a fragment of HTML."""
    doc = None
    if parser == scraper.LXML_HTML:
        if whole_doc:
            doc = html.document_fromstring(text)
        else:
            doc = html.fromstring(text)
    elif parser == scraper.HTML5PARSER:
        # html5parser was broken for me, bug report is here:
        # https://bugs.launchpad.net/lxml/+bug/780642
        #if whole_doc:
        #    doc = html5parser.document_fromstring(text)
        #else:
        #    doc = html5parser.fromstring(text)
        # Here is my workaround:
        parser = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder("lxml"), namespaceHTMLElements=False)
        etree_doc = parser.parse(text)  # returns an ElementTree
        doc = html.document_fromstring(elementtree_to_string(etree_doc))
        # ^ this double conversion makes it slow ^
    elif parser == scraper.BEAUTIFULSOUP:
        # soupparser has no document_fromstring method
        doc = soupparser.fromstring(text)
    else:
        print("Warning: you want to use an unknown parser in lx.py.", file=sys.stderr)
        # doc is None
    return doc  # lxml.html.HtmlElement

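# A self-contained sketch of the whole_doc distinction handled above:
# document_fromstring always builds a full <html> document, while
# fromstring returns just the fragment's root element.
from lxml import html as _lxml_html

_full = _lxml_html.document_fromstring('<p>hi</p>')
_frag = _lxml_html.fromstring('<p>hi</p>')
print(_full.tag, _frag.tag)  # -> html p
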
def scrape_wiki_codes():
    data = {}
    base_url = 'http://en.wikipedia.org/wiki/List_of_ISO_639'
    # 639-1
    resp = web.get(base_url + '-1_codes')
    h = html.document_fromstring(resp)
    table = h.find_class('wikitable')[0]
    for row in table.findall('tr')[1:]:
        name = row.findall('td')[2].find('a').text
        code = row.findall('td')[4].text
        data[code] = name
    # 639-2
    resp = web.get(base_url + '-2_codes')
    h = html.document_fromstring(resp)
    table = h.find_class('wikitable')[0]
    for row in table.findall('tr')[1:]:
        name = row.findall('td')[3].find('a')
        if name is not None:  # element truthiness tests for children, not existence
            name = name.text
        else:
            continue
        code_list = row.findall('td')[0].text.split(' ')
        if len(code_list) == 1:
            code = code_list[0]
        else:
            for i in code_list:
                if '*' in i:
                    code = i.replace('*', '')
                    break
        data[code] = name
    return data

def scrape_balance(username, password):
    session = requests.Session()
    login_response = session.get(LOGIN_URL)
    login_page = html.document_fromstring(login_response.content)
    login_page.make_links_absolute(LOGIN_URL)
    login_form = login_page.forms[0]
    login_form_data = dict(login_form.form_values())
    login_form_data['username'] = username
    login_form_data['password'] = password
    logged_in_response = session.request(
        login_form.method,
        login_form.action,
        data=login_form_data,
    )
    logged_in_page = html.document_fromstring(logged_in_response.content)
    if logged_in_page.cssselect('form.login-form'):
        raise InvalidCredentialsException()
    points_elem = logged_in_page.cssselect('.points')
    if not points_elem:
        # We don't know what went wrong here.
        raise RuntimeError()
    points_text = ''.join(points_elem[0].itertext())
    points_match = RE_POINTS.search(points_text)
    if not points_match:
        # We don't know what went wrong here either.
        raise RuntimeError()
    return int(points_match.group('points'))

def consume(self):
    while not self.__stop:
        try:
            data = self.__fetcher.fetch()
            if data == "DIE":
                self.logger.info("Received kill signal")
                self.pusher.push("DIE")
                self.stop()
                continue
            # import pdb
            # pdb.set_trace()
            url = BASEURL + data
            self.logger.info("Visiting: %s" % url)
            time.sleep(1)
            self.__browser.visit(url)
            page = html.document_fromstring(self.__browser.html)
            top50 = [BASEURL + link.get('href') + "/about"
                     for link in page.xpath(u"//h3[@class='zm-item-answer-author-wrap']/a[@class='zm-item-link-avatar']")]
            # This should check len(top50) before indexing into it
            for i in range(5):
                time.sleep(2)
                self.logger.info("Visiting: %s" % top50[i])
                self.__browser.visit(top50[i])
                self.__pusher.push(html.document_fromstring(self.__browser.html))
        except QueueEmpty as e:
            time.sleep(5)
            self.logger.error("Queue is empty")
        except Exception as ex:
            # Invoke the relevant error-handling components
            self.logger.error("Fatal exception: %s" % traceback.format_exc())
            raise ex

def parse_html(self, in_html=False):
    """
    Parses the imported HTML document, using lxml's document_fromstring.

    :param in_html: HTML file to override the one given by the super class
    :return: the parsed content

    # An alternative is etree's HTMLParser, but it requires an extra
    # two calls, one making a StringIO, and then acquiring the root
    # element tree, and I don't see a difference:
    # parser = etree.HTMLParser()
    # tree = etree.parse(StringIO(html), parser)
    # return tree.getroot()
    """
    if not in_html:
        parsed_html = document_fromstring(self.raw_html)
        self.parsed_html = parsed_html
    else:
        parsed_html = document_fromstring(in_html)
    logger.debug('Parsed HTML. {0}'.format(parsed_html))
    return parsed_html

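# The alternative mentioned in the docstring above, for comparison: the
# etree route needs a StringIO plus getroot(), but yields an equivalent tree.
from io import StringIO
from lxml import etree

_parser = etree.HTMLParser()
_tree = etree.parse(StringIO('<p>example</p>'), _parser)
_root = _tree.getroot()  # same document element that document_fromstring returns directly
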
def _do_fetch_all(self):
    logging.debug("Fetching all data from: %s" % self["url"])
    response = self.http_get(self["url"])
    tree = HTMLNode(html.document_fromstring(response.text))
    self["files"] = []
    files_div = tree.find("div", **{"id": "files"})
    if len(files_div) == 1:
        files_table = files_div[0].find("table")
        if len(files_table) == 1:
            for tr in files_table[0].find("tr")[1:]:
                filename, size = tr.find("td")
                self["files"].append(ResultFile(**{
                    "filename": filename.getContent().decode("utf-8").strip(),
                    "size": self._plugin.parse_size(size.getContent().decode("utf-8").replace(",", "").strip())
                }))
    logging.debug("Fetching all data from: %s" % self["download_link_page"])
    response = self.http_get(self["download_link_page"])
    tree = HTMLNode(html.document_fromstring(response.text))
    download_link = [a for a in tree.find("a")
                     if a.prop("href") is not None and a.prop("href").startswith("download.php")][0]
    self["download_link"] = urllib.parse.urljoin(self["download_link_page"], download_link.prop("href"))

def _get_works(self):
    req_opere = requests.get(self.scheda.link_opere)
    if req_opere.ok:
        doc = html.document_fromstring(req_opere.text)
        risultati = doc.xpath("//*[@id='corpo_opac']/div[1]/div[2]/div[2]/div[1]")[0].text_content().strip()
        resmatch = RISULTATI.match(risultati)
        if resmatch:
            res_start = int(resmatch.group(1))
            res_stop = int(resmatch.group(2))
            res_tot = int(resmatch.group(3))
            url = 'http://opac.sbn.it/opacsbn/opaclib' \
                  '?db=solr_iccu&resultForward=opac/iccu/brief.jsp&from=1&nentries={res_tot}' \
                  '&searchForm=opac/iccu/error.jsp&do_cmd=search_show_cmd&item:5032:BID={code}'.format(
                      res_tot=res_tot, code='IT\\ICCU\\' + self.code)
            req_opere_tot = requests.get(url)
            if req_opere_tot.ok:
                doc = html.document_fromstring(req_opere_tot.text)
                topere = doc.xpath("//div[@id='colonna_risultati']/table[@id='records']/tbody")[0].getchildren()
                topere = [row.getchildren()[3].getchildren() for row in topere]
                for op in topere:
                    opera = self.Work()
                    for div in op:
                        if div.get('class') == 'rectitolo':
                            opera.url = self.BASE_SBN_URL + div.getchildren()[0].get('href')
                            opera.titolo = div.getchildren()[0].text.strip()
                        elif div.get('class') == 'rec_3a_linea':
                            sourceline = div.sourceline
                            raw_source = req_opere_tot.text.split('\n')[sourceline - 1:sourceline][0]
                            opera.edizione = remove_tags(remove_br_tags(raw_source, '\n'))
                        else:
                            opera.autori = div.text
                    self.opere.append(opera)

def _getFavorites(self):
    """
    Returns a dict mapping post name to topic_id for self._username.

    :return: dict(name) = id
    """
    url = self._genFavoritesUrlByUser(self._username)
    doc = html.document_fromstring(requests.get(url).text)
    out = dict()
    pages = get_pages(doc)
    favs = doc.xpath("//div[@class='user_favorites']//a[@class='post_title']")
    for f in favs:
        # topic_id
        out[f.text] = str(f.attrib['href']).split('/')[-2]
    for p in range(2, pages):
        url = 'http://{0}/users/{1}/favorites/page{2}/'.format(self._domain, self._username, p)
        # if show_progress:
        #     print('parsing page{0}... url={1}'.format(p, url))
        doc = html.document_fromstring(requests.get(url).text)
        favs = doc.xpath("//div[@class='user_favorites']//a[@class='post_title']")
        for f in favs:
            # out[f.text] = f.attrib['href'][-7:-1]
            out[f.text] = str(f.attrib['href']).split('/')[-2]
    return out

def get_tree(self):
    """
    Return the DOM for the article content.

    Note this actually returns the XPATH method on the tree, so you can do
    a.tree(<xpath>) directly.
    """
    quoted_url = urllib.quote(self.url, safe='')
    html_file = CACHE.child(quoted_url)
    self.log.info(self.url)
    if not html_file.exists():
        self.log.debug("  Downloading")
        response, self.content = HTTP.request(self.url)
        status_code = int(response['status'])
        if not (200 <= status_code < 400):
            self.log.error("Got HTTP status code %d" % status_code)
        # cache content
        with open(html_file, 'w') as fp:
            fp.write(self.content)
        return document_fromstring(self.content).xpath
    else:
        self.log.debug("  Using cache ('%s...')" % html_file.name[:60])
        with open(html_file) as fp:
            self.content = fp.read()
        return document_fromstring(self.content).xpath

def data_retrieval(self, datasets):
    """
    Retrieve a list of datasets from the ESO archive.

    Parameters
    ----------
    datasets : list of strings
        List of dataset strings to retrieve from the archive.

    Returns
    -------
    files : list of strings
        List of files that have been locally downloaded from the archive.
    """
    data_retrieval_form = self.session.get("http://archive.eso.org/cms/eso-data/eso-data-direct-retrieval.html")
    data_confirmation_form = self._activate_form(
        data_retrieval_form, form_index=-1,
        inputs={"list_of_datasets": "\n".join(datasets)})
    data_download_form = self._activate_form(data_confirmation_form, form_index=-1)
    root = html.document_fromstring(data_download_form.content)
    state = root.xpath("//span[@id='requestState']")[0].text
    while state != 'COMPLETE':
        time.sleep(2.0)
        data_download_form = self.session.get(data_download_form.url)
        root = html.document_fromstring(data_download_form.content)
        state = root.xpath("//span[@id='requestState']")[0].text
    files = []
    for fileId in root.xpath("//input[@name='fileId']"):
        fileLink = fileId.attrib['value'].split()[1]
        fileLink = fileLink.replace("/api", "").replace("https://", "http://")
        files += [self._download_file(fileLink)]
    print("Done!")
    return files

def wrapper(*args, **kw):
    try:
        doc = lhtml.document_fromstring(get(url, cache=True, **kw))
        return fn(doc, *args, **kw)
    except Exception:
        # The cached copy may be stale or corrupt; invalidate it and refetch once.
        write_cache(url, None)
        doc = lhtml.document_fromstring(get(url, cache=True, **kw))
        return fn(doc, *args, **kw)

def scrape_balance(username, password):
    session = requests.Session()
    login_response = session.get(LOGIN_URL)
    login_page = html.document_fromstring(login_response.content)
    login_page.make_links_absolute(LOGIN_URL)
    login_form_data = {
        'catalogId': '10051',
        'reLogonURL': 'MSSparksLandingPage',
        'myAcctMain': '',
        'fromOrderId': '*',
        'toOrderId': '.',
        'deleteIfEmpty': '*',
        'continue': '1',
        'createIfEmpty': '1',
        'calculationUsageId': '-1',
        'updatePrices': '0',
        'errorViewName': 'MSSparksLandingPage',
        'forgotPasswordURL': 'MSSparksLandingPage',
        'previousPage': 'logon',
        'rememberMe': 'true',
        'resetConfirmationViewName': 'ResetPasswordForm',
        'URL': '/MSNorth',
        'logonId': username,
        'logonPassword': password,
    }
    logged_in_response = session.post(
        LOGIN_ENDPOINT,
        data=login_form_data,
    )
    # Populate the auth token cookie for the API
    session.get(AUTH_TOKEN_ENDPOINT)
    auth_token = None
    for cookie, value in session.cookies.items():
        if cookie.startswith('MS_AUTH_TOKEN_'):
            auth_token = value
    if not auth_token:
        raise InvalidCredentialsException()
    logged_in_page = html.document_fromstring(logged_in_response.content)
    if logged_in_page.cssselect('form.login-form'):
        raise InvalidCredentialsException()
    offers_response = session.get(OFFERS_API, headers={
        'Authorization': 'MNSAuthToken %s' % auth_token,
    })
    offers = offers_response.json()
    return int(offers['sparks'])

def domain_to_graph(fname, type="zss"):
    '''a wrapper function that turns an html file into a DOM graph'''
    fh = open(fname, 'r')
    content = fh.read()
    fh.close()
    if type == "zss":
        html_tag = html.document_fromstring(content)
        return make_html_zssgraph(html_tag)
    if type == "nx":
        html_tag = html.document_fromstring(content)
        return make_html_nxgraph(html_tag)

def process_results_page(self, url):
    """
    Parameters
    ----------
    url : str
        URL of the lead-in results page
    """
    r = requests.get(url)
    if r.status_code != 200:
        raise RuntimeError("Could not retrieve {}".format(url))
    leadin_doc = html.document_fromstring(r.content)
    tables = leadin_doc.cssselect('.participant-list')
    # Get any following pages.
    links = leadin_doc.cssselect('.pagination a[rel]')
    while True:
        if len(links) == 0:
            break
        lst = [link for link in links if link.text.startswith('Next')]
        if len(lst) == 0:
            break
        anchor = lst[0]
        next_rel_url = anchor.get('href')
        print('\t\t{}'.format(next_rel_url))
        r = requests.get('http://results.active.com' + next_rel_url)
        doc = html.document_fromstring(r.content)
        table = doc.cssselect('.participant-list')[0]
        tables.append(table)
        links = doc.cssselect('.pagination a[rel]')
    # Search the tables.
    lst = []
    for table in tables:
        trs = table.cssselect('tr')
        # first row has stuff we don't want
        for tr in trs[1:]:
            tds = tr.getchildren()
            if len(tds) < 2:
                continue
            for regex in self.regex:
                if regex.match(tds[2].text_content()):
                    lst.append(tr)
    if len(lst) > 0:
        # We found some results. Insert the header from the first table.
        header_row = tables[0].cssselect('tr')[0]
        lst.insert(0, header_row)
        self.webify_results(leadin_doc, lst, url)

def assertTreeDiff(self, html1, html2, expected):
    """
    Asserts that the given HTML strings will produce a tree_diff of the
    expected HTML string.
    """
    # The test strings should *not* have <html> and <body> tags, for the
    # sake of brevity.
    tree1 = document_fromstring("<html><body>%s</body></html>" % html1)
    tree2 = document_fromstring("<html><body>%s</body></html>" % html2)
    expected = "<html><body>%s</body></html>" % expected
    result_tree = tree_diff(preprocess(tree1), preprocess(tree2), self.algorithm)
    got = etree.tostring(result_tree)
    self.assertEqual(got, expected)

def vote_ids_for_house(congress, session_year, options):
    vote_ids = []
    index_page = "http://clerk.house.gov/evs/%s/index.asp" % session_year
    group_page = r"ROLL_(\d+)\.asp"
    link_pattern = r"http://clerk.house.gov/cgi-bin/vote.asp\?year=%s&rollnumber=(\d+)" % session_year
    # download the index page, find the matching links to the paged listing of votes
    page = utils.download(
        index_page,
        "%s/votes/%s/pages/house.html" % (congress, session_year),
        options)
    if not page:
        logging.error("Couldn't download House vote index page, aborting")
        return None
    # extract matching links
    doc = html.document_fromstring(page)
    links = doc.xpath(
        "//a[re:match(@href, '%s')]" % group_page,
        namespaces={"re": "http://exslt.org/regular-expressions"})
    for link in links:
        # get an identifier for this inner page, for caching
        grp = re.match(group_page, link.get("href")).group(1)
        # download the inner page, find the matching links
        page = utils.download(
            urlparse.urljoin(index_page, link.get("href")),
            "%s/votes/%s/pages/house_%s.html" % (congress, session_year, grp),
            options)
        if not page:
            logging.error("Couldn't download House vote group page (%s), aborting" % grp)
            continue
        doc = html.document_fromstring(page)
        votelinks = doc.xpath(
            "//a[re:match(@href, '%s')]" % link_pattern,
            namespaces={"re": "http://exslt.org/regular-expressions"})
        for votelink in votelinks:
            num = re.match(link_pattern, votelink.get("href")).group(1)
            vote_id = "h" + num + "-" + str(congress) + "." + session_year
            if not should_process(vote_id, options):
                continue
            vote_ids.append(vote_id)
    return utils.uniq(vote_ids)

def main():
    source = urllib.urlopen(_URL).read()
    tree = html.document_fromstring(source)
    for img in get_image_top_news(tree):
        print img
    url = get_text_top_news(tree)
    print "\n", url
    detail_tree = html.document_fromstring(urllib.urlopen(url).read())
    title, snippets = get_complete_page(detail_tree)
    print "\n", title
    for snippet in snippets:
        print "\n", snippet

def assertStrips(self, html1, html2, expected, num_removals, check_ids=False):
    """
    Asserts that strip_template(html1, html2) will result in the expected
    HTML string, and that the return value is num_removals.
    """
    # The test strings should *not* have <html> and <body> tags, for the
    # sake of brevity.
    tree1 = document_fromstring('<html><body>%s</body></html>' % html1)
    tree2 = document_fromstring('<html><body>%s</body></html>' % html2)
    expected = '<html><body>%s</body></html>' % expected
    got_removals = strip_template(tree1, tree2, check_ids=check_ids)
    got_tree = etree.tostring(tree1, method='html')
    self.assertEqual(got_tree, expected)
    self.assertEqual(got_removals, num_removals)

def _parse_data(self):
    # Find the javascript and get the content
    try:
        text = self.url_response.text
        root = HTML.document_fromstring(text)
        js_list = root.xpath('head/script/@src')
        js_url = self.BASE_URL + filter(lambda x: x[-3:] == '.js', js_list)[0]
    except IndexError:
        return QueryResult(False, err='javascript file is not found')
    r = retry_requests(js_url)
    if r.ok is False:
        return QueryResult(False, err=r.error_msg)
    text = r.text.encode('utf8')
    # Get the image list
    urls_pattern = r'picAy\[\d+\] = "(.*?)"'
    urls = re.findall(urls_pattern, text)
    self.urls = [self.BASE_URL + comic_url for comic_url in urls]
    # Get the comic name
    name_pattern = 'comicName = "(.*?)"'
    self.name = re.findall(name_pattern, text)[0]

    # Get the links to the prev/next chapter
    def _get_url(pattern):
        url = re.findall(pattern, text)[0]
        if 'javascript' in url:
            return None
        return self.BASE_URL + url

    prev_pattern = 'preVolume="(.*?)"'
    next_pattern = 'nextVolume="(.*?)"'
    self.prev_url = _get_url(prev_pattern)
    self.next_url = _get_url(next_pattern)

def parsestring(s):
    ht = {}
    import string
    from lxml import html
    import re
    doc = html.document_fromstring(s)

    def getWords(text):
        return re.compile(r'\w+').findall(text)

    text_doc = doc.text_content()
    #print text_doc
    s = text_doc.lower()  # all lowercase
    s = re.sub('<[^>]*>', '', s)  # removes <something> tags
    #print s
    s = s.translate(string.maketrans("", ""), string.digits)
    s = ' '.join(getWords(s))  # separates out only words
    #print s
    #s = s.translate(string.maketrans("",""), string.punctuation)
    #s = ' '.join(s.split(','))
    #s = ' '.join(s.split('.'))
    s = s.split()
    #print s
    list_stopw = getstopwords(s)
    # remove stopwords
    #print list_stopw
    for i in s:
        if i not in list_stopw:
            #i = stem(i)  # stemming algorithm
            ht[i] = ht.get(i, 0) + 1
    return ht

def sanitize(input, cleaner=DocumentCleaner, wrap='p'):
    """Cleanup markup using a given cleanup configuration.
    Unwrapped text will be wrapped with the wrap parameter.
    """
    if 'body' not in cleaner.allow_tags:
        cleaner.allow_tags.append('body')
    input = six.u("<html><body>%s</body></html>") % input
    document = html.document_fromstring(input)
    bodies = [e for e in document if html._nons(e.tag) == 'body']
    body = bodies[0]
    cleaned = cleaner.clean_html(body)
    remove_empty_tags(cleaned)
    strip_outer_breaks(cleaned)
    if wrap is not None:
        if wrap in html.defs.tags:
            wrap_text(cleaned, wrap)
        else:
            raise ValueError(
                'Invalid html tag provided for wrapping the sanitized text')
    output = six.u('').join([etree.tostring(fragment, encoding=six.text_type)
                             for fragment in cleaned.iterchildren()])
    if wrap is None and cleaned.text:
        output = cleaned.text + output
    return output

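# A runnable sketch of the cleaner contract sanitize() relies on, using a
# stock lxml Cleaner (the project's DocumentCleaner is assumed to be one):
# allow_tags whitelists elements, and clean_html() drops everything else.
from lxml.html.clean import Cleaner

_cleaner = Cleaner(allow_tags=['p', 'b', 'body'], remove_unknown_tags=False)
print(_cleaner.clean_html('<p><b>kept</b><script>alert(1)</script></p>'))
# -> the <script> is stripped; only whitelisted tags survive
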
def make_data_and_cookies():
    """Make the POST data (including vcode) and get cookies."""
    vcode = ''
    while len(vcode) != 4:  # 'is not' compared identity, not equality
        r = requests.get(MAIN_URL)
        doc = html.document_fromstring(r.text)
        vcode_link = doc.cssselect('form img')[0].get('src')
        #print vcode_link
        vcv = doc.cssselect('input[name="vcv"]')[0].get('value')
        img_url = BASE_URL + vcode_link
        #print vcv
        img = requests.get(img_url)
        # write the image file (binary mode, since content is bytes)
        with open(IMG_PATH, 'wb') as f:
            f.write(img.content)
        fh = open(IMG_PATH, 'rb')
        imgstring = fh.read()
        fh.close()
        data = {"picstring": imgstring}
        # don't shadow the re module with the response object
        resp = requests.post("http://202.117.120.235/server.php", data=data)
        vcode = resp.text
    data = {
        "account": USERNAME,
        "password": PASSWORD,
        "vcode": vcode,
        "vcv": vcv,
    }
    return data, r.cookies

def home(request):
    if request.method == 'POST':
        form = URLForm(request.POST)
        if form.is_valid():
            url = form.cleaned_data['url']
            return redirect('/?q=' + url)
    else:
        url = request.GET.get('q')
        url = check_url(url)
        if url:
            page, content = download_page(url)
            if content == 'text/html':
                form = URLForm()
                doc = html.document_fromstring(page)
                title = get_title(doc)
                doc = replace_links(doc, url)
                head = get_head(doc)
                body = get_body(doc)
                context = {'form': form, 'head': head, 'body': body, 'title': title}
                return render(request, 'page.html', context,
                              context_instance=RequestContext(request))
            else:
                return HttpResponse(page, content_type=content)
        else:
            form = URLForm()
            return render(request, 'home.html', {'form': form},
                          context_instance=RequestContext(request))

def ParseHtml(story, corpus):
  """Parses the HTML of a news story.

  Args:
    story: The raw Story to be parsed.
    corpus: Either 'cnn' or 'dailymail'.

  Returns:
    A Story containing URL, paragraphs and highlights.
  """
  parser = html.HTMLParser(encoding=chardet.detect(story.html)['encoding'])
  tree = html.document_fromstring(story.html, parser=parser)
  # Elements to delete.
  delete_selectors = {
      'cnn': [
          '//blockquote[contains(@class, "twitter-tweet")]',
          '//blockquote[contains(@class, "instagram-media")]'
      ],
      'dailymail': [
          '//blockquote[contains(@class, "twitter-tweet")]',
          '//blockquote[contains(@class, "instagram-media")]'
      ]
  }
  # Paragraph exclusions: ads, links, bylines, comments.
  cnn_exclude = (
      'not(ancestor::*[contains(@class, "metadata")])'
      ' and not(ancestor::*[contains(@class, "pullquote")])'
      ' and not(ancestor::*[contains(@class, "SandboxRoot")])'
      ' and not(ancestor::*[contains(@class, "twitter-tweet")])'
      ' and not(ancestor::div[contains(@class, "cnnStoryElementBox")])'
      ' and not(contains(@class, "cnnTopics"))'
      ' and not(descendant::*[starts-with(text(), "Read:")])'
      ' and not(descendant::*[starts-with(text(), "READ:")])'
      ' and not(descendant::*[starts-with(text(), "Join us at")])'
      ' and not(descendant::*[starts-with(text(), "Join us on")])'
      ' and not(descendant::*[starts-with(text(), "Read CNNOpinion")])'
      ' and not(descendant::*[contains(text(), "@CNNOpinion")])'
      ' and not(descendant-or-self::*[starts-with(text(), "Follow us")])'
      ' and not(descendant::*[starts-with(text(), "MORE:")])'
      ' and not(descendant::*[starts-with(text(), "SPOILER ALERT:")])')
  dm_exclude = ('not(ancestor::*[contains(@id,"reader-comments")])'
                ' and not(contains(@class, "byline-plain"))'
                ' and not(contains(@class, "byline-section"))'
                ' and not(contains(@class, "count-number"))'
                ' and not(contains(@class, "count-text"))'
                ' and not(contains(@class, "video-item-title"))'
                ' and not(ancestor::*[contains(@class, "column-content")])'
                ' and not(ancestor::iframe)')
  paragraph_selectors = {
      'cnn': [
          '//div[contains(@class, "cnnContentContainer")]//p[%s]' % cnn_exclude,
          '//div[contains(@class, "l-container")]//p[%s]' % cnn_exclude,
          '//div[contains(@class, "cnn_strycntntlft")]//p[%s]' % cnn_exclude
      ],
      'dailymail': ['//div[contains(@class, "article-text")]//p[%s]' % dm_exclude]
  }
  # Highlight exclusions.
  he = ('not(contains(@class, "cnnHiliteHeader"))'
        ' and not(descendant::*[starts-with(text(), "Next Article in")])')
  highlight_selectors = {
      'cnn': [
          '//*[contains(@class, "el__storyhighlights__list")]//li[%s]' % he,
          '//*[contains(@class, "cnnStryHghLght")]//li[%s]' % he,
          '//*[@id="cnnHeaderRightCol"]//li[%s]' % he
      ],
      'dailymail': ['//h1/following-sibling::ul//li']
  }

  def ExtractText(selector):
    """Extracts a list of paragraphs given a XPath selector.

    Args:
      selector: A XPath selector to find the paragraphs.

    Returns:
      A list of raw text paragraphs with leading and trailing whitespace.
    """
    xpaths = map(tree.xpath, selector)
    elements = list(chain.from_iterable(xpaths))
    paragraphs = [e.text_content().encode('utf-8') for e in elements]
    # Remove editorial notes, etc.
    if corpus == 'cnn' and len(paragraphs) >= 2 and '(CNN)' in paragraphs[1]:
      paragraphs.pop(0)
    paragraphs = map(str.strip, paragraphs)
    paragraphs = [s for s in paragraphs if s and not str.isspace(s)]
    return paragraphs

  for selector in delete_selectors[corpus]:
    for bad in tree.xpath(selector):
      bad.getparent().remove(bad)
  paragraphs = ExtractText(paragraph_selectors[corpus])
  highlights = ExtractText(highlight_selectors[corpus])
  content = '\n\n'.join(paragraphs)
  return Story(story.url, content, highlights)

def _activate_form(self, response, form_index=0, inputs={}):
    from lxml import html
    # Extract the form from the response
    root = html.document_fromstring(response.content)
    form = root.forms[form_index]
    # Construct the base url
    if "://" in form.action:
        url = form.action
    elif form.action[0] == "/":
        url = '/'.join(response.url.split('/', 3)[:3]) + form.action
    else:
        url = response.url.rsplit('/', 1)[0] + '/' + form.action
    # Identify the payload format
    if form.method == 'GET':
        fmt = 'get'  # get(url, params=payload)
    elif form.method == 'POST':
        if 'enctype' in form.attrib:
            if form.attrib['enctype'] == 'multipart/form-data':
                fmt = 'multipart/form-data'  # post(url, files=payload)
            elif form.attrib['enctype'] == 'application/x-www-form-urlencoded':
                fmt = 'application/x-www-form-urlencoded'  # post(url, data=payload)
        else:
            fmt = 'post'  # post(url, params=payload)
    # Extract the payload from the form
    payload = []
    for form_input in form.inputs:
        key = form_input.name
        value = None
        is_file = False
        if isinstance(form_input, html.InputElement):
            value = form_input.value
            if 'type' in form_input.attrib:
                is_file = (form_input.attrib['type'] == 'file')
        elif isinstance(form_input, html.SelectElement):
            if isinstance(form_input.value, html.MultipleSelectOptions):
                value = []
                for v in form_input.value:
                    value += [v]
            else:
                value = form_input.value
                if value is None:
                    value = form_input.value_options[0]
        if key in inputs.keys():
            value = "{0}".format(inputs[key])
        if (key is not None) and (value is not None):
            if fmt == 'multipart/form-data':
                if is_file:
                    payload += [(key, ('', '', 'application/octet-stream'))]
                else:
                    if type(value) is list:
                        for v in value:
                            payload += [(key, ('', v))]
                    else:
                        payload += [(key, ('', value))]
            else:
                if type(value) is list:
                    for v in value:
                        payload += [(key, v)]
                else:
                    payload += [(key, value)]
    # Send the payload
    if fmt == 'get':
        response = self.request("GET", url, params=payload)
    elif fmt == 'post':
        response = self.request("POST", url, params=payload)
    elif fmt == 'multipart/form-data':
        response = self.request("POST", url, files=payload)
    elif fmt == 'application/x-www-form-urlencoded':
        response = self.request("POST", url, data=payload)
    return response

def page_html(self):
    return html.document_fromstring(self.response.text)

args = parser.parse_args()
year = args.year
headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:66.0) Gecko/20100101 Firefox/66.0'
}
try:
    os.mkdir(f'{year}_AER')
except FileExistsError:  # a bare except would also swallow real errors
    pass
url = 'https://www.aeaweb.org/journals/aer/issues'
a = html.document_fromstring(requests.get(url, headers=headers).text)
issues = [
    ref for ref, vol in zip(a.xpath('//a[@href]/@href'),
                            a.xpath('//a[@href]/text()'))
    if 'issues' in ref and str(year) in vol
]
for issue in issues:
    url2 = 'https://www.aeaweb.org' + issue
    print(url2)
    b = html.document_fromstring(requests.get(url2, headers=headers).text)
    [x for x in b.xpath('//a[@href]/text()') if 'articles?' in x]

def get_csrf_token(session, base_url):
    csrf_response = session.get(base_url + '/login')
    tree = html.document_fromstring(csrf_response.content)
    return tree.xpath("//input[contains(@name, '_csrf_token')]")[0].value

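# Hypothetical usage of get_csrf_token() above: fetch the token from the
# login page, then echo it back in the login POST (the site, credentials and
# field name are assumptions for illustration).
import requests
from lxml import html

session = requests.Session()
token = get_csrf_token(session, 'https://example.com')
session.post('https://example.com/login',
             data={'_csrf_token': token, 'username': 'user', 'password': 'pass'})
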
async def parse_ea44(self, session: aiohttp.ClientSession, order: Data) -> Data:
    """Parse the data of a single procurement."""
    # Check whether the record already exists in the database
    data = self.db_session.query(Data).filter(Data.id == order.id).all()
    if data:
        return data[0]
    text = await self._get_request(session, order.tender_link)
    order_document = document_fromstring(text)
    # Parse the main procurement info: number, price, customer, date
    card_info_container = order_document.cssselect('.cardMainInfo')[0]
    tender_id = card_info_container.cssselect('.cardMainInfo__purchaseLink')
    if not tender_id:
        tender_id = ''
    else:
        tender_id = self._normalizer(tender_id[0].text_content())
    tender_object = card_info_container.xpath('./div[1]/div[2]/div[1]/span[2]')
    if not tender_object:
        tender_object = ''
    else:
        tender_object = self._normalizer(tender_object[0].text_content())
    customer = card_info_container.xpath('./div[1]/div[2]/div[2]/span[2]')
    if not customer:
        customer = ''
    else:
        customer = self._normalizer(customer[0].text_content())
    tender_price = card_info_container.cssselect('.cost')
    if not tender_price:
        tender_price = ''
    else:
        tender_price = self._normalizer(tender_price[0].text_content())
    tender_date = card_info_container.xpath('./div[2]/div[2]/div[1]/div[1]/span[2]')
    if not tender_date:
        tender_date = ''
    else:
        tender_date = self._normalizer(tender_date[0].text_content())
    # General procurement info: trading-platform address and procurement object
    general_information_container = order_document.xpath('//div[@class="wrapper"]/div[2]')
    tender_adress = general_information_container[0].xpath('.//div[@class="col"]/section[3]/span[2]')
    if not tender_adress:
        tender_adress = ''
    else:
        tender_adress = self._normalizer(tender_adress[0].text_content())
    # Contract conditions
    condition_container = self.get_cotract_conditions_container(
        order_document.xpath('//div[@id="custReqNoticeTable"]/div'))
    if condition_container is not None:
        tender_delivery_adress = condition_container.xpath('.//div[@class="col"]/section[2]/span[2]')
        if not tender_delivery_adress:
            tender_delivery_adress = ''
        else:
            tender_delivery_adress = self._normalizer(tender_delivery_adress[0].text_content())
        tender_term = condition_container.xpath('.//div[@class="row"]/section[3]/span[2]')
        if not tender_term:
            tender_term = ''
        else:
            tender_term = self._normalizer(tender_term[0].text_content())
    else:
        tender_delivery_adress = ''
        tender_term = ''
    # Parse info about the procurement object
    tender_object_info = self.parse_tender_object_info(order_document)
    # Parse the winner
    try:
        winner = await self.parse_tender_winner(session, order.tender_link)
    except Exception:
        winner = []
    if len(winner) < 3:
        winner = ['', '', '']
    # Parse document links
    term_document_link = order.tender_link.replace('common-info', 'documents')
    term_document_data = await self._get_request(session, term_document_link)
    term_document_links = document_fromstring(term_document_data).xpath(
        '//span[@class="section__value"]/a[@title]/@href')
    order.tender_object = tender_object
    order.customer = customer
    order.tender_price = self._tender_price_handler(tender_price)
    order.tender_date = self._tender_date_handler(tender_date)
    order.tender_adress = tender_adress
    order.tender_delivery = tender_delivery_adress
    order.tender_term = tender_term
    for object_info in self._handle_tender_objects(tender_object_info):
        order.objects.append(object_info)
    for document_link_data in term_document_links:
        tender_link = TenderLinks()
        tender_link.link = document_link_data
        tender_link.data_id = order.id
        order.document_links.append(tender_link)
    order.winner.append(
        Winners(name=winner[0], position=winner[1], price=winner[2]))
    order.type = 'fz44'
    return order

# -*- coding: utf-8 -*-
import sys
import os
import time
from lxml import etree, html

# Set up a UTF-8 unicode environment (Python 2 only)
reload(sys)
sys.setdefaultencoding('utf-8')

# Path to the .htm file; read it in
path = "1.htm"
content = open(path, "rb").read()
page = html.document_fromstring(content)  # parse the file
text = page.text_content()  # strip all tags
print text  # print the tag-free parse result

def s_lxml(self):
    tree = html.document_fromstring(self.page)
    self.tree = tree  # xpath tree
    return tree

def document_fromstring(string):
    return _html.document_fromstring(string, parser=utf8parser)

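# Sketch of the utf8parser the wrapper above assumes: an lxml HTML parser
# pinned to UTF-8, so byte input is never mis-detected as another encoding.
# (Hypothetical definition; the real module may configure it differently.)
from lxml import html as _html
utf8parser = _html.HTMLParser(encoding='utf-8')
# Example: b'\xc3\xa9' now reliably parses as 'é' instead of mojibake.
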
from lxml import html
import urllib.request  # plain 'import urllib' doesn't expose urlretrieve
import requests

URL = "http://goalkicker.com/"
response = requests.get(URL)
sourceCode = response.content
htmlElem = html.document_fromstring(sourceCode)
books = htmlElem.cssselect('[class="bookContainer grow"]')
for book in books:
    urlSuffix = book[0].get('href')
    response = requests.get(URL + urlSuffix)
    sourceCode = response.content
    htmlElem = html.document_fromstring(sourceCode)
    download = htmlElem.cssselect('[id="footer"]')
    pdfTitle = download[0][0].get('onclick')[15:-1]
    link = URL + urlSuffix + pdfTitle
    downloadDir = pdfTitle
    urllib.request.urlretrieve(link, downloadDir)  # download the pdf
    print(pdfTitle)

def getFanficInfo(url):
    try:
        ffinfo = {}
        r = req.get(url)
        if not check404(r):
            doc = html.document_fromstring(r.text)
            # Self-explanatory: pull the needed info out via xpath
            ffinfo['id'] = url[url.find('ficbook.net/readfic/') + 20:]
            ffinfo['name'] = doc.xpath('/html/body/table[2]/tr[2]/td[1]/table/tr[2]/td[2]/h1[1]/text()')[0].strip()
            ffinfo['author'] = doc.xpath('/html/body/table[2]/tr[2]/td[1]/table/tr[2]/td[2]/table/tr/td[1]/a[1]/text()')[0]
            ffinfo['authorlink'] = 'http://ficbook.net' + doc.xpath('/html/body/table[2]/tr[2]/td[1]/table/tr[2]/td[2]/table/tr/td[1]/a[1]/@href')[0]
            ffinfo['likes'] = int(doc.xpath('/html/body/table[2]/tr[2]/td[1]/table/tr[2]/td[2]/table/tr/td[1]/div[1]/div[1]/text()')[0].strip().replace('+', ''))
            ffinfo['description'] = doc.xpath('/html/body/table[2]/tr[2]/td[1]/table/tr[2]/td[2]/table/tr/td[2]/span[1]/text()')[0].strip()
            warings = []
            genders = []
            # Split into genres and warnings.
            # Not sure what it's for, but let it stay.
            genders_buf = doc.xpath('/html/body/table[2]/tr[2]/td[1]/table/tr[2]/td[2]/table/tr/td[1]/a')
            for g in genders_buf:
                try:
                    href = g.attrib['href']
                    title = g.attrib['title']
                except KeyError:
                    continue
                if not href in ratings and href.startswith('/ratings/'):
                    title = title[3:title.find('</b>')]
                    if title in genders_list:
                        genders.append(title)
                    elif title in warings_list:
                        warings.append(title)
            ffinfo['genders'] = genders
            ffinfo['warings'] = warings
            # And here: fandom, size and rating
            # -------------------------------
            # -- One NC, please.
            # -- An original, or a Lucky Star one?
            # -- What genres are there for LS?
            # -- Only yuri.
            # -- Wow, make it two!
            # -------------------------------
            buf_list = doc.xpath('/html/body/table[2]/tr[2]/td[1]/table/tr[2]/td[2]/table/tr/td[1]/a')
            for item in buf_list:
                buf_link = '/html/body/table[2]/tr[2]/td[1]/table/tr[2]/td[2]/table/tr/td[1]/a[@href = "' + str(item.get('href')) + '"]'
                if doc.xpath(buf_link + '/text()'):
                    value = doc.xpath(buf_link + '/text()')[0].strip()
                    if '/fanfiction/' in str(item.get('href')):
                        ffinfo['fandom'] = value
                    elif str(item.get('href')) in ratings:
                        ffinfo['rating'] = value
                    elif '/sizes/' in str(item.get('href')):
                        ffinfo['size'] = value
            if doc.xpath('/html/body/table[2]/tr[2]/td[1]/table/tr[2]/td[2]/div[@class="part_list"]'):
                parts = doc.xpath('/html/body/table[2]/tr[2]/td[1]/table/tr[2]/td[2]/div[@class="part_list"]/a/@href')
                parts_urls = list(map(lambda x: 'http://ficbook.net' + x.replace('#part_content', ''), parts))
            else:
                parts_urls = [url]
            ffinfo['parts'] = parts_urls
            return ffinfo
        else:
            QtGui.QMessageBox.information(None, 'Error 404', 'This page does not exist.')
    except Exception as e:
        showErrorMessage(e)

# SPDX-License-Identifier: MIT
# Copyright (C) 2020 Tobias Gruetzmacher
from lxml import html

from dosagelib.xml import NS

import httpmocks

tree = html.document_fromstring(httpmocks.content('zp-222'))


class TestXML:
    def xpath(self, path):
        return tree.xpath(path, namespaces=NS)

    def test_class_ext(self):
        assert len(self.xpath('//li[d:class("menu-item-3773")]')) == 1
        assert len(self.xpath('//ul[d:class("menu")]')) == 1
        assert len(self.xpath('//li[d:class("menu-item-object-custom")]')) == 2
        assert len(self.xpath('//li[d:class("menu-item")]')) == 25

    def test_re_ext(self):
        assert len(self.xpath(r'//img[re:test(@src, "posters.*jpg")]')) == 1

def search_title(title):
    if not LMXL:
        return None
    data = get_page("/index.php?first=no&what=&kp_query=", 1, title.decode('utf8'))
    doc = html.document_fromstring(data)
    search_results = []
    # Check whether we got the right page (i.e. the search results page).
    # The marker text means "Most likely, you are looking for:".
    regexp = re.compile(unicode("Скорее всего, вы ищете:", "utf8"), re.DOTALL)
    result = regexp.search(data)
    # If not, parse the film page that Kinopoisk redirected us to:
    #if result == None:
    #    titlestr = doc.xpath("//h1[@class='moviename-big']")[0].text.strip()
    #    try:
    #        title = '%s' % (normilize_string(titlestr))
    #    except:
    #        title = 'None'
    #    try:
    #        idstr = '\nid:%s' % (doc.xpath("//link[@rel='canonical']")[0].attrib["href"].split("/")[-2])
    #    except:
    #        idstr = '\nid:n/a'
    #    cur_movie = (title.encode("utf-8"), idstr.encode("utf-8"))
    #    search_results.append(cur_movie)
    #    #print '%s' % (search_results)
    #    return search_results
    if result:
        titleNodes = doc.xpath(
            "//div[@class='search_results' or @class='search_results search_results_simple']/div[@class='element most_wanted' or @class='element']/div[@class='info']"
        )
        for titleNode in titleNodes:
            yearInfo = titleNode.xpath("p//span[@class='year']/text()")
            titleInfo = titleNode.xpath("p[@class='name']/a")
            #rateInfo = titleNode.xpath("div[@class='rating']/text()")
            genreInfo = titleNode.xpath("span[@class='gray']/text()")
            try:
                year = yearInfo[0]
            except:
                year = 'n/a'
            try:
                title = '%s (%s)' % (normilize_string(titleInfo[0].text), year)
            except:
                title = 'none'
            try:
                id = '\nid:%s' % (titleInfo[0].attrib["data-id"])
            except:
                id = '\nid:n/a'
            #try:
            #    rate = '\n%s' % (rateInfo[0])
            #except:
            #    rate = ''
            try:
                genre = '\n%s\n%s\n%s' % (
                    normilize_string(genreInfo[0]),
                    normilize_string(genreInfo[1].replace(',', '').replace('...', '')),
                    normilize_string(genreInfo[3]))
            except:
                genre = ''
            search = (title.encode("utf-8"), id.encode("utf-8"), genre.encode("utf-8"))
            search_results.append(search)
            #print '%s %s %s\n' % (title, id, genre)
    return search_results

def load_offline(self, path):
    with open(path, 'r') as f:
        s = f.read()
    self.html = html.document_fromstring(s)

    }
    return obj_template


base_url = 'https://www.tripadvisor.com/Restaurants-g274967-Riga_Riga_Region.html'
opened_page = 1

# linux chromedriver path: /mnt/c/chromedriver.exe
# windows chromedriver path: C:\\chromedriver.exe
browser = webdriver.Chrome("C:\\chromedriver.exe")
browser.get(base_url)
time.sleep(5)
content = browser.page_source
trip_content = html.document_fromstring(content)
contents_seen = 0
pages_total = get_number_of_pages(trip_content)
main_window = browser.current_window_handle

while opened_page <= pages_total:
    contents_seen = 0
    # get the number of restaurant links on the given search results page
    page_content_count = get_number_of_page_links(trip_content)
    while contents_seen < page_content_count:
        # get the list of restaurant list items on the currently opened page
        all_restaurants_link_elements = browser.find_elements(
            By.XPATH, '//a[@class="photo_link"]')

def parse(self, html_src: str) -> 'list[Result]':
    """
    Parses an html document for a given XPath expression. Any resulting
    node can optionally be filtered against a regular expression.
    """
    # The following XPath extension functions can be used in the xpath
    # fields of a task's selectors.

    def textify(node):
        return (str(node.text) if hasattr(node, "text") else str(node)).strip()

    def merge_lists(context, *args):
        """Merge the items of lists at the same positions. If one list is
        shorter, its last element is repeated."""
        try:
            return [" ".join([textify(arg[min(i, len(arg) - 1)]) for arg in args])
                    for i in range(max(map(len, args)))]
        except Exception as e:
            return [""]

    def exe(context, nodes, path):
        """Executes a given xpath with each node in the first xpath as context node."""
        try:
            return [textify(node.xpath(path).pop()) if node.xpath(path) else ""
                    for node in nodes]
        except Exception as e:
            return [""]

    def all(context, nodes):
        return [" ".join(textify(node) for node in nodes)]

    ns = etree.FunctionNamespace(None)
    ns['merge_lists'] = merge_lists
    ns['exe'] = exe
    ns['all'] = all

    if not self.selectors.all():
        return html_src  # nothing to do

    parsed_tree = html.document_fromstring(html_src)
    selectors_results = []
    for selector in self.selectors.all():
        nodes = parsed_tree.xpath(selector.xpath)
        nodes = [textify(node) for node in nodes]
        if selector.regex:
            # Apply the regex to every single node
            selector_results = []
            for node in nodes:
                node = str(node)
                regex_result = re.search(selector.regex, node, re.DOTALL | re.UNICODE)
                if regex_result:
                    if regex_result.groups():
                        selector_results += [regex_result.groups()[-1]]
                    else:
                        selector_results += [regex_result.group()]
                else:
                    selector_results += [None]
        else:
            selector_results = nodes
        selector_results = [selector.cast(data) if data is not None else None
                            for data in selector_results]  # cast to type
        selectors_results.append(selector_results)

    # Convert selector results from a tuple of lists to a list of tuples
    results = []
    for y in range(max([len(selectors_results[list(self.selectors.all()).index(key_selector)])
                        for key_selector in self.selectors.all() if key_selector.is_key])):
        # Take as many results as there are results for a key selector
        result = Result(task_id=self.name)
        for x, selector in enumerate(self.selectors.all()):
            selectors_results[x] = selectors_results[x] or [None]  # guarantee an element is there
            setattr(result, selector.name,
                    selectors_results[x][min(y, len(selectors_results[x]) - 1)])
        result.key = result.get_key()
        if result.key:
            results.append(result)
    return results

def page(request, slug):
    my_list = []
    page = HelpPages.objects.get(slug=slug)
    left_menu = get_left_menu(page)
    page_title = page.title
    anchors = None
    if page.content:
        content_html = html.document_fromstring(page.content)
        anchors = content_html.cssselect("h2")
        setted_anchors = []
        for anchor in anchors:
            if anchor.text and anchor.text != "":
                anchor_link = slugify(anchor.text, allow_unicode=True)
                if anchor_link in setted_anchors:
                    anchor_link = "{}_".format(anchor_link)
                anchor.attrib['id'] = anchor_link
                anchor.insert(
                    0,
                    etree.XML('<a href="#{}" class="anchor">#</a>'.format(anchor_link)))
                setted_anchors.append(anchor_link)
        page.content = html.tostring(
            content_html,
            encoding='unicode',
            pretty_print=True,
        )
    sub_pages = HelpPages.objects.filter(parent_page=page)
    # Next page
    if HelpPages.objects.filter(parent_page_id=page.parent_page_id,
                                status=1,
                                tree_id__gt=page.tree_id).exists():
        next_page = HelpPages.objects.filter(
            parent_page_id=page.parent_page_id,
            status=1,
            tree_id__gt=page.tree_id)[0]
    elif page.parent_page and HelpPages.objects.filter(
            parent_page=None,
            status=1,
            tree_id__gt=page.parent_page.tree_id).exists():
        next_page = HelpPages.objects.filter(
            parent_page=None,
            status=1,
            tree_id__gt=page.parent_page.tree_id)[0]
    if HelpPages.objects.filter(parent_page_id=page.id, status=1).exists():
        next_page = HelpPages.objects.filter(parent_page_id=page.id, status=1)[0]
    # Previous page
    if HelpPages.objects.filter(parent_page_id=page.parent_page_id,
                                status=1,
                                tree_id__lt=page.tree_id).exists():
        prev_page = HelpPages.objects.filter(
            parent_page_id=page.parent_page_id,
            status=1,
            tree_id__lt=page.tree_id).last()
        if HelpPages.objects.filter(parent_page_id=prev_page.id, status=1).exists():
            prev_page = HelpPages.objects.filter(parent_page_id=prev_page.id, status=1).last()
    elif page.parent_page:
        prev_page = page.parent_page
    else:
        prev_page = {'title': "Help"}  # originally "Помощь"
    return render(request, "page/index.html", locals())

        self._task = task
        self._body = body
        self._body_type = None
        self.items = dict()
        self.tasks = list()
        self._md5_mk = hashlib.md5()
        if body[0] == '{' and body[-1] == '}' or body[0] == '[' and body[-1] == ']':
            try:
                self._json_dict = json.loads(body)
            except Exception as e:  # 'except Exception, e' is Python 2 only
                print(e)
            else:
                self._body_type = self.JSON
        else:
            self._doc = html.document_fromstring(body)
            self._body_type = self.HTML
        self._parse()

    def _parse(self):
        pass

    def _xpath(self, xp):
        if self._body_type == self.HTML:
            return self._doc.xpath(xp)
        return None

    def _get(self, key, default=None):
        if self._body_type == self.JSON:
            return self._json_dict.get(key, default)
        return None

from lxml import html
import requests
import math


def prod(l):
    a = 1
    for term in l:
        a *= int(term)
    return a


page = requests.post('https://projecteuler.net/problem=8', verify=False)
data = html.document_fromstring(page.text)
num = data.xpath('//p[@style="font-family:courier new;text-align:center;"]/text()')
num = ''.join(num).replace('\r', '').replace('\n', '')
res = []
for i, n in enumerate(num):
    if i + 13 < len(num):
        r = num[i:i + 13]
        res.append(prod(r))
    else:
        break
print(max(res))

def update_page_info_module(course_id, page_name):
    # Use the Canvas API to GET the page
    # GET /api/v1/courses/:course_id/pages/:url
    url = baseUrl + '%s/pages/%s' % (course_id, page_name)
    if Verbose_Flag:
        print(url)
    payload = {}
    r = requests.get(url, headers=header, data=payload)
    if r.status_code == requests.codes.ok:
        page_response = r.json()
        if Verbose_Flag:
            print("body: {}".format(page_response["body"]))
        document = html.document_fromstring(page_response["body"])
        raw_text = document.text_content()
        print("raw_text: {}".format(raw_text))
        title = page_response["title"]
    else:
        print("No page {}".format(page_name))
        return False
    # transform page
    GQMContent = document.xpath('//p[@class="GQMContent"]')
    information_for_on_page = {}  # default when no GQMContent exists yet
    if len(GQMContent) > 0:
        text_of_GQMContent = GQMContent[0].text
        print("Existing information as text is {}".format(text_of_GQMContent))
        information_for_on_page = json.loads(text_of_GQMContent)
        print("Existing information is {}".format(information_for_on_page))
        document2 = deepcopy(document)
        # trim off the GQMContent paragraph before processing the raw_text
        for elem in document2.xpath('//p[@class="GQMContent"]'):
            elem.getparent().remove(elem)
        raw_text = document2.text_content()
        print("raw_text: {}".format(raw_text))
    information_for_on_page["Words"] = len(raw_text.split())
    information_for_on_page["Characters"] = len(raw_text)
    # see http://www.erinhengel.com/software/textatistic/
    information_for_on_page["Textatistic.counts"] = Textatistic(raw_text).counts
    information_for_on_page["Textatistic.statistics"] = Textatistic(raw_text).dict()
    if len(GQMContent) == 0:
        # no GQMContent found on this page, so add some
        print("No GQMContent found - adding some")
        body = document.find('.//body')
        if body is None:
            print("page has no <body>")
        else:
            GQMContent_string = '<p class="GQMContent">' + json.dumps(information_for_on_page) + "</p>"
            body.append(html.etree.XML(GQMContent_string))
            print("initial updated document {}".format(html.tostring(document)))
    else:
        GQMContent[0].text = json.dumps(information_for_on_page)
    print("updated document {}".format(html.tostring(document)))
    # Use the Canvas API to insert the page
    # PUT /api/v1/courses/:course_id/pages/:uid
    #   wiki_page[title]
    #   wiki_page[published]
    #   wiki_page[body]
    url = baseUrl + '%s/pages/%s' % (course_id, page_name)
    if Verbose_Flag:
        print(url)
    payload = {
        'wiki_page[title]': title,
        'wiki_page[published]': False,
        'wiki_page[body]': str(html.tostring(document, pretty_print=True, method="html"), 'utf-8')
    }
    r = requests.put(url, headers=header, data=payload)
    write_to_log(r.text)
    print("status code {}".format(r.status_code))
    if r.status_code == requests.codes.ok:
        return True
    else:
        print("Unable to update page {}".format(page_name))
        return False

def compile(self, content):
    self._parser_content = HTML.document_fromstring(content)

def parse_ea44(self, link):
    inform_request = self.session.get(link)
    inform_request.raise_for_status()
    order_document = document_fromstring(inform_request.text)
    # Parse the main procurement info: number, price, customer, date
    card_info_container = order_document.cssselect('.cardMainInfo')[0]
    tender_id = card_info_container.cssselect('.cardMainInfo__purchaseLink')
    if not tender_id:
        tender_id = ''
    else:
        tender_id = self._normalizer(tender_id[0].text_content())
    tender_object = card_info_container.xpath('./div[1]/div[2]/div[1]/span[2]')
    if not tender_object:
        tender_object = ''
    else:
        tender_object = self._normalizer(tender_object[0].text_content())
    customer = card_info_container.xpath('./div[1]/div[2]/div[2]/span[2]')
    if not customer:
        customer = ''
    else:
        customer = self._normalizer(customer[0].text_content())
    tender_price = card_info_container.cssselect('.cost')
    if not tender_price:
        tender_price = ''
    else:
        tender_price = self._normalizer(tender_price[0].text_content())
    tender_date = card_info_container.xpath('./div[2]/div[2]/div[1]/div[1]/span[2]')
    if not tender_date:
        tender_date = ''
    else:
        tender_date = self._normalizer(tender_date[0].text_content())
    # General procurement info: trading-platform address and procurement object
    general_information_container = order_document.xpath('//div[@class="wrapper"]/div[2]')
    tender_adress = general_information_container[0].xpath('.//div[@class="col"]/section[3]/span[2]')
    if not tender_adress:
        tender_adress = ''
    else:
        tender_adress = self._normalizer(tender_adress[0].text_content())
    # Contract conditions
    condition_container = self._get_cotract_conditions_container(
        order_document.xpath('//div[@id="custReqNoticeTable"]/div'))
    if condition_container is not None:
        tender_delivery_adress = condition_container.xpath('.//div[@class="col"]/section[2]/span[2]')
        if not tender_delivery_adress:
            tender_delivery_adress = ''
        else:
            tender_delivery_adress = self._normalizer(tender_delivery_adress[0].text_content())
        tender_term = condition_container.xpath('.//div[@class="row"]/section[3]/span[2]')
        if not tender_term:
            tender_term = ''
        else:
            tender_term = self._normalizer(tender_term[0].text_content())
    else:
        tender_delivery_adress = ''
        tender_term = ''
    # Parse info about the procurement object
    tender_object_info = self._parse_tender_object_info(order_document)
    # Parse the winner
    winner = self._parse_tender_winner(link)
    if len(winner) < 3:
        winner = ['', '', '']
    # Parse document links
    term_document_link = link.replace('common-info', 'documents')
    term_document_data = self.session.get(term_document_link)
    term_document_data.raise_for_status()
    term_document_links = document_fromstring(term_document_data.text).xpath(
        '//span[@class="section__value"]/a[@title]/@href')
    return {
        'tender_id': tender_id,
        'tender_object': tender_object,
        'customer': customer,
        'tender_price': tender_price,
        'tender_date': tender_date,
        'tender_adress': tender_adress,
        'tender_delivery': tender_delivery_adress,
        'tender_term': tender_term,
        'tender_object_info': tender_object_info,
        'document_links': term_document_links,
        'tender_winner': winner,
        'type': 'fz44',
        'link': link,
    }

def error_message(message):
    return json.dumps({"error": message}, indent=4, sort_keys=True)

base_wikipedia_url = "https://en.wikipedia.org"

# Throw an error for missing command-line arguments.
if len(sys.argv) != 3:
    print(error_message("Requires 2 command-line arguments: url and number."))
    sys.exit()

# Extract command-line arguments.
url = sys.argv[1]
number = int(sys.argv[2])

# Make the request and extract the table.
response = requests.get(url)
tree = html.document_fromstring(response.text)
table = tree.xpath('//table[@class="infobox"]')

# Throw an error if the table is not found.
if len(table) == 0:
    print(error_message("No cabinet tables found for this presidency."))
    sys.exit()

# Retrieve the appropriate table.
# For all presidents this is the first table, except for Grover Cleveland.
# Because he had two non-consecutive terms, we must check which term we're
# referring to; his second term (the 24th presidency) is on a second table.
#
# TODO: Make this more generic so that non-consecutive terms are handled generically.
else:
    table = table[1 if number == 24 else 0]
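# A self-contained sketch of the infobox-selection logic above: when a page
# holds two infobox tables (non-consecutive terms), pick one by presidency
# number. The HTML is a hypothetical stand-in for a Wikipedia article.
from lxml import html

page = '''
<table class="infobox"><tr><td>First term cabinet</td></tr></table>
<table class="infobox"><tr><td>Second term cabinet</td></tr></table>'''
tree = html.document_fromstring(page)
tables = tree.xpath('//table[@class="infobox"]')

number = 24  # Grover Cleveland's second, non-consecutive term
table = tables[1 if number == 24 else 0]
print(table.text_content().strip())  # Second term cabinet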
def main():
    with open(filename) as bookmarks_file:
        data = bookmarks_file.read()
    geolocator = Nominatim()
    kml = simplekml.Kml()
    lst = list()
    # Hacky, and doesn't work for all of the stars:
    lat_re = re.compile(r'markers:[^\]]*latlng[^}]*lat:([^,]*)')
    lon_re = re.compile(r'markers:[^\]]*latlng[^}]*lng:([^}]*)')
    coords_in_url = re.compile(r'\?q=(-?\d{,3}\.\d*),\s*(-?\d{,3}\.\d*)')
    doc = document_fromstring(data)
    for element, attribute, url, pos in doc.body.iterlinks():
        if 'maps.google' not in url:
            continue
        description = element.text or ''
        print(description)
        location = None
        match = coords_in_url.search(url)
        if match:
            # Coordinates are in the URL itself.
            latitude, longitude = match.groups()
        else:
            # Load the map and find the coordinates in the source of the page.
            try:
                sock = urlopen(url.replace(' ', '+'))
            except Exception as e:
                print('Connection problem:')
                print(repr(e))
                print('Waiting 2 minutes and trying again')
                time.sleep(120)
                sock = urlopen(url.replace(' ', '+'))
            content = sock.read().decode('utf-8', errors='replace')
            sock.close()
            time.sleep(3)  # Don't annoy the server
            try:
                latitude = lat_re.findall(content)[0]
                longitude = lon_re.findall(content)[0]
            except IndexError:
                try:
                    for line in content.split('\n'):
                        if re.search(r'cacheResponse\(', line):
                            # In the future we could extract more fields from here.
                            splitline = line.split('(')[1].split(')')[0] + '"]'
                            null = None  # referenced by the eval'd data
                            values = eval(splitline)
                            print(values[8][0][1])
                            longitude = str(values[0][0][1])
                            latitude = str(values[0][0][2])
                            continue
                except IndexError:
                    print('[Coordinates not found]')
                    continue
        print()
        print(latitude, longitude)
        try:
            location = geolocator.reverse(latitude + ", " + longitude)
            print(location.address)
        except ValueError:
            print('[Invalid coordinates]')
        print()
        kml.newpoint(name=description, coords=[(float(longitude), float(latitude))])
        lst.append({
            'latitude': latitude,
            'longitude': longitude,
            'name': description,
            'url': url,
            'address': location.address if location else 'error'
        })
        # This is here because there's a tendency for this script to fail
        # partway through, so at least you can get a partial result.
        kml.save("GoogleBookmarks.kml")
        with open('GoogleBookmarks.json', mode='w') as listdump:
            listdump.write(json.dumps(lst))
    sys.stdout.flush()
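# A self-contained check of the coords_in_url pattern above: many Google Maps
# bookmark URLs carry the coordinates directly in the ?q= query parameter.
# The sample URL is hypothetical.
import re

coords_in_url = re.compile(r'\?q=(-?\d{,3}\.\d*),\s*(-?\d{,3}\.\d*)')
sample_url = 'https://maps.google.com/maps?q=48.8584,2.2945'
match = coords_in_url.search(sample_url)
if match:
    latitude, longitude = match.groups()
    print(latitude, longitude)  # 48.8584 2.2945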
def parse_html(self, html_source):
    import lxml.html as HTML
    root = HTML.document_fromstring(html_source)
    with open('test.html', 'w') as f:  # debugging aid: dump the raw page
        f.write(html_source)
    # Amazon varies the result-item class; sniff it from the page, with a fallback.
    class_name = re.findall(r'<li id="result_\d+".*? class="(.*?)"', html_source)
    if class_name:
        class_name = class_name[0]
    else:
        class_name = "s-result-item celwidget"
    print(class_name)
    pdivs = root.xpath(".//li[@class='" + class_name + "']")
    print('len', len(pdivs))
    products = []
    if len(pdivs) == 0:
        return products
    for pdiv in pdivs:
        try:
            product = {}
            ASIN = pdiv.xpath("./@data-asin")
            link = pdiv.xpath(".//a[@class='a-link-normal a-text-normal']/@href")
            image = pdiv.xpath(".//a[@class='a-link-normal a-text-normal']/img/@src")
            title = pdiv.xpath(".//h2[@class='a-size-base a-color-null s-inline "
                               "s-access-title a-text-normal']/text()")
            price = pdiv.xpath(".//span[@class='a-color-price']/text()"
                               "|.//span[@class='a-size-base a-color-price s-price "
                               "a-text-bold']/text()")
            isfba = len(pdiv.xpath(".//i[@class='a-icon a-icon-prime a-icon-small "
                                   "s-align-text-bottom']"))
            review = pdiv.xpath(".//a[@class='a-size-small a-link-normal "
                                "a-text-normal']/text()")
            # Each field falls back to a default when its xpath matched nothing.
            product['ASIN'] = ASIN[0] if ASIN else ''
            product['link'] = link[0] if link else ''
            product['image'] = image[0] if image else ''
            product['title'] = title[0] if title else ''
            product['price'] = price[0] if price else '0'
            product['image_list'] = []
            product['isfba'] = isfba
            # can't find these code 20170117
            # product_count = MarketProductsCandidates.objects.filter(
            #     product_id=ASIN[0], market__market_name="Amazon.com")
            # if product_count.exists():
            #     product['in_db'] = 'True'
            # else:
            #     product['in_db'] = 'False'
            # productas = MarketProductAssignment.objects.filter(
            #     product__product_id=ASIN[0],
            #     product__market__market_name="Amazon.com")
            # if productas.exists():
            #     product['in_assign'] = 'True'
            # else:
            #     product['in_assign'] = 'False'
            try:
                if not review or review[-1] == ' ':
                    product['review'] = 0
                else:
                    product['review'] = int(review[-1].replace(',', ''))
            except (ValueError, IndexError):
                product['review'] = 0
            products.append(product)
        except Exception as e:
            print(e, 'location: parse_html')
    return products
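# A self-contained sketch of the fallback-to-default extraction used in
# parse_html above; the <li> markup is a hypothetical stand-in for an Amazon
# search-result item.
from lxml import html

item = html.fromstring(
    '<li id="result_0" data-asin="B000TEST01" class="s-result-item celwidget">'
    '<a class="a-link-normal a-text-normal" href="/dp/B000TEST01">x</a></li>')

asin = item.xpath("./@data-asin")
link = item.xpath(".//a[@class='a-link-normal a-text-normal']/@href")
price = item.xpath(".//span[@class='a-color-price']/text()")

product = {
    'ASIN': asin[0] if asin else '',
    'link': link[0] if link else '',
    'price': price[0] if price else '0',  # a missing price defaults to '0'
}
print(product)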
def get_headers(self, product_ids):
    """
    Get the headers associated with a list of data product IDs.

    This method returns a `~astropy.table.Table` where the rows correspond
    to the provided data product IDs, and the columns are the FITS header
    keywords.

    Note: the additional column ``'DP.ID'`` in the returned table holds the
    provided data product IDs.

    Parameters
    ----------
    product_ids : either a list of strings or a `~astropy.table.Column`
        List of data product IDs.

    Returns
    -------
    result : `~astropy.table.Table`
        A table where columns are header keywords and rows are product_ids.
    """
    from lxml import html
    _schema_product_ids = schema.Schema(schema.Or(Column, [six.string_types]))
    _schema_product_ids.validate(product_ids)
    # Get all headers
    result = []
    for dp_id in product_ids:
        response = self.request(
            "GET", "http://archive.eso.org/hdr?DpId={0}".format(dp_id))
        root = html.document_fromstring(response.content)
        hdr = root.xpath("//pre")[0].text
        header = {'DP.ID': dp_id}
        for key_value in hdr.split('\n'):
            if "=" in key_value:
                [key, value] = key_value.split('=', 1)
                key = key.strip()
                value = value.split('/', 1)[0].strip()
                if key[0:7] != "COMMENT":  # drop comments
                    if value == "T":  # Convert boolean T to True
                        value = True
                    elif value == "F":  # Convert boolean F to False
                        value = False
                    elif value[0] == "'":
                        # Convert to string, removing quotation marks
                        value = value[1:-1]
                    elif "." in value:  # Convert to float
                        value = float(value)
                    else:  # Convert to integer
                        value = int(value)
                    header[key] = value
            elif key_value.find("END") == 0:
                break
        result += [header]
    # Identify all columns
    columns = []
    column_types = []
    for header in result:
        for key in header.keys():
            if key not in columns:
                columns += [key]
                column_types += [type(header[key])]
    # Fill in all missing elements with the column type's default value
    for i in range(len(result)):
        for (column, column_type) in zip(columns, column_types):
            if column not in result[i]:
                result[i][column] = column_type()
    # Return as a Table
    return Table(result)
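# A self-contained sketch of the header parsing above: the archive serves the
# FITS header as plain "KEY = VALUE / comment" lines, which get coerced to
# Python bools, strings, floats, and ints. The header text below is made up.
hdr = """SIMPLE  =                    T / conforms to FITS standard
INSTRUME= 'UVES    '           / instrument name
EXPTIME =                 30.0 / exposure time (s)
NAXIS   =                    2 / number of axes
END"""

header = {}
for key_value in hdr.split('\n'):
    if "=" in key_value:
        key, value = key_value.split('=', 1)
        key = key.strip()
        value = value.split('/', 1)[0].strip()
        if value == "T":
            value = True
        elif value == "F":
            value = False
        elif value[0] == "'":
            value = value[1:-1]  # strip FITS string quotes
        elif "." in value:
            value = float(value)
        else:
            value = int(value)
        header[key] = value
    elif key_value.find("END") == 0:
        break
print(header)  # {'SIMPLE': True, 'INSTRUME': 'UVES    ', 'EXPTIME': 30.0, 'NAXIS': 2}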
def perform_url(self, url):
    """
    Process an article document designated by url.

    :param url: web-page url of the document
    """
    self.url = url
    self.title = self.image_url = self.language = self.description = \
        self.clean_html = self.error_msg = self._charset = None
    if not self.url:
        self.error_msg = 'Empty or null URL to perform'
        return
    # get the page (bytes)
    try:
        web_page = requests.get(self.url, headers=self._headers)
        # check the http status code
        if web_page.status_code not in [200, 301, 302]:
            self.error_msg = str('HTTP Error. Status: %s' % web_page.status_code)
            return
        self.url = web_page.url
        raw_html = web_page.content
        self._charset = get_encoding(raw_html)
        raw_html_str = raw_html.decode(self._charset)
        # getting and cleaning the document
        self._source_html = document_fromstring(raw_html_str)
        self._source_html = html_cleaner.clean_html(self._source_html)
        # making links absolute
        self._source_html.make_links_absolute(self.url, resolve_base_href=True)
    except Exception as e:  # includes ConnectionError and Timeout
        self.error_msg = str(e)
    if self.error_msg:
        return
    if self._source_html is not None:
        # obtaining the title
        self.title = shorten_title(self._source_html)
        # obtaining the image url
        self.image_url = get_image_url(self._source_html, self.url)
        if self.image_url is not None:
            image_url_node = "<meta itemprop=\"image\" content=\"%s\">" % self.image_url
            image_url_img = "<img src=\"%s\" />" % self.image_url
        else:
            image_url_node = image_url_img = ""
        # clean html
        self.clean_html = self._article_extractor.get_clean_html(
            source_html=self._source_html)
        # summarized description; requires clean_html
        if self.clean_html:
            self.description, self.language = get_plain_text(
                etree.XML(self.clean_html), self._summary_sentences_qty)
            description_node = ("<meta name=\"description\" content=\"%s\">"
                                % self.description) if self.description else ""
            # filling the template
            self.clean_html = ARTICLE_TEMPLATE % {
                'language': self.language,
                'title': self.title,
                'image_url_node': image_url_node,
                'image_url_img': image_url_img,
                'description_node': description_node,
                'clean_html': self.clean_html
            }
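# A self-contained sketch of the clean-and-absolutize steps above, using only
# lxml on a literal page (no network access). The base URL is hypothetical.
# Note: lxml's HTML cleaner lives in lxml.html.clean, which recent lxml
# releases ship as the separate lxml_html_clean package.
from lxml.html import document_fromstring
from lxml.html.clean import Cleaner

raw_html_str = ('<html><body><script>x()</script>'
                '<a href="/about">About</a></body></html>')
source_html = document_fromstring(raw_html_str)
# Strip scripts and other unwanted markup, as html_cleaner does above.
source_html = Cleaner(scripts=True, javascript=True).clean_html(source_html)
# Rewrite relative links against the page URL.
source_html.make_links_absolute('http://example.com/article', resolve_base_href=True)
print(source_html.xpath('//a/@href')[0])  # http://example.com/about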
def authenticate(self):
    config = ConfigParser.ConfigParser()
    config.read(TOKENS_FILE)
    if config.has_option("hubic", "refresh_token"):
        oauth_token = self._refresh_access_token()
    else:
        r = requests.get(
            OAUTH_ENDPOINT + 'auth/?client_id={0}&redirect_uri={1}'
            '&scope=credentials.r,account.r&response_type=code&state={2}'.format(
                quote(self._client_id),
                quote_plus(self._redirect_uri),
                pyrax.utils.random_ascii()  # presumably a CSRF token
            ),
            allow_redirects=False)
        if r.status_code != 200:
            raise exc.AuthenticationFailed(
                "Incorrect/unauthorized client_id (%s)" % str(self._parse_error(r)))
        # Prefer lxml for parsing the authorization form; fall back to a regex.
        try:
            from lxml import html as lxml_html
        except ImportError:
            lxml_html = None
        if lxml_html:
            oauth = lxml_html.document_fromstring(
                r.content).xpath('//input[@name="oauth"]')
            oauth = oauth[0].value if oauth else None
        else:
            oauth = re.search(
                r'<input\s+[^>]*name=[\'"]?oauth[\'"]?\s+[^>]*value=[\'"]?(\d+)[\'"]?>',
                r.content)
            oauth = oauth.group(1) if oauth else None
        if not oauth:
            raise exc.AuthenticationFailed(
                "Unable to get oauth_id from authorization page")
        if self._email is None or self._password is None:
            raise exc.AuthenticationFailed(
                "Cannot retrieve email and/or password. "
                "Please run expresslane-hubic-setup.sh")
        r = requests.post(
            OAUTH_ENDPOINT + 'auth/',
            data={
                'action': 'accepted',
                'oauth': oauth,
                'login': self._email,
                'user_pwd': self._password,
                'account': 'r',
                'credentials': 'r',
            },
            allow_redirects=False)
        try:
            query = urlparse.urlsplit(r.headers['location']).query
            code = dict(urlparse.parse_qsl(query))['code']
        except KeyError:
            raise exc.AuthenticationFailed(
                "Unable to authorize client_id, invalid login/password ?")
        oauth_token = self._get_access_token(code)
    if oauth_token['token_type'].lower() != 'bearer':
        raise exc.AuthenticationFailed("Unsupported access token type")
    r = requests.get(
        API_ENDPOINT + 'account/credentials',
        auth=BearerTokenAuth(oauth_token['access_token']),
    )
    swift_token = r.json()
    self.authenticated = True
    self.token = swift_token['token']
    self.expires = swift_token['expires']
    self.services['object_store'] = Service(self, {
        'name': 'HubiC',
        'type': 'cloudfiles',
        'endpoints': [{'public_url': swift_token['endpoint']}]
    })
    self.username = self.password = None
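# A self-contained sketch of the hidden-input extraction above: read the
# value of <input name="oauth"> with lxml when available, otherwise with a
# regex. The form markup is a hypothetical stand-in for the auth page.
import re

content = '<form><input type="hidden" name="oauth" value="123456"></form>'
try:
    from lxml import html as lxml_html
except ImportError:
    lxml_html = None

if lxml_html:
    inputs = lxml_html.document_fromstring(content).xpath('//input[@name="oauth"]')
    oauth = inputs[0].value if inputs else None
else:
    m = re.search(
        r'<input\s+[^>]*name=[\'"]?oauth[\'"]?\s+[^>]*value=[\'"]?(\d+)[\'"]?',
        content)
    oauth = m.group(1) if m else None
print(oauth)  # 123456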
def search_data(uid):
    def addMultiValues(dataNode, xpathTuple):
        # Join the text of all nodes matched by any of the given xpaths,
        # skipping "..." placeholders.
        temp_list = []
        for xpathString in xpathTuple:
            if len(dataNode) and len(dataNode.xpath(xpathString)):
                for node in dataNode.xpath(xpathString):
                    if node.text != "...":
                        temp_list.append(node.text)
        return ",".join(temp_list)

    try:
        filmdata = {
            'title': '',
            'countries': '',
            'year': '',
            'directors': '',
            'cast': '',
            'genre': '',
            'duplicate': '',
            'user_rating': '',
            'rating_count': '',
            'movie_rating': '',
            'plot': '',
            'runtime': ''
            # 'url': '',
            # 'coverart': '',
            # 'fanart': ''
        }
        data = get_page("/level/1/film/" + uid, 1)
        doc = html.document_fromstring(data)

        titleNodes = doc.xpath("//h1[@class='moviename-big']")
        if len(titleNodes):
            try:
                filmdata['title'] = titleNodes[0].text.strip()
            except Exception:
                filmdata['title'] = ''

        userRatingNodes = doc.xpath("//div[@id='block_rating']/div/div/a/span")
        if len(userRatingNodes):
            try:
                filmdata['user_rating'] = userRatingNodes[0].text
            except Exception:
                filmdata['user_rating'] = ''

        countRatingNodes = doc.xpath("//span[@class='ratingCount']")
        if len(countRatingNodes):
            try:
                filmdata['rating_count'] = normilize_string(countRatingNodes[0].text)
            except Exception:
                filmdata['rating_count'] = ''

        infoNodes = doc.xpath("//table[@class='info']/*")
        for infoNode in infoNodes:
            dataNodes = infoNode.xpath("td")
            if dataNodes[0].text == u"год":
                try:
                    # NOTE: this xpath starts with //, so it searches the
                    # whole document rather than just this cell.
                    filmdata['year'] = dataNodes[0].xpath(
                        "//table[@class='info']//td//div//a/text()")[0]
                except Exception:
                    filmdata['year'] = ''
            elif dataNodes[0].text == u"страна":
                try:
                    filmdata['countries'] = addMultiValues(dataNodes[1], ("div/a", "/a"))
                except Exception:
                    filmdata['countries'] = ''
            elif dataNodes[0].text == u"режиссер":
                try:
                    filmdata['directors'] = addMultiValues(dataNodes[1], ("a", "/a"))
                except Exception:
                    filmdata['directors'] = ''
            elif dataNodes[0].text == u"жанр":
                try:
                    film_data = addMultiValues(dataNodes[1], ("span/a", "/a"))
                    filmdata['genre'] = film_data.replace('музыка', '').replace('слова', '')
                except Exception:
                    filmdata['genre'] = ''
            elif dataNodes[0].text == u"время":
                try:
                    filmdata['runtime'] = dataNodes[1].text.split()[0]
                except Exception:
                    filmdata['runtime'] = ''
            elif dataNodes[0].text == u"рейтинг MPAA":
                try:
                    filmdata['movie_rating'] = dataNodes[1].xpath(
                        "a")[0].attrib["href"].split("/")[-2]
                except Exception:
                    filmdata['movie_rating'] = ''

        actorNodes = doc.xpath("//div[@id='actorList']/ul")
        if len(actorNodes):
            try:
                filmdata['cast'] = addMultiValues(actorNodes[0], ("a", "li/a"))
            except Exception:
                filmdata['cast'] = ''

        # The dubbing cast lives in the second actor list, when present.
        duplicatedNodes = doc.xpath("//div[@id='actorList']/ul")
        if len(duplicatedNodes) > 1:
            try:
                duplicated = addMultiValues(duplicatedNodes[1], ("a", "li/a"))
                filmdata['duplicate'] = duplicated.replace(
                    'показать всех', '').replace('»', '')
            except Exception:
                filmdata['duplicate'] = ''

        descNodes = doc.xpath(
            "//div[@class='brand_words film-synopsys' or @class='brand_words']")
        if len(descNodes):
            try:
                filmdata['plot'] = normilize_string(descNodes[0].text)
            except Exception:
                filmdata['plot'] = ''

        # posters = search_poster(uid)
        # if len(posters):
        #     try:
        #         filmdata['coverart'] = posters[0]
        #     except Exception:
        #         filmdata['coverart'] = ''
        # fanarts = search_fanart(uid)
        # if len(fanarts):
        #     try:
        #         filmdata['fanart'] = fanarts[0]
        #     except Exception:
        #         filmdata['fanart'] = ''
        # filmdata['url'] = "http://www.kinopoisk.ru/level/1/film/" + uid
        # print("""\
        #     Title:%(title)s
        #     Year:%(year)s
        #     Director:%(directors)s
        #     Plot:%(plot)s
        #     UserRating:%(user_rating)s
        #     RatingCount:%(rating_count)s
        #     Cast:%(cast)s
        #     Duplicate:%(duplicate)s
        #     Genres:%(genre)s
        #     Countries:%(countries)s
        #     Runtime:%(runtime)s
        #     MovieRating:%(movie_rating)s
        #     Coverart:%(coverart)s
        #     Fanart:%(fanart)s
        #     URL:%(url)s
        #     """ % filmdata)
        return filmdata
    except Exception:
        print_exception(traceback.format_exc())
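# A self-contained sketch of the addMultiValues pattern above: collect the
# text of every node matched by any of several candidate xpaths, skip "..."
# placeholders, and join with commas. The markup is made up.
from lxml import html

node = html.fromstring(
    '<ul><li><a>Actor One</a></li><li><a>Actor Two</a></li><li><a>...</a></li></ul>')

def add_multi_values(data_node, xpath_tuple):
    texts = []
    for xpath_string in xpath_tuple:
        for match in data_node.xpath(xpath_string):
            if match.text != "...":
                texts.append(match.text)
    return ",".join(texts)

print(add_multi_values(node, ("a", "li/a")))  # Actor One,Actor Two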