def rando():
    # Clip off the last semicolon, then split up the separate queries.
    reqs = request.query_string.decode('utf-8')[:-1].split(";")
    query = []
    # Make a list of queries to Quinterest.
    for r in reqs:
        query.append(formatreq(r))
    if len(query) > 25:
        query = query[:25]
    questions = []
    for q in query:
        out = get("http://quinterest.org{}".format(q)).text
        out = BeautifulSoup(out, 'html.parser').find_all(attrs={"class": "row"})
        out.pop(0)
        for e in out:
            # Insert the query and a replace button at the end of the question.
            querystr = q[23:]
            querystr = sub('amount=[0-9]+', 'amount=1', querystr)
            span = BeautifulSoup(
                '<span class="subjTag" style="display:none"></span>',
                'html.parser').span
            repbutton = BeautifulSoup(
                '<button class="btn repbutton" onclick="replaceQuestion($(this))">Replace This Question</button>',
                'html.parser').button
            span.string = querystr
            e.div.append(span)
            e.div.append(repbutton)
            questions.append(str(e))
    questions = processQuestions(questions)
    return "<br>".join(questions)
def CleanFile(document):
    regex = re.compile(r'\d{2}/\d{2}/\d{4},.\d{2}:\d{2}')
    CleanStartTime = time.time()
    cleaned = BeautifulSoup(document, "lxml").get_text()
    cleaned = cleaned.split(" ")
    if len(cleaned) < 290:
        print("Wrong type of file, please choose a Facebook Messenger history file.")
        quit()
    else:
        for i in range(290):
            cleaned.pop(0)
    cleaned = " ".join(cleaned)
    cleanedwregex = re.split(regex, cleaned)
    listofdates = re.findall(regex, cleaned)
    CleanEndTime = time.time()
    print("HTML cleaned in " + str("%.2f" % (CleanEndTime - CleanStartTime)) + " seconds")
    PrintStartTime = time.time()
    gucciString = ""
    for i in range(len(cleanedwregex) - 1):
        gucciString += listofdates[i] + " | " + cleanedwregex[i] + "\n"
    print(gucciString)
    PrintEndTime = time.time()
    print("Printed to console in " + str("%.2f" % (PrintEndTime - PrintStartTime)) + " seconds")
    return gucciString
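# A minimal usage sketch for CleanFile (hedged: 'messages.html' is a
# hypothetical Messenger export file name; assumes `re`, `time` and
# `BeautifulSoup` are imported, as the function above requires).
if __name__ == "__main__":
    with open("messages.html", encoding="utf-8") as f:
        chat_log = CleanFile(f.read())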
def getMoodle(self, name, id, paths, scriptDesc, sheetDesc):
    print("Checke %s" % (name))
    loginurl = 'https://elearning2.uni-heidelberg.de/login/index.php'
    loginpayload = self.moodlePayload
    url = 'https://elearning2.uni-heidelberg.de/course/view.php?id=%s' % (id)
    scriptPath = paths['scripts']
    exercisePath = paths['sheets']
    miscPath = paths['misc']
    self.s.post(loginurl, loginpayload)
    weeks = BeautifulSoup(self.s.get(url).content, "html.parser").find(
        "ul", {"class": "weeks"}).findAll("div", {"class": "content"})
    weeks.pop(0)
    for section in weeks:
        for part in section.findAll("li", {"class": "activity resource modtype_resource "}):
            text = part.find("span", {"class": "instancename"}).text
            if text == "":
                break
            if text.endswith(" Datei"):
                text = text[:-6]
            r = self.s.get(part.find("a").get("href"))
            if text.startswith(scriptDesc):
                self.download_pdf(r.url, scriptPath, text)
            elif text.startswith(sheetDesc):
                self.download_pdf(r.url, exercisePath, text)
            else:
                self.download_pdf(r.url, miscPath, text)
def parse_items(self, page):
    match = re.findall('</table>(<table.+?<th>Item.+?</table>)', page, flags=re.S)
    if not match:
        return None
    rows = BeautifulSoup(match[0], 'html.parser').find_all('tr')
    rows.pop(0)  # first header row
    rows.pop(0)  # second header row
    i = ""
    while rows:
        data = rows.pop(0).find_all('td')
        item_number = int(data[0].get_text(strip=True))
        item_description = data[1].get_text(' ', strip=True)
        item_catalog_family = data[2].get_text(' ', strip=True)
        item_quantity = float(data[3].get_text(strip=True).replace(',', ''))
        item_uom = data[4].get_text(' ', strip=True)
        try:
            item_unit_price = float(data[5].get_text(strip=True).replace(',', ''))
        except ValueError:
            item_unit_price = None
        i += item_description + " " + item_catalog_family + "\n"
        self.items.append(dict(item_number=item_number,
                               item_description=item_description,
                               item_catalog_family=item_catalog_family,
                               item_quantity=item_quantity,
                               item_uom=item_uom,
                               item_unit_price=item_unit_price))
        if rows:  # sometimes item tables don't have a hanging last row
            rows.pop(0)  # every second row is useless
    # print items
    self.items_text += i
def get_options_dates():
    html = get_data('http://www.nseindia.com/live_market/dynaContent/live_watch/fxTracker/optChainDataByExpDates.jsp?symbol=USDINR&instrument=OPTCUR')
    optDates = SoupStrainer('select', {'id': 'expirydate'})
    data = BeautifulSoup(html, 'html.parser', parse_only=optDates).find_all("option")
    data.pop(0)  # drop the placeholder first <option>
    dates = []
    for i in data:
        dates.append(i.get_text())
    return json.dumps({'options_dates': dates})
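# Usage sketch (hedged: assumes `get_data` fetches the page HTML and that
# `json`, `BeautifulSoup` and `SoupStrainer` are imported as above).
if __name__ == "__main__":
    expiry_dates = json.loads(get_options_dates())["options_dates"]
    print(expiry_dates)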
def _handle_screening_questions(self, answer_questions: bool,
                                collect_q_and_a: bool, wait=10) -> None:
    for _ in range(10):
        try:
            self._select_resume()
            if collect_q_and_a or answer_questions:
                questions = BeautifulSoup(
                    self._browser.page_source,
                    'lxml').findAll(class_=compile_regex('Questions'))
                if questions:
                    questions.pop(0)
                    for div in questions:
                        labels = div.findAll('label')
                        if not labels:
                            self._select_continue(wait)
                            continue
                        question_found = labels.pop(0).get_text()\
                            .replace('(optional)', '').strip()
                        if not question_found:
                            self._select_continue(wait)
                            continue
                        select = div.findAll('select')
                        if select:
                            for element in select:
                                labels = element.findAll('option')
                                answers_found = self._get_answers_set(labels)
                                if not answers_found:
                                    self._select_continue(wait)
                                    break
                                if answer_questions:
                                    self._answer_question(
                                        div, question_found, answers_found)
                        else:
                            answers_found = self._get_answers_set(labels)
                            if not answers_found:
                                self._select_continue(wait)
                            if answer_questions:
                                self._answer_question(
                                    div, question_found, answers_found)
                        if collect_q_and_a:
                            if question_found in self._q_and_a:
                                self._q_and_a[question_found].update(
                                    answers_found)
                            else:
                                self._q_and_a[question_found] = answers_found
            self._select_continue(wait)
        except TimeoutException:
            break
        except NoSuchElementException:
            print('NoSuchElementException encountered!')
            break
    return None
def all_url(self, url):
    html = requests.get(url)
    all_a = BeautifulSoup(html.text, 'lxml').find('div', class_='all').find_all('a')
    all_a.pop(0)
    for a in all_a:
        title = a.get_text()
        print('开始保存:', title)
        path = str(title).replace('?', '_')
        self.mkdir(path)
        href = a['href']
        self.html(href)
def download_table(table_url):
    """Returns a BeautifulSoup ResultSet with all tables from the WikiCommons
    Hanzi decomposition project. Each table is contained in a <pre> tag."""
    # get a list of all <pre> elements
    print('Downloading from {}...'.format(table_url), end='')
    decomp_html = download(table_url).text
    print('Done.')
    decomp_soup = BeautifulSoup(decomp_html, 'html.parser').find_all('pre')
    # remove the first part, which describes the table
    if re.search(r'1\.[^2]+2\.', decomp_soup[0].string, re.DOTALL):
        decomp_soup.pop(0)
    return decomp_soup
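# Usage sketch (hedged: the URL is a placeholder for the WikiCommons
# decomposition page, and `download` is assumed to behave like requests.get).
if __name__ == "__main__":
    tables = download_table("https://commons.wikimedia.org/wiki/Commons:Chinese_characters_decomposition")
    print(len(tables), "tables downloaded")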
def all_url(self, url):
    html = request.get(url, 3)
    # html = Download.get(self, url, 3)
    all_a = BeautifulSoup(html.text, 'lxml').find('div', class_='all').find_all('a')
    all_a.pop(0)
    for a in all_a:
        title = a.get_text()
        print(u'开始保存: ', title)
        path = str(title).replace("?", '_')
        self.mkdirs(path)
        href = a['href']
        self.html(href)
def all_url(self, url):
    # Call the request function with the album URL; it returns a response.
    html = self.request(url)
    all_a = BeautifulSoup(html.text, 'lxml').find('div', class_='all').find_all('a')
    # The page changed and now has an extra early-pictures entry, which has to
    # be removed (you can try handling that page yourself).
    all_a.pop(0)  # the line above removes the first element of the list
    for a in all_a:
        title = a.get_text()
        print(u'开始保存:', title)  # print some progress, otherwise it is too dull
        # Some titles contain a "?", which Windows does not allow in folder
        # names, so replace it.
        path = str(title).replace("?", '_')
        self.mkdir(path)  # call mkdir to create the folder; note that path is the title!
        href = a['href']
        self.html(href)  # call the html function with href; remember, href is the album's address!
def upcoming():
    # curl -d ajax=true -d mod=queue http://www.animenfo.com/radio/nowplaying.php
    page = requests.post(API_URL, data={"ajax": "true", "mod": "queue", "togglefull": "true"})
    results = BeautifulSoup(page.text).findAll("tr")
    results.pop()
    songs = []
    for row in results:
        row = "".join(row.findAll(text=True))
        if row.strip() == "":
            continue
        row = BeautifulStoneSoup(row, convertEntities=BeautifulStoneSoup.HTML_ENTITIES)
        row = row.__str__().strip()
        row = re.sub(r"\s+", " ", row)
        songs.append(row)
    return songs
def parse_actions(self, page):
    match = re.findall('<th>Otras Acciones de la Convocatoria(.*?)</table>', page, flags=re.S)
    if not match:
        return None
    self.action_html = match[0]
    rows = BeautifulSoup(match[0], 'html.parser').find_all('td')
    self.actions = []
    self.last_action = ''
    self.last_action_on = None
    n = 0
    while rows:
        n += 1
        row = rows.pop(0)
        if row.span:
            row.span.extract()
        a = row.u.extract().get_text() if row.u else None
        row_text = row.get_text(' ', strip=True)
        row_text = re.sub('informado el d.a', '', row_text)
        row_text = re.sub(r'el (\d+/\d+/\d+) a las (\d+:\d+)', r'\1 \2', row_text)
        match = re.findall(r'(.*?)(\d+/\d+/\d\d\d\d \d+:\d+)(.*)', row_text, flags=re.S)
        if match:
            try:
                d = datetime.datetime.strptime(match[0][1], '%d/%m/%Y %H:%M')
            except ValueError:
                d = None
            t = (match[0][0].strip(' ,') + ' ' + match[0][2].strip(' ,')).strip()
            if not self.last_action_on or d > self.last_action_on:
                self.last_action_on = d
                self.last_action = a + " " + t
            self.actions.append(dict(action_number=n, action_date=d, action_name=a, action_text=t))
        else:
            match = re.findall(r'(NOTIFICACI.N ELECTR.NICA) (.*)', row_text)
            if match:
                d = None
                try:
                    # print "parse_actions(): row.a['onclick'] =", row.a['onclick'], "type", type(row.a['onclick'])
                    # print "parse_actions(): entering match"
                    m = re.search(r'(\d+).+?(\d+)', row.a['onclick'])
                    # print "parse_actions(): after match"
                    # print "Found notificacion with", m.group(1), "and", m.group(2)
                    uri = SEACE.build_notificacion_uri(m.group(1), m.group(2))
                    # print "parse_actions(): get uri", uri
                    doc = SEACE.get_page(uri)
                    if doc:
                        ds = re.search(r'(\d+/\d+/\d+ \d+:\d+)', doc)
                        # print "parse_actions(): matched", ds.group(1)
                        if ds:
                            d = datetime.datetime.strptime(ds.group(1), '%d/%m/%Y %H:%M')
                except Exception:
                    pass
                a = match[0][0]
                t = match[0][1]
                if not self.last_action_on or d > self.last_action_on:
                    self.last_action_on = d
                    self.last_action = a + " " + t
                self.actions.append(dict(action_number=n, action_date=d, action_name=a, action_text=t))
            else:
                self.actions.append(dict(action_number=n, action_date=None, action_name=a, action_text=row_text))
def all_url(self, url):
    # html = self.request(url)
    html = request.get(url, 3)
    all_a = BeautifulSoup(html.text, 'lxml').find('div', attrs={'class': 'all'}).find_all('a')
    # The page gained an extra early-pictures entry, so remove it.
    all_a.pop(0)
    for a in all_a:
        title = a.get_text()
        print('开始保存: ', title)
        self.title = title
        path = str(title).replace(":", "")
        self.mkdir(path)
        href = a['href']
        self.url = href  # save the page address in self.url
        if self.meizitu_collection.find_one({'主题页面': href}):
            print(u'这个页面已经爬取过了')
        else:
            self.html(href)
def list_of_study(self) -> List[List[str]]:
    '''Fetch information about each project.

    The output takes the following form:
    --------------------------------------
    SRPxxxxxx PRJNAxxxxxxx
    Short introduction about this project!
    Abstract: about this project!
    --------------------------------------

    Returns
    -------
    List[List[str]]
        Each project's information is held as strings in the format above
        and makes up one element of the returned list.
        Note: the list covers every project found via self._bio_pjt_url.
    '''
    ht = requests.get(self._bio_pjt_url)
    soup = BeautifulSoup(ht.content, "html.parser").find_all("tr")
    soup.pop(0)  # skip header
    srp = ""    # type: str
    bpjid = ""  # type: str
    title = ""  # type: str
    abst = ""   # type: str
    studies = []  # type: List[List[str]]
    for i in soup:
        title = (i.find_all("td")[2]).text.strip()
        href = i.find("a").get("href")  # href = "?study=SRP~"
        srp = href.split('=')[1]
        bpj = BioProject(srp)  # type: BioProject
        abst = bpj.abstract()
        bpjid = bpj.bioproject_id()
        studies.append([srp, bpjid, title, abst])
    return studies
def get_uzh_menu():
    # The UZH URL actually has the weekday in German.
    locale.setlocale(locale.LC_ALL, "de_CH.utf-8")
    curr_day = str(calendar.day_name[(int(NOW.strftime("%w")) + 6) % 7]).lower()
    if is_lunchtime():
        url = "http://www.mensa.uzh.ch/de/menueplaene/zentrum-mensa/{}.html"
    else:
        url = "http://www.mensa.uzh.ch/de/menueplaene/zentrum-mercato-abend/{}.html"
    r = requests.get(url.format(curr_day))
    if UZH_MENSA_NOMEAL_STR not in r.text:
        return "*Cheap mensa:*\nNo UZH menu available for this day!\n\n"
    menu_div = BeautifulSoup(r.text, "html.parser").findAll("div", {"class": "text-basics"})
    menu_div.pop(0)
    return "*Cheap mensa:*\n" + uzh_parse_table(menu_div)
def get_recipe_list_by_ids(ids):
    """Return a list of dict objects, each a drink recipe with its details."""
    recipes = []
    for drink_id in ids:
        recipe = urlopen(id_lookup_page + str(drink_id))
        recipe = BeautifulSoup(recipe, "html5lib")
        recipe = loads(recipe.body.string)['drinks'][0]
        print("scraped recipe for drink: " + recipe['strDrink'])
        # remove unnecessary keys
        recipe.pop('strVideo')
        recipe.pop('strIBA')
        recipe.pop("strGlass")
        # iterate over a shallow copy because you can't delete while iterating
        for key, val in list(recipe.items()):
            if ("Ingredient" in key or "Measure" in key) and recipe[key] == "":
                # delete empty entries
                del recipe[key]
        recipes.append(recipe)
    return recipes
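# Usage sketch (hedged: assumes `id_lookup_page` points at a JSON lookup
# endpoint as the function above expects, and that `urlopen`, `loads` and
# `BeautifulSoup` are imported; the drink id 11007 is only an example).
if __name__ == "__main__":
    for drink in get_recipe_list_by_ids([11007]):
        print(drink["strDrink"])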
def get_submits(problem_code, contest_code, page_no="1"):
    # Returns the submissions as a list of dicts with the relevant data.
    site_root = "https://www.codechef.com"
    try:
        print_info("Trying connection")
        raw = urllib.request.urlopen(site_root + "/ssubmission/prob?page=" + page_no
                                     + "&pcode=" + problem_code
                                     + "&ccode=" + contest_code).read()
    except urllib.error.HTTPError as e:
        # We retry forever in case of a 503.
        print_error("HTTPError encountered with status code " + str(e.code))
        if e.code == 503:
            return get_submits(problem_code, contest_code, page_no)
        else:
            print_error("Check problem_code, contest_code or your internet connection")
            return {}
    print_info("Connection successful")
    raw = raw.decode('utf-8')
    submission_html = json.loads(raw)["content"]
    submit_list = BeautifulSoup(submission_html, "html.parser").findAll("tr")
    submit_list.pop(0)  # drop the header row
    final_data = []
    for sub in submit_list:
        sub_dict = {}
        sub_tag = sub.findAll("td")
        if len(sub_tag) != 5:
            print_error("Check problem_code, contest_code or your internet connection")
            break
        sub_dict["user"] = sub_tag[0].text
        sub_dict["score/time"] = sub_tag[1].text
        sub_dict["mem"] = sub_tag[2].text
        sub_dict["lang"] = sub_tag[3].text
        sub_dict["solution"] = site_root + sub_tag[4].findAll("a")[0]["href"]
        final_data.append(sub_dict)
    return final_data
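# Usage sketch (hedged: 'TEST' and 'PRACTICE' are placeholder problem and
# contest codes; assumes urllib, json, BeautifulSoup, print_info and
# print_error are available as the function above requires).
if __name__ == "__main__":
    for submit in get_submits("TEST", "PRACTICE"):
        print(submit["user"], submit["lang"], submit["solution"])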
def renew_proxy_list():
    proxies_page = requests.get('https://free-proxy-list.net/').text
    trs_list = BeautifulSoup(proxies_page, 'lxml').find(
        'table', id='proxylisttable').find('tbody').find_all('tr')
    proxies_list = ""
    for i in range(10):
        ip = trs_list.pop(0).find('td')
        port = ip.find_next_sibling()
        proxy = 'http://' + ip.text + ':' + port.text + '\n'
        proxies_list += proxy
    proxies_list = proxies_list[:-1]  # strip the trailing newline
    with open('proxies.txt', 'w') as proxies:
        proxies.write(proxies_list)
def parse_calendar(self, page):
    # print "Parsing calendar...",
    self.events = []
    match = re.findall(r'colspan=3>Calendario\s+(.+?)</table>', page, flags=re.S)
    if not match:
        # print "not found."
        return False
    rows = BeautifulSoup(match[0], 'html.parser').find_all('tr')
    headers = rows.pop(0).find_all('th')
    headers.pop(0)  # no use for 1st column 1st row
    for r in rows:
        rowdata = r.find_all('td')
        tipo = rowdata.pop(0).string  # 1st column of row contains event type
        for h in headers:
            hito = re.sub("Fecha ", "", h.string)
            match = re.search(r'(?P<fecha>\d+/\d+/\d+ \d+:\d+)', rowdata.pop(0).string)
            fecha = datetime.datetime.strptime(match.group('fecha'), "%d/%m/%Y %H:%M") if match else None
            self.events.append(dict(tipo=tipo, hito=hito, fecha=fecha))
    return True
def main(date):
    dishes = []
    # Note: only html5lib can fix this broken HTML code.
    menus = BeautifulSoup(get_website(), 'html5lib').find_all('table', class_='Liste')
    days = BeautifulSoup(get_website(), 'html5lib').find_all('div', class_='KopfLeiste_o')
    if len(menus) != len(days):
        return ['Leider verstehe ich den Speiseplan heute nicht.']
    date = date.replace(datetime.datetime.now().strftime('%d.%m.%Y'), 'Heute')
    date = date.replace((datetime.date.today() + datetime.timedelta(days=1)).strftime('%d.%m.%Y'), 'Morgen')
    for day in days:
        m = menus.pop(0)
        if date in day.find('div').string:
            for dish in m.find_all('tr'):
                if dish.find(class_='Speise'):
                    title = dish.find('td', class_='Speise').text.strip()
                    price = dish.find('td', class_='PreisG').text.strip()
                    # annotation = dish.find('td', class_='Nr').text.strip()
                    annotation = MEAT  # default
                    # Is it a well-balanced meal, recommended for healthy eating?
                    if str(dish.find('td', class_='PreisG')).find("apfel-klein.png") != -1:
                        annotation = "%s%s" % (annotation, WELLBALANCEDMEAL)
                    this_dish = '%s %s: *%s*' % (annotation, title, price)
                    dishes.append(this_dish)
    return beautify_menu(dishes) or ['Leider kenne ich keinen Speiseplan für diesen Tag.']
def get_corp_name(self, response):
    # print(response.text)
    tables = BeautifulSoup(response.text, 'lxml').find_all("table", background='images/zj08.gif')
    tr = tables.pop()
    # for td in tds:
    teams = tr.find_all("tr")
    for team in teams:
        # print(team)
        houseDeal = team.find_all("td")
        # for deal in houseDeal:
        #     print(deal.text)
        item = Xinsa1Item()
        item['corp_name'] = houseDeal[0].text.strip()
        item['book_num'] = houseDeal[1].text
        item['order_num'] = houseDeal[2].text
        item['amount'] = houseDeal[3].text
        item['ts'] = houseDeal[4].text
        print(item)
        yield item
def getDom(pageurl, charset):
    if charset is None:
        charset = 'utf-8'
    soup = BeautifulSoup(pageurl, 'html.parser', from_encoding=charset)
    # Remove head, img, script, style and input tags.
    for body in soup(['head', 'img', 'script', 'style', 'input']):
        body.extract()
    # Remove comments.
    for element in soup(text=lambda text: isinstance(text, Comment)):
        element.extract()
    # Extract the text from the soup into a list of tokens.
    soup = soup.text.strip().split()
    # Get the current date and time so that stray occurrences of the current
    # system time can be removed from the token list.
    currentDate = time.strftime('%Y-%m-%d', time.localtime(time.time()))
    # Normalize the date format, e.g. 2017-04-20 -> 2017-4-20.
    currentDate1 = currentDate[0:5] + currentDate[6:]
    currentTime = time.strftime('%H:%M', time.localtime(time.time()))
    # Trim the time, e.g. 23:58 -> 23:5, to tolerate the clock advancing
    # while the program runs.
    currentTime1 = currentTime[0:4]
    # Patterns for removing noise from the token list:
    # - everything from a 'Copyright' string onwards,
    # - invalid years such as "1999" and ranges such as "2001-2007",
    # - "last login"-style timestamps such as "最后登录:2017-04-20 23:55",
    # - occurrences of the current system date and time.
    re0 = re.compile(r'.*Copyright.*')
    re1 = re.compile(r'.*((19\d{2}\D)|(\d{4}-\d{4}\D)).*')
    re2 = re.compile(r'(^|.*)注册.*(\d{2,4}(-|/))?\d{1,2}(-|/)\d{1,2}$')
    re3 = re.compile(r'.*(' + currentDate1 + '|' + currentDate + ').*')
    re4 = re.compile(r'.*' + currentTime1 + r'\d.*')
    re5 = re.compile(r'^最后.*(\d{2,4}(-|/))?\d{1,2}(-|/)\d{1,2}.*')
    # Note: the loop below mutates `soup` while iterating over it.
    for item in soup:
        # Remove tokens that end with a half- or full-width colon.
        if item.endswith(":"):
            soup.pop(soup.index(item))
        if item.endswith("："):
            soup.pop(soup.index(item))
        # Remove "|", ">", "»" and "›" separator tokens.
        if '|' in soup:
            soup.pop(soup.index('|'))
        if '>' in soup:
            soup.pop(soup.index('>'))
        if '»' in soup:
            soup.pop(soup.index('»'))
        if '›' in soup:
            soup.pop(soup.index('›'))
        # Drop the copyright notice and everything after it.
        if re0.match(item):
            CopyrightIndex = soup.index(item) - 5
            while CopyrightIndex <= len(soup) - 1:
                soup.pop(CopyrightIndex)
        # Remove irregular dates.
        if re1.match(item):
            timeIndex = soup.index(item)
            soup.pop(timeIndex)
        if re2.match(item):
            timeIndex = soup.index(item)
            soup.pop(timeIndex)
        # Remove the current system date and time.
        if re3.match(item):
            if item in soup:
                if re4.match(soup[soup.index(item) + 1]):
                    soup.pop(soup.index(item) + 1)  # drop the current time
                    soup.pop(soup.index(item))      # drop the current date
        if re5.match(item):
            if item in soup:
                timeIndex = soup.index(item)
                soup.pop(timeIndex)
    # Return the preprocessed token list.
    return soup
            'User-Agent': 'Mozilla/5.0',
            'Authorization': "Bearer {0}".format(token)
        }).text
    except Exception:
        continue
    # Create a BeautifulSoup object from the retrieved HTML, then use
    # find_all to get the result set.
    listings = BeautifulSoup(
        response, 'html.parser',
        parse_only=SoupStrainer("script", attrs={'type': 'text/javascript'})
    ).find_all("script")
    if listings:
        product_listings = []
        listings.pop(0)
        for listing in listings:
            try:
                result = listing.contents[0].split('\r\n')
                this_listing = {}
                # The string manipulation of these items assumes a standard
                # format where the desired item appears after a colon and is
                # formatted as "<desired item>". html.unescape takes care of
                # escape sequences; however, since the content is in string
                # format it leaves behind the leading \\, so this also assumes
                # that no string purposefully contains \\ and removes all
                # instances of \\ from strings.
                for item in result:
                    if item.find('"set_name":') > 0:
                        this_listing['set_name'] = html.unescape(
                            item.strip().split(':')[1].strip()[1:-2]).replace('\\', '')
                    elif item.find('"price":') > 0:
                        this_listing['price'] = float(
        driver.switch_to.alert.accept()
    except Exception:
        continue

ActionChains(driver).move_by_offset(680, 290).click().perform()
ActionChains(driver).move_by_offset(740, 500).click().perform()
driver.get(
    driver.find_element_by_xpath(
        r'//*[@id="headDiv"]/ul/li[5]/ul/li[6]/a').get_attribute("href"))
driver.find_element_by_xpath(r'/html/body/h2/a').click()
Select(driver.find_element_by_xpath(r'//*[@id="ddlXN"]')).select_by_value("2018-2019")
soup = BeautifulSoup(driver.page_source, "html.parser")
tbody = soup.find_all("tbody")
raw_items = BeautifulSoup(str(tbody[0]), "html.parser").find_all("tr")
raw_items.pop(0)  # drop the header row
with open("ahu.json", "w", encoding="utf-8") as f:
    for item in raw_items:
        sp = BeautifulSoup(str(item), "html.parser")
        tds = sp.find_all("td")
        course_code = str(tds[1])
        course_code = course_code[4:11]
        course_name = str(tds[2])
        course_name = course_name[findSubstring(course_name, ">", 2) + 1:-9]
        credit = str(tds[6])
        credit = credit[findSubstring(credit, ">", 1) + 1:-5]
        teaching_time = str(tds[8]).replace('\n', '').replace(' ', '')
        teaching_time = teaching_time[teaching_time.find("周"):-12]
        f.write(r'{"course code":"' + course_code + r'","course name":"' + course_name
                + r'","credit":"' + credit + r'","teaching time":"' + teaching_time
                + r'"}' + '\n')
async def on_message(message):
    if message.content.startswith("!cwk"):
        channel = client.get_channel(message.channel)
        global number
        try:
            message.content = int(message.content[4:])
        except ValueError:
            await message.channel.send("数値ではないので回数変更できませんっ!")
            return
        if message.content >= 1001:
            await message.channel.send("1000以上の数値をやらせようとしないでくださいっ!")
            return
        else:
            number = message.content
            await message.channel.send("叩き起こすメンションの回数を" + str(number) + "に変更しましたっ!")
            return
    if message.content.startswith("!awk"):
        if "モデレーターさん" in [users_role.name for users_role in message.author.roles]:
            conf_on()
            await message.channel.send("Wakeup可能ですっ!")
    if message.content.startswith("!dwk"):
        if "モデレーターさん" in [users_role.name for users_role in message.author.roles]:
            conf_off()
            await message.channel.send("Wakeup無効ですっ!")
    if message.content.startswith("!whichwk"):
        if "on" in wu:
            await message.channel.send("Wakeup可能ですっ!")
        else:
            await message.channel.send("Wakeup無効ですっ!")
    if message.content.startswith('whoami'):
        channel = client.get_channel(message.channel)
        llip = ([
            l for l in (
                [ip for ip in socket.gethostbyname_ex(socket.gethostname())[2]
                 if not ip.startswith("127.")][:1],
                [[(s.connect(('8.8.8.8', 53)), s.getsockname()[0], s.close())
                  for s in [socket.socket(socket.AF_INET, socket.SOCK_DGRAM)]][0][1]])
            if l
        ][0][0])
        await message.channel.send("私は" + socket.gethostname() + "だよっ☆" + "\n" + "ローカルipは" + llip + "だよっ☆")
    if message.content.startswith("廃人"):
        channel = client.get_channel(message.channel)
        res = requests.get('https://status.slack.com/')
        soup = BeautifulSoup(res.text, 'html5lib')
        c = soup.find_all('p', class_="bold")
        s = soup.find_all('p', class_="tiny")
        c = c[1:4]
        s = s[5:8]
        d = str(c.pop(0)) + "\n" + str(s.pop(0))
        d = re.sub(r'<a(.+)</a>', "", d)
        d = re.sub(r'</p>', "", d)
        d = re.sub(r'<p class="tiny">', "", d)
        d = d.replace("\xa0", " ")
        d = re.sub(r'<p class="bold">', "", d)
        await message.channel.send(d)
        for i in range(2):
            d = rep(c, s)
            await message.channel.send(d)
    if message.content.startswith('オールデリート'):
        if "モデレーターさん" in [users_role.name for users_role in message.author.roles]:
            id = "<@366844805470486528>"
            await message.channel.send(id + "宛。" + "緊急終了実行。")
            await message.channel.send("実行:" + "<@" + str(message.author.id) + ">")
            await client.logout()
            os.kill(os.getpid(), 11)
    if message.content.startswith("ipcall"):
        channel = client.get_channel(message.channel)
        res = requests.get("http://inet-ip.info/ip")
        await message.channel.send(res.text)
    if message.content.startswith("プロセスを殺す"):
        channel = client.get_channel(message.channel)
        id = "<@366844805470486528>"
        await message.channel.send(id + "_" + "要請によりプロセスを緊急終了します。")
        await message.channel.send("実行:" + "<@" + str(message.author.id) + ">")
        await client.logout()
        os.kill(os.getpid(), 11)
    if message.content.startswith("!whichfa"):
        await message.channel.send(fa)
    if message.content.startswith("!afa"):
        if "モデレーターさん" in [users_role.name for users_role in message.author.roles]:
            fa_conf_on()
            await message.channel.send("再起動のブロックを解除しました。")
    if message.content.startswith("!dfa"):
        if "モデレーターさん" in [users_role.name for users_role in message.author.roles]:
            fa_conf_off()
            await message.channel.send("再起動をブロックしました。")
    if message.content.startswith("フォースアゲイン"):
        channel = client.get_channel(message.channel)
        if "off" in fa:
            await message.channel.send("作業中につき再起動をブロックしています。botが暴走している場合はモデレーターへメンションしてください。")
            tar = discord.utils.get(message.guild)
            print(tar)
            # dm = await tar.create_dm()
            # try:
            #     await dm.send("FA失敗" + "\n" + "実行:" + message.author)
            # except discord.errors.Forbidden:
            #     pass
        else:
            adminID = "<@366844805470486528>"
            SecondAdminID = "<@529644095027806208>"
            await message.channel.send(adminID + "\n" + SecondAdminID + "\n" + "再起動します")
            await message.channel.send("実行:" + "<@" + str(message.author.id) + ">")
            await client.logout()
            os.system("reboot")
    if message.content.startswith("今日の大空お天気"):
        channel = client.get_channel(message.channel)
        soup = requests.get("https://www.aikatsu.com/onparade/")
        soup = BeautifulSoup(soup.text, 'html5lib')
        check = soup.find_all("div", class_='txt_detail-date')
        if check == []:
            soup = soup.find_all('dd', class_="txt_detail")
            soup = str(soup.pop(0))
            soup = soup.replace('<dd class="txt_detail">', '').replace('</dd>', '')
            print(soup)
            print(type(soup))
        else:
            check = soup.find_all("div", class_='txt_detail-date')
            soup = str(check.pop(0))
            soup = re.sub(r'<br(.+)</p>', "", soup)
            soup = soup.replace('<p>', '').replace('</p>', '')
        await message.channel.send(soup)
    if message.content.startswith("金沢地方の遅れ"):
        channel = client.get_channel(message.channel)
        soup = requests.get("https://trafficinfo.westjr.co.jp/hokuriku.html")
        soup = BeautifulSoup(soup.text, 'html5lib')
        check = soup.find_all("p", class_='gaiyo')
        if check == []:
            soup = soup.find_all('strong')
            soup = str(soup.pop(0))
            soup = soup.replace('<strong>', '').replace('</strong>', '')
            await message.channel.send(soup)
        else:
            ls = soup.find_all('p', class_='gaiyo')
            while True:
                soup = ls.pop(0)
                soup = str(soup)
                soup = soup.replace('<p class="gaiyo">', '').replace('<br/>', '').replace('</p>', '')
                await message.channel.send(soup)
                if ls == []:
                    break
    if message.content.startswith("近畿地方の遅れ"):
        channel = client.get_channel(message.channel)
        soup = requests.get("https://trafficinfo.westjr.co.jp/kinki.html")
        soup = BeautifulSoup(soup.text, 'html5lib')
        check = soup.find_all("p", class_='gaiyo')
        if check == []:
            soup = soup.find_all('strong')
            soup = str(soup.pop(0))
            soup = soup.replace('<strong>', '').replace('</strong>', '')
            await message.channel.send(soup)
        else:
            ls = soup.find_all('p', class_='gaiyo')
            while True:
                soup = ls.pop(0)
                soup = str(soup)
                soup = soup.replace('<p class="gaiyo">', '').replace('<br/>', '').replace('</p>', '')
                await message.channel.send(soup)
                if ls == []:
                    break
    if message.content.startswith("プロセス把握"):
        channel = client.get_channel(message.channel)
        global response
        response = str(subprocess.check_output(['ps', "aux"]))
        print(response)
        response.replace(' ', '\n')
        response = response[:2000]
        await message.channel.send(response)
    if message.content.startswith("neofetch"):
        channel = client.get_channel(message.channel)
        responce = str(subprocess.check_output(["neofetch"]))
        responce = str(responce[:2000])
        await message.channel.send(responce)
    if message.content.startswith("naboon_chat"):
        channel = client.get_channel(message.channel)
        ID = "<@714406627603644489>"
        response = subprocess.check_output(['ojichat', "なぼ"]).decode(encoding='utf-8').rstrip()
        response = ID + response
        print(response)
        await message.channel.send(response)
    if not len(message.attachments) == 0:
        if message.author.bot:
            return
        RN = None
        channel = client.get_channel(message.channel)
        await message.channel.send('受け付けました')
        filename = message.attachments[0].filename
        download_img(message.attachments[0].url, filename)
        file_path = filename
        read = decode(Image.open(file_path))
        try:
            path = read[0][0].decode('utf-8', 'ignore')
        except IndexError:
            await message.channel.send("Error! QRコードが検出されませんでした。")
            os.remove(filename)
            return
        print(path)
        print(type(path))
        if path is None:
            await message.channel.send("Error! QRコードが検出されませんでした。")
            os.remove(filename)
            return
        print(path)
        if "http://dcd.sc/n2" in path:
            target_url = path
            r = requests.get(target_url)
            soup = BeautifulSoup(r.text, 'html5lib')
            try:
                NR = soup.find("dd", class_="cardNum").get_text() + " " + soup.find("dd", class_="cardName").get_text()
                print(NR)
            except AttributeError:
                NR = "カード名取得失敗です。学生証を読み込んだ事またはリダイレクトの設定間違えだと思われます。"
        elif "http://dcd.sc/j2" in path:
            target_url = path
            r = requests.get(target_url)
            soup = BeautifulSoup(r.text, 'html5lib')
            try:
                NR = soup.find("div", class_="dress-detail-title clearfix").get_text()
                print(NR)
            except AttributeError:
                NR = "カード名取得失敗です。学生証を読み込んだ事またはリダイレクトの設定間違えだと思われます。"
        elif "http://dcd.sc/n3" in path or "http://dcd.sc/n1" in path:
            NR = "学生証です。"
            print(NR)
        elif "http://dcd.sc/n0" in path:
            NR = "アイドルカードまたはフルコーデカードです。"
            print(NR)
        path = get_shortenURL(path)
        print(path)
        if path == "error":
            await message.channel.send("Error! おそらくKyashなどのアプリ内でのみ使えるQRを送信しようとしていませんか?")
            os.remove(filename)
            path = card = r = None
            return
        else:
            await message.channel.send(NR)
            await message.channel.send(path)
            os.remove(filename)
            path = card = r = None
    if message.content.startswith('pid'):
        channel = client.get_channel(message.channel)
        await message.channel.send(pid)
    if message.content.startswith('!kill'):
        if str(pid) in message.content:
            if message.author.bot:
                return
            else:
                id = "<@366844805470486528>"
                await message.channel.send(id + "宛。" + "緊急終了実行。")
                await message.channel.send("実行:" + "<@" + str(message.author.id) + ">")
                await client.logout()
                os.kill(os.getpid(), 11)
        else:
            return
    if message.content.startswith("wakeup"):
        channel = client.get_channel(message.channel)
        for mem in message.mentions:
            for i in range(int(number)):
                a = int(mem.id)
                print(a)
                await message.channel.send("<@" + str(a) + ">" + "さん起きて!!!")
    if message.content.startswith("ski"):
        o = []
        lis = []
        ua = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6)" \
             "AppleWebKit/537.36 (KHTML, like Gecko)" \
             "Chrome/60.0.3112.113"
        TU = "https://www.happo-one.jp/gelande/lift/"
        O = requests.get(TU, headers={"User-Agent": ua})
        RE = BeautifulSoup(O.text, "html.parser")
        c = RE.find_all("td")
        for d in c:
            o.append(d.text)
        for i in range(20):
            for j in range(5):
                if o == []:
                    break
                a = o.pop(0)
                b = o.pop(0)
                c = o.pop(0)
                d = o.pop(0)
                e = o.pop(0)
                A = a + " " + b + " " + c + " " + d + " " + e
                A = A.replace("last lift up", "")
                A = A.replace("last lift down", "")
                A = A.replace("  ", " ")
                A = A.replace("  ", " ")
                lis.append(A)
        await message.channel.send("[リフト名][運転開始時刻][運転終了時刻][備考]")
        for line in lis:
            await message.channel.send(line)
tree = BeautifulSoup(i['html'], "lxml") good_html = tree.prettify() title = [] elements = BeautifulSoup(good_html, "html.parser").find_all(class_="startup-link") for m, v in enumerate(elements): if m % 2 == 0: title.append((elements[m]['title']).encode('utf-8')) location = [] elements = BeautifulSoup( good_html, "html.parser").find_all(class_="column hidden_column location") if k == 0: elements.pop(0) for i in elements: newEle = i.find_all(class_="value") for j in newEle: newEle2 = j.find_all(class_="tag") if len(newEle2) == 0: location.append("-") else: location.append(((newEle2[0].text).encode('utf-8')).strip()) market = [] elements = BeautifulSoup( good_html,
# print(list2_of_course)
# print(type(list2_of_course))
listofcourse = list()
list_of_course = list_of_course.find_all("option")
# Convert the BeautifulSoup results into a readable list.
while len(list_of_course) > 0:
    # if list_of_course[0].value == "":
    #     print(list_of_course[0].string)
    stringtemp = list_of_course[0].string
    if stringtemp.find("20") == -1 and stringtemp.find("---") == -1:
        stringtemp = stringtemp.replace("\n", "")
        if stringtemp != "" and len(stringtemp) > 1:
            listofcourse.append(stringtemp)
    # else:
    #     print(stringtemp)
    list_of_course.pop(0)
print(listofcourse)
print(len(listofcourse))
a = 0
for i in range(len(listofcourse)):
    print(i)
    # if browser.is_element_present_by_text(listofcourse[i]):
    print(listofcourse[i])
    browser.find_option_by_text(listofcourse[i]).first.click()
    browser.find_by_value('Load Class Schedule').first.click()
    # browser.windows.current = browser.windows[1]
    while len(browser.windows) > 1:
        for ii in browser.windows:
            if ii.url == "https://wish.wis.ntu.edu.sg/webexe/owa/AUS_SCHEDULE.main_display1":
                browser.windows.current = ii
                html_page = browser.html
import os
import pandas
import datetime
import requests
from bs4 import BeautifulSoup

Response = requests.get("http://www.boxofficemojo.com/yearly/")
RowData = BeautifulSoup(Response.text, "lxml").find(
    "table", attrs={"cellspacing": "1"}).find_all("tr")
ColumnName = RowData.pop(0)
ColumnName = [Item.text for Item in ColumnName]
RowData = [list(Item.stripped_strings) for Item in RowData]
DataFrame = pandas.DataFrame(RowData, columns=ColumnName)
PathResult = os.path.abspath("Results")
if not os.path.exists(PathResult):
    os.makedirs(PathResult)
FileName = os.path.join(PathResult, "boxofficemojo.csv")
DataFrame.to_csv(FileName, index=False)
TextTarget("Save csv to {}".format(FileName))

after_one_week = datetime.datetime.now() + datetime.timedelta(weeks=1)
TextTarget("The date after one week - {}".format(after_one_week.strftime("%Y/%m/%d")))
DataForm = {"StartStation": "977abb69-413a-4ccf-a109-0272c24fd490",
            "EndStation": "9c5ac6ca-ec89-48f8-aab0-41b738cb1814",
            "SearchDate": after_one_week.strftime("%Y/%m/%d"),
            "SearchTime": "14:00",
            "SearchWay": "DepartureInMandarin",
            "RestTime": "",
            "EarlyOrLater": ""}
Response = requests.post(
    "https://www.thsrc.com.tw/tw/TimeTable/SearchResult", data=DataForm)
RowData = BeautifulSoup(Response.text, "lxml").table.find_all(
def processStock(browser, url, stockCode, stockName):
    # Time the page request.
    time_request = time.time()
    # Fetch the page.
    browser.get(url)
    print(stockName, '[', stockCode, ']', ':Url-Request Time Cost', time.time() - time_request, 's')
    # Start timing the data handling.
    time_start = time.time()
    # Switch into the iframe so elements can be located reliably.
    try:
        browser.switch_to.frame('dataifm')
    except Exception as ex:
        print(ex)
        return {}
    # Get all the statement types listed in the financial-report side navigation.
    sideNav = browser.find_elements_by_xpath('//*[@class="newTab"]/li')
    # The large object to insert into MongoDB (all financial reports of one stock).
    insertData = {
        "name": stockName,
        "code": stockCode,
        "tables": [],
    }
    # Loop over every statement, starting with the key-indicators table.
    for report_type in range(0, len(sideNav)):
        # Name of the current statement.
        report_statement_type = sideNav[report_type].text
        print('Current Finance Report Statement:', report_statement_type)
        # The current link in the navigation bar.
        sheet_href = sideNav[report_type].find_element_by_tag_name('a')
        html = BeautifulSoup(browser.page_source, 'lxml')
        left_div = html.select(".left_thead")
        # The individual indicators.
        indicators = BeautifulSoup(str(left_div), 'lxml').select('th')
        indicators.pop(0)
        indicatorsNum = len(indicators)
        # for indicator in indicators:
        #     print(indicator.text)
        # The data <table>: contains the reporting periods and the financial
        # figures under each period.
        right_div = html.select('.data_tbody')
        data_and_periods = BeautifulSoup(str(right_div), 'lxml')
        periods_table = data_and_periods.select('.top_thead')
        periods = BeautifulSoup(str(periods_table[0]), 'lxml').find_all('div', class_='td_w')
        # The data table itself.
        data_table = data_and_periods.select('.tbody')
        # All rows of the grid.
        dataGrid = BeautifulSoup(str(data_table), 'lxml').select('tr')
        # Process the data for storage in the database.
        data = []
        # Handle each row.
        for y in range(0, len(indicators)):
            for x in range(0, len(periods)):
                # Text of the current table cell.
                cell_text = (dataGrid[y].contents[x].text).strip()
                # Default unit: ''.
                unit = ''
                # Default value, a float: 0.0.
                value = 0.0
                if cell_text != '--' and cell_text != '':
                    lastChar = list(cell_text)[-1]
                    if lastChar == '亿' or lastChar == '万' or lastChar == '%':
                        unit = lastChar
                        lastTwoChars = cell_text[len(cell_text) - 2:len(cell_text)]
                        if lastTwoChars == '万亿' or lastTwoChars == '千亿':
                            value = float(cell_text[0:-2])
                            unit = lastTwoChars
                        else:
                            value = float(cell_text[0:-1])
                obj = {
                    "period": periods[x].text,
                    "indicator": indicators[y].text,
                    "text": cell_text,
                    "value": value,
                    "unit": unit
                }
                data.append(obj)
        # The report object to insert into MongoDB.
        table = {
            "name": report_statement_type,
            "indicators": list(map(lambda x: x.text, indicators)),
            "periods": list(map(lambda x: x.text, periods)),
            "data": data
        }
        # Add the current report's data to the large object.
        insertData['tables'].append(table)
        # By default the table has not refreshed yet.
        refreshFlag = False
        # Try to switch tables.
        try:
            # If this is not the last tab, switch to the next table.
            if report_type < len(sideNav) - 1:
                # Click the next link in the navigation bar.
                sideNav[report_type + 1].find_element_by_tag_name('a').click()
                time.sleep(0.1)
                while not refreshFlag:
                    new_left_div = BeautifulSoup(browser.page_source, 'lxml').select(".left_thead")
                    nextIndicatorSum = BeautifulSoup(str(new_left_div), 'lxml').select('th')
                    refreshFlag = indicatorsNum != len(nextIndicatorSum)
        except Exception as ex:
            print('出现问题了:', ex)
        finally:
            print(sheet_href.text, '输出完毕')
    # Print the finishing message.
    print(stockName, '[', stockCode, ']', ':Data-Handle Time Cost', time.time() - time_start, 's')
    print("=======================================")
    return insertData
for url in quchong:
    print(url)
    xiazai_jiumei(url)
urls = []
conn = pymysql.connect(host='192.168.0.131', user='******', passwd='123456', db='mypydb', charset='utf8')
cur = conn.cursor()
cur.execute("select url from jiumei")
results = cur.fetchall()
cur.close()
conn.close()
result = list(results)
for r in result:
    urls.append("%s" % r)
urls = list(set(urls))
while urls:
    url = urls.pop()
    print("重新下载:%s" % url)
    xiazai_jiumei_sql(url)
    try:
        conn = pymysql.connect(host='192.168.0.131', user='******', passwd='123456', db='mypydb', charset='utf8')
        cur = conn.cursor()
        cur.execute("select url from jiumeim")
        results = cur.fetchall()
        cur.execute("truncate jiumeim")
        cur.close()
        conn.close()
        result = list(results)
        for r in result:
            urls.append("%s" % r)
        urls = list(set(urls))
    except:
def parse_results_rows(self, results_rows: BeautifulSoup) -> List[EMLODoc]:
    """Input is a list of HTML table rows with EMLO results; output is a
    list of EMLODoc objects."""
    header_row = results_rows.pop(0)
    headers = parse_results_header(header_row)
    return [self.make_emlo_doc(results_row, headers)
            for results_row in results_rows]
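# Usage sketch (hedged: `results_soup` is a hypothetical EMLO results page
# already parsed with BeautifulSoup, and `parser` an instance of the class
# that defines parse_results_rows; find_all('tr') yields the header row
# followed by the result rows, as the method expects).
# rows = results_soup.find_all('tr')
# emlo_docs = parser.parse_results_rows(rows)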
        HREF = tag.get('href')
        if HREF != "/html/web.config" and HREF != "/":
            yearLinks.append(str(HREF))

for year in yearLinks:
    publicationYear = year.split('/')[2]
    if not os.path.exists("output/" + publicationYear):
        os.makedirs("output/" + publicationYear)
    months = BeautifulSoup(urllib2.urlopen(archiveRoot + year).read(),
                           "html5lib").find_all('a')
    months.pop(0)  # drop the first link
    for month in months:
        publicationMonth = month.get_text()
        if not os.path.exists("output/" + publicationYear + "/" + publicationMonth):
            os.makedirs("output/" + publicationYear + "/" + publicationMonth)
        days = BeautifulSoup(urllib2.urlopen(archiveRoot + month.get('href')).read(),
                             "html5lib").find_all('a')
        days.pop(0)
        for day in days:
            publicationDay = day.get_text()
def compile_paragraph(cls, paragraph, rels_soup, stringset, is_rtl=False):
    text_elements = paragraph.find_all(cls.TEXT_ELEMENT_TAG)
    if not text_elements:
        return
    txid = paragraph.attrs.get('txid')
    if not txid:
        return
    if stringset.get(txid, None) is None:
        return
    translation_string = stringset[txid].string
    escaped_translation_string = cls._escape_xml(translation_string)
    translation_soup = BeautifulSoup(
        u'<wrapper>{}</wrapper>'.format(escaped_translation_string),
        'xml',
    ).find_all(text=True)
    added_hl_text_elements = defaultdict(list)
    deleted_hl_text_elements = defaultdict(list)
    empty_text_element = None
    elements_for_removal = []
    last_element = None
    leading_spaces = 0
    # First of all, try to replace each element's translation;
    # this is the happiest path.
    if is_rtl:
        cls.set_rtl_orientation(paragraph)
    for index, text_element in enumerate(text_elements):
        text = six.text_type(text_element.text)
        # Detect text elements that contain no text
        # and remove leading whitespace from the next string.
        if not text.strip():
            leading_spaces = len(text) - len(text.strip())
            empty_text_element = text_element
            continue
        last_element = text_element
        hyperlink_url = cls.get_hyperlink_url(text_element, rels_soup)
        # The text parts of the translation are fewer than the
        # text parts of the document, so just remove
        # any exceeding part from the document.
        if len(translation_soup) == 0:
            elements_for_removal.append(text_element)
            continue
        else:
            translation_part = translation_soup.pop(0)
            translation = six.text_type(translation_part)
            translation_hyperlink_url = cls.get_translation_hyperlink(
                translation_part)
            if not translation[:leading_spaces].strip():
                translation = translation[leading_spaces:]
                leading_spaces = 0
            else:
                if empty_text_element:
                    elements_for_removal.append(empty_text_element)
                    empty_text_element = None
            text_element.clear()
            text_element.insert(0, translation)
            # Edit the hyperlink url in place.
            if hyperlink_url and translation_hyperlink_url:
                cls.set_hyperlink_url(text_element, rels_soup,
                                      translation_hyperlink_url)
            else:
                if hyperlink_url:
                    deleted_hl_text_elements[hyperlink_url]\
                        .append(text_element)
                elif translation_hyperlink_url:
                    added_hl_text_elements[translation_hyperlink_url]\
                        .append(text_element)
    # The text parts of the translation are more than the
    # text parts of the document, so compress the
    # remaining translation parts into one string.
    if len(translation_soup) > 0:
        translation = last_element.contents[0] + \
            "".join([six.text_type(t) for t in translation_soup])
        last_element.clear()
        last_element.insert(0, translation)
    if len(added_hl_text_elements) == len(deleted_hl_text_elements)\
            and len(added_hl_text_elements) > 0:
        cls.swap_hyperlink_elements(added_hl_text_elements,
                                    deleted_hl_text_elements)
    for text_elements in six.itervalues(deleted_hl_text_elements):
        for text_element in text_elements:
            cls.remove_hyperlink(text_element)
    for url, text_elements in six.iteritems(added_hl_text_elements):
        for text_element in text_elements:
            cls.create_hyperlink_url(text_element, rels_soup, url)
    for element in elements_for_removal:
        cls.remove_text_element(element)
def compile(self, template, stringset, **kwargs):
    stringset = {string.string_hash: string for string in stringset}
    docx = DocxFile(template)
    soup = BeautifulSoup(docx.get_document(), 'xml')
    rels_soup = BeautifulSoup(docx.get_document_rels(), 'xml')
    for paragraph in soup.find_all('w:p'):
        text_elements = paragraph.find_all('w:t')
        if not text_elements:
            continue
        txid = paragraph.attrs.get('txid')
        if not txid:
            continue
        if stringset.get(txid, None) is None:
            continue
        translation = stringset[txid].string
        translation_soup = BeautifulSoup(
            u'<wrapper>{}</wrapper>'.format(translation),
            'xml').find_all(text=True)
        leading_spaces = 0
        for index, text_element in enumerate(text_elements):
            text = six.text_type(text_element.text)
            # Detect text elements that contain no text
            # and remove leading whitespace from the next string.
            if not text.strip():
                leading_spaces = len(text) - len(text.strip())
                continue
            else:
                hyperlink_url = self.get_hyperlink_url(text_element, rels_soup)
                # The text parts of the translation are fewer than the
                # text parts of the document, so just remove
                # any excess part from the document.
                if len(translation_soup) == 0:
                    if hyperlink_url:
                        text_element.find_parent('w:hyperlink').decompose()
                    else:
                        text_element.decompose()
                    continue
                translation_part = translation_soup.pop(0)
                translation = six.text_type(translation_part)
                if not translation[:leading_spaces].strip():
                    translation = translation[leading_spaces:]
                    leading_spaces = 0
                # The text parts of the translation are more than the
                # text parts of the document, so compress the
                # remaining translation parts into one string.
                if index == len(text_elements) - 1 and len(translation_soup) > 0:
                    translation = "".join(
                        [translation] + [six.text_type(t) for t in translation_soup])
                if hyperlink_url:
                    # Attempt to find a parent containing an `href` attribute
                    # in order to extract the potentially modified url.
                    self.set_hyperlink_url(
                        text_element, rels_soup,
                        getattr(translation_part.find_parent(attrs={'href': True}),
                                'attrs', {}).get('href', hyperlink_url))
                text_element.clear()
                text_element.insert(0, translation)
    docx.set_document(six.text_type(soup))
    docx.set_document_rels(six.text_type(rels_soup))
    result = docx.compress()
    docx.delete()
    return result