def get_car_vin(html):
    if html:
        selector = ("#body > section.con_top.gray-bg_fin > div:nth-child(1) > "
                    "div.row.mt_5 > div.col-md-4.car-details-sidebar > div > ul > "
                    "li:nth-child(2) > strong")
        try:
            if len(html.find(selector)):
                return str(html.find(selector)[0].text)
        except Exception as e:
            print("Can't get VIN number. Reason %s." % e)

def get_car_title(html):
    if html:
        try:
            if len(html.find("h2.tit_style2")):
                return clear_car_name(ko_translate(rm_new_line(str(html.find("h2.tit_style2")[0].text)), "en"))
        except Exception as e:
            print("Can't get title. Reason %s." % e)
    return ""

def get_car_mark(html):
    if html:
        selector = ("#body > section.con_top.gray-bg_fin > "
                    "div.container-fluid.wide_area.mt_1.car_view_check_area > div > div > "
                    "div > table > tbody > tr:nth-child(7) > td:nth-child(2)")
        try:
            if len(html.find(selector)):
                return ko_translate(rm_new_line(str(html.find(selector)[0].text)), "en")
        except Exception as e:
            print("Can't get car mark. Reason %s." % e)
    return ""

def get_car_price(html):
    if html:
        try:
            if len(html.find("strong.i_comm_main_txt2")):
                price = int(re.sub(",", "", html.find("strong.i_comm_main_txt2")[0].text))
                return int(price * 10000 / int(config['SELLCAR']['USD']))
        except Exception as e:
            print("Can't get car price. Reason %s." % e)
    return ""

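# A worked example of the conversion above, assuming the listing shows prices
# in units of 10,000 KRW (만원) and config['SELLCAR']['USD'] holds the
# KRW-per-USD rate: a listed "1,250" is 12,500,000 KRW, which at a
# hypothetical rate of 1,250 KRW/USD comes to 10,000 USD:
#   int(1250 * 10000 / 1250)  # -> 10000
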
def getRelatedVideos(html):
    end = 0
    while True:
        start = html.find('video-list-item related-list-item', end)
        if start == -1:
            break
        (start, end) = search(html, 'href="', '"', start)
        start = html.index('v=', start) + len('v=')
        amp_idx = html.find('&', start, end)
        if amp_idx != -1:
            end = amp_idx
        yield html[start:end]

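# getRelatedVideos() relies on a search(html, prefix, suffix, start) helper
# that is not shown in this file. Below is a plausible sketch of that helper
# (returning the span between prefix and suffix) plus example usage; both are
# assumptions about the surrounding module, not its actual code.
def search(html, prefix, suffix, start):
    begin = html.index(prefix, start) + len(prefix)
    end = html.index(suffix, begin)
    return (begin, end)

snippet = ('<li class="video-list-item related-list-item">'
           '<a href="/watch?v=dQw4w9WgXcQ&feature=related">x</a></li>')
print(list(getRelatedVideos(snippet)))  # ['dQw4w9WgXcQ']
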
def _get_detailed_page(self, url):
    # The site is unusual, so both the GET and the parsing are done here,
    # and a JSON string is returned.
    eid = url.split('/')[-1]
    text = self.get_page_by_GET(url)
    html = bs4.BeautifulSoup(text, 'lxml')
    info = html.find('', {'class': 'info'})
    # title = info.find('', {'class': 'title'}).string.strip()
    address = info.find('', {'class': 'address'}).string.strip()
    buy_button = html.find('', {'class': 'view_sigpnup'}).a
    buy_href = buy_button.attrs['href']
    buy_type = buy_button.string.strip()
    _LOGGER.debug(f'[_get_detailed_page] url: {url}')
    _LOGGER.debug(
        f'[_get_detailed_page] address: {address}, buy_href: {buy_href}, '
        f'buy_type: {buy_type}, eid: {eid}'
    )
    try:
        urls_and_events = self.__various_get_urls_and_events(
            address, buy_href, buy_type, html, eid)
    except Exception as e:
        _LOGGER.error(e)
        return None
    if urls_and_events is None:
        return None
    urls, events = urls_and_events
    # Fetch the pages for building infos (or price infos).
    building_page_list = utils.async_get_pages_by_GET(urls)
    # Parse building infos into price infos (or just return the page).
    page_list = [
        self.__various_parse_for_price_infos(address, buy_type, page, eid,
                                             events[i]['event_id'])
        for i, page in enumerate(building_page_list)
    ]
    detailed_infos = []
    for i, event in enumerate(events):
        date = event['date']
        time = event['time']
        page = page_list[i]
        prices = self.__various_get_prices(address, buy_type, page)
        if not prices:
            continue
        [in_sale_prices, sold_out_prices] = prices
        detailed_infos.append({
            'date': date,
            'time': time,
            'in_sale_prices': in_sale_prices,
            'sold_out_prices': sold_out_prices
        })
    return json.dumps(detailed_infos)

def get_car_registration(html):
    if html:
        try:
            if len(html.find("strong.text-right")) > 2:
                res = re.findall("[0-9]+", rm_new_line(str(html.find("strong.text-right")[2].text)))
                if len(res) > 1:
                    return str(res[1][0:4] + "/" + res[1][4:6] + "/" + res[1][6:8])
                elif len(res) == 1:
                    return str(res[0][0:4] + "/" + res[0][4:6] + "/" + res[0][6:8])
        except Exception as e:
            print("Can't get car registration. Reason %s." % e)
    return ""

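# Example of the slicing above: a raw registration value such as "20180312"
# (hypothetical) is rendered as "2018/03/12", i.e. YYYY/MM/DD.
raw_date = "20180312"
print(raw_date[0:4] + "/" + raw_date[4:6] + "/" + raw_date[6:8])  # 2018/03/12
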
def get_lot_id(html):
    if html:
        try:
            if len(html.find("h2.tit_style2")):
                res = re.search(r"\[\d+\]", ko_translate(rm_new_line(str(html.find("h2.tit_style2")[0].text)), "en"))
                if res:
                    res = res.group()
                    res = re.sub(r"\[", "", res)
                    res = re.sub(r"\]", "", res)
                    return res
        except Exception as e:
            print("Can't get lot id. Reason %s." % e)
    return ""

def get_token(self, html):
    """Extract the javascript-token from the html file for reuse.

    Sets the attribute on the object, so this shouldn't really ever be
    called externally, other than in testing."""
    js_token_start = ('<input id="javascript-token" name="token" '
                      'type="hidden" value="')
    # Skip past the marker itself to the start of the token value.
    js_token_startindex = html.find(js_token_start) + len(js_token_start)
    js_token_endindex = html.find('"', js_token_startindex + 1)
    js_token = html[js_token_startindex:js_token_endindex]
    self.logger.debug(js_token)
    return js_token

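# A minimal usage sketch for get_token(), assuming the page was already
# fetched as a string. The markup below is a made-up example of the hidden
# input the method scans for; on it the method returns "abc123":
#   sample_html = ('<form><input id="javascript-token" name="token" '
#                  'type="hidden" value="abc123"></form>')
#   token = client.get_token(sample_html)  # client is a hypothetical instance
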
def scrape_examples(self, page):
    """ Return a list of examples. """
    examples = []
    html = page
    index = html.find(TAG_PHRASE_DELIMITER[0])
    while index > -1:
        example_html = Utils.text_between(html, *TAG_PHRASE_DELIMITER, force_html=True)
        examples += [Utils.remove_spaces(Utils.remove_tags(example_html))]
        html = html[index + len(TAG_PHRASE_DELIMITER[0]):]
        index = html.find(TAG_PHRASE_DELIMITER[0])
    return examples

def get_car_id(html):
    if html:
        try:
            items = html.find("div.car-title")
            print(html.find("div.car-title"))
            for item in items:
                if len(item.find("a")):
                    car_id = re.search('[A-Z]+[0-9]+', item.find("a")[0].attrs['onclick'])
                    print(item.find("a")[0])
                    print(car_id)
                    if car_id:
                        write_car_id("car_id.txt", car_id.group() + "\n")
        except Exception as e:
            print("Can't get car id. Reason %s." % e)

def get_car_year(html):
    if html:
        try:
            if len(html.find("strong.text-right")) > 2:
                res = re.search("[0-9]+", str(html.find("strong.text-right")[2].text))
                if res:
                    year = res.group()
                    if len(year) == 4:
                        return int(year)
                    else:
                        return int(year[0:4])
        except Exception as e:
            print("Can't get car year. Reason %s." % e)
    return ""

async def lyrics(self, ctx, *, song: str = None):
    """
    Get lyrics for a song. Song must be in the format of '{artist} {song}'.

    Currently this isn't very accurate (direct URL call, not a search)
    and will likely give a 404.
    """
    if song is None or len(song) == 0:
        return
    song_url = "https://genius.com/{}-lyrics".format(song.replace(" ", "-").lower())
    page = requests.get(song_url)
    html = BeautifulSoup(page.text, "html.parser")
    song_lyrics = html.find("div", class_="lyrics").get_text()
    if len(song_lyrics) <= 1024:
        await ctx.send(song_lyrics)
    else:
        # Discord caps message length, so send the lyrics in 1024-character chunks.
        chunks = [
            song_lyrics[i:i + 1024]
            for i in range(0, len(song_lyrics), 1024)
        ]
        for chunk in chunks:
            await ctx.send(chunk)

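# The URL above is a direct slug guess rather than a search. For example, the
# hypothetical input "The Beatles Yesterday" is turned into
# https://genius.com/the-beatles-yesterday-lyrics, so any song whose canonical
# Genius slug differs will 404.
print("https://genius.com/{}-lyrics".format("The Beatles Yesterday".replace(" ", "-").lower()))
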
def _onHtmlEdit(self, field):
    d = QDialog(self.widget, Qt.Window)
    form = aqt.forms.edithtml.Ui_Dialog()
    form.setupUi(d)
    restoreGeom(d, "htmlEditor")
    qconnect(form.buttonBox.helpRequested, lambda: openHelp("editing?id=features"))
    form.textEdit.setPlainText(self.note.fields[field])
    d.show()
    form.textEdit.moveCursor(QTextCursor.End)
    d.exec_()
    html = form.textEdit.toPlainText()
    if html.find(">") > -1:
        # filter html through beautifulsoup so we can strip out things like a
        # leading </div>
        html_escaped = self.mw.col.media.escape_media_filenames(html)
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", UserWarning)
            html_escaped = str(BeautifulSoup(html_escaped, "html.parser"))
            html = self.mw.col.media.escape_media_filenames(html_escaped, unescape=True)
    self.note.fields[field] = html
    if not self.addMode:
        self.note.flush()
    self.loadNote(focusTo=field)
    saveGeom(d, "htmlEditor")

def _onHtmlEdit(self, field: int) -> None:
    d = QDialog(self.widget, Qt.WindowType.Window)
    form = aqt.forms.edithtml.Ui_Dialog()
    form.setupUi(d)
    restoreGeom(d, "htmlEditor")
    disable_help_button(d)
    qconnect(form.buttonBox.helpRequested, lambda: openHelp(HelpPage.EDITING_FEATURES))
    font = QFont("Courier")
    font.setStyleHint(QFont.StyleHint.TypeWriter)
    form.textEdit.setFont(font)
    form.textEdit.setPlainText(self.note.fields[field])
    d.show()
    form.textEdit.moveCursor(QTextCursor.MoveOperation.End)
    d.exec()
    html = form.textEdit.toPlainText()
    if html.find(">") > -1:
        # filter html through beautifulsoup so we can strip out things like a
        # leading </div>
        html_escaped = self.mw.col.media.escape_media_filenames(html)
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", UserWarning)
            html_escaped = str(BeautifulSoup(html_escaped, "html.parser"))
            html = self.mw.col.media.escape_media_filenames(html_escaped, unescape=True)
    self.note.fields[field] = html
    if not self.addMode:
        self._save_current_note()
    self.loadNote(focusTo=field)
    saveGeom(d, "htmlEditor")

def get_distance_driven(html):
    if html:
        try:
            res = re.findall("[0-9]+", rm_new_line(str(html.find("strong.text-right")[5].text)))
            return "".join(res)
        except Exception as e:
            print("Can't get driven distance. Reason %s." % e)
    return ""

def get_car_estimate(html):
    if html:
        selector = ("#body > section.con_top.gray-bg_fin > div:nth-child(2) > div > "
                    "div > div > table > tbody > tr:nth-child(1) > td")
        try:
            item = html.find(selector)[0].text
            if item:
                return "".join(re.findall("[A-Z]", str(item)))
        except Exception as e:
            print("Can't get car estimate. Reason: %s" % e)

def get_car_displacement(html):
    if html:
        try:
            res = re.findall("[0-9]+", rm_new_line(html.find("strong.text-right")[6].text))
            return "".join(res)
        except Exception as e:
            print("Can't get car displacement. Reason %s." % e)
    return ""

def get_transmission(html):
    if html:
        try:
            # "오토" is Korean for "automatic"; the returned labels are Russian
            # ("Автомат" = automatic, "Механика" = manual).
            if rm_new_line(str(html.find("strong.text-right")[8].text)) == "오토":
                return "Автомат"
            else:
                return "Механика"
        except Exception as e:
            print("Can't get transmission. Reason %s." % e)
    return ""

def scrape_synonyms(self, page):
    """ Return list of synonyms. """
    synonyms = []
    if page.find(TAG_SYNONYMS[0]) > -1:
        html = Utils.text_between(page, *TAG_SYNONYMS, force_html=True)
        while html.find(TAG_SYNONYMS_DELIMITER[0]) > -1:
            synonym, html = self.first_synonym(html)
            synonyms.append(synonym)
    return synonyms

def get_fuel(html):
    if html:
        # Maps Korean fuel-type labels to Russian display names.
        fuel_data = {
            "가솔린": "Бензин",
            "휘발유": "Бензин",
            "경유": "Дизель",
            "디젤": "Дизель",
            "LPG": "LPG",
            "하이브리드": "Гибрид",
            "LPI하이브리드": "LPG гибрид",
            "가솔린하이브리드": "Бензиновый гибрид",
            "디젤하이브리드": "Дизельный гибрид",
            "전기": "Электрокар",
            "가솔린/LPG": "Бензин/LPG",
            "겸용": "Комбинированное использование"
        }
        try:
            if len(html.find("strong.text-right")) > 3:
                return fuel_data[rm_new_line(str(html.find("strong.text-right")[4].text))]
        except Exception as e:
            print("Can't get fuel. Reason %s." % e)
    return ""

def extract_from_html(html: BeautifulSoup) -> List[Emoji]:
    emojis = []
    for row in html.find('table').find_all('tr'):
        if row.th:
            continue
        emoji = row.find('td', {'class': 'chars'}).string
        description = row.find('td', {'class': 'name'}).string.replace('⊛ ', '')
        emojis.append(Emoji(emoji, description))
    return emojis

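# A minimal usage sketch for extract_from_html(), assuming the Emoji type from
# this module and a page shaped like the Unicode emoji chart (header rows use
# <th>; data rows carry "chars" and "name" cells). The snippet is made up.
from bs4 import BeautifulSoup

sample_table = BeautifulSoup(
    '<table><tr><th>Head</th></tr>'
    '<tr><td class="chars">😀</td><td class="name">grinning face</td></tr>'
    '</table>', 'html.parser')
print(extract_from_html(sample_table))  # one Emoji('😀', 'grinning face')
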
def get_car_color(html):
    if html:
        # Maps Korean color names to Russian display names.
        color_data = {
            "흰색": "Белый",
            "은색": "Серебро",
            "검정": "Черный",
            "기타": "Так далее",
            "빨간": "Красный",
            "보라색": "Фиолетовый",
            "주황색": "Оранжевый",
            "초록": "Зеленый",
            "회색": "Серый",
            "금": "Золото",
            "푸른": "Голубой",
            "베이지": "Бежевый",
            "빨강": "Красный"
        }
        try:
            if len(html.find("strong.text-right")) > 9:
                return color_data[re.sub(r"\s.+", "", rm_new_line(str(html.find("strong.text-right")[9].text)))]
        except Exception as e:
            print("Can't get car color. Reason %s." % e)
    return ""

def get_car_description(html, link):
    description = ""
    if html:
        try:
            vr = re.search("[A-Z0-9]+$", link)
            if vr:
                vr = vr.group()
        except Exception as e:
            print("Can't get car ID for VR 360 view auto. Reason %s." % e)
        try:
            description += '<div class="timeline-heading"><h3>VR360 обзор авто</h3></div>'
            description += ('<iframe frameborder="0" height="600" id="ovulyaciya" scrolling="no" '
                            'src="http://www.sellcarauction.co.kr/newfront/receive/rc/receive_rc_view_vr.do'
                            '?isLandscapeOpen=Y&isBrowserOpen=Y&receivecd=%s" width="900"></iframe>' % vr)
        except Exception as e:
            print('Can\'t get car description "VR 360 view auto". Reason %s.' % e)
        try:
            description += '<div class="timeline-heading"><h3>Информация об оценке производительности</h3></div>'
            description += text_len(rm_new_line(str(html.find("#body > section.con_top.gray-bg_fin > div:nth-child(2) > div > div > div > table")[0].html)), "ru")
        except Exception as e:
            print('Can\'t get car description "Performance Evaluation Information". Reason %s.' % e)
        try:
            description += '<div class="timeline-heading"><h3>Информация о вариантах</h3></div>'
            description += text_len(rm_new_line(str(html.find("#body > section.con_top.gray-bg_fin > div:nth-child(3) > div > div > table:nth-child(2)")[0].html)), "ru")
        except Exception as e:
            print('Can\'t get car description "Option information". Reason %s.' % e)
        try:
            description += text_len(rm_new_line(str(html.find("#body > section.con_top.gray-bg_fin > div:nth-child(3) > div > div > ul")[0].html)), "ru")
        except Exception as e:
            print("Can't get car description comm list. Reason %s." % e)
        description += rm_new_line(get_car_img(html))
        # try:
        #     description += '<div class="timeline-heading"><h3>Протокол осмотра</h3></div>'
        #     text = text_len(rm_new_line(str(soup.select("#body > section.con_top.gray-bg_fin > div.container-fluid.wide_area.mt_1.car_view_check_area > div > div > div > table")[0])), "ru")
        #     text = re.sub("\/newfront\/images\/", "http://www.sellcarauction.co.kr/newfront/images/", text)
        #     description += text
        # except Exception as e:
        #     print('Can\'t get car description "Inspection protocol". Reason %s.' % e)
    return rm_new_line(description)

def get_car_type(html):
    if html:
        # Maps Korean body-type labels (with seat counts) to Russian display names.
        car_type_data = {
            "승합 (6인승)": "Универсал",
            "승용 (7인승)": "Универсал",
            "승용 (11인승)": "Фургон",
            "승합 (3인승)": "Фургон",
            "화물 (3인승)": "Фургон",
            "승합 (15인승)": "Фургон",
            "승합 (11인승)": "Фургон",
            "승합 (12인승)": "Фура",
            # "Трактор",
            "승용 (5인승)": "Седан",
            "승합 (5인승)": "Седан",
            # "Родстер",
            # "Пикап",
            # "Мотоцикл",
            "승합 (25인승)": "Автобус",
            "승합 (9인승)": "Минивен",
            "승용 (9인승)": "Минивен",
            "승용 (6인승)": "Минивен",
            "화물 (6인승)": "Минивен",
            "승용 (4인승)": "Хэтчбек",
            # "Кроссовер",
            "승합 (2인승)": "Купе",
            "화물 (2인승)": "Купе",
            "승용 (2인승)": "Купе",
            "SUV픽업 (5인승)": "Внедорожник пикап"
            # "Кабриолет",
            # "Багги"
        }
        try:
            if len(html.find("strong.text-right")) > 10:
                return car_type_data[rm_new_line(str(html.find("strong.text-right")[10].text))]
        except Exception as e:
            print("Can't get car type. Reason %s." % e)
    return ""

def get_img_src(html):
    if html:
        try:
            items = html.find("img.img-fluid")
            images = []
            for item in items:
                if "src" in item.attrs:
                    # Thumbnails end in "_S.<ext>"; dropping the suffix gives the full-size URL.
                    match = re.search(r'_S\.(jpeg|jpg|JPG|JPEG|gif|png)$', item.attrs['src'])
                    if match:
                        images.append(item.attrs['src'].replace('_S', ''))
            return images
        except Exception as e:
            print("Can't get img src. Reason %s." % e)
    return ""

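# Example of the thumbnail rewrite above, with a hypothetical path: "_S" marks
# a small preview, and stripping it yields the full-size image URL.
thumb = "/newfront/images/car/12345_S.jpg"
print(thumb.replace('_S', ''))  # /newfront/images/car/12345.jpg
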
def putCategory():
    test_url = "http://www.bokjiro.go.kr/welInfo/retrieveWelInfoBoxList.do?searchIntClId=01&pageUnit=10&pageIndex=1"
    resp = requests.get(test_url)
    html = BeautifulSoup(resp.content, 'html.parser')
    lis = html.find('div', {'class': 'catBoxIn'}).findAll('a')
    request_data = {'categories': []}
    # Build the request body from each category link and send it to the API.
    for li in lis:
        categoryNum = li.get('href')[29:-7]
        name = li.get('title')
        request_data['categories'].append({'id': categoryNum, 'name': name})
        categoryCode.append(categoryNum)
    requests.post(API_URL + 'crawling/category/', data=json.dumps(request_data), headers=headers)

def __fetch_emoji_list(self) -> List[Emoji]:
    data = requests.get(
        'https://unicode.org/emoji/charts-14.0/full-emoji-list.html',
        timeout=120)  # type: requests.Response
    html = BeautifulSoup(data.text, 'lxml')
    emojis = []
    for row in html.find('table').find_all('tr'):
        if not row.th:
            emoji = row.find('td', {'class': 'chars'}).string
            description = row.find('td', {'class': 'name'}).string.replace('⊛ ', '')
            emojis.append(Emoji(emoji, description))
    return emojis

def plot(url):
    a = requests.get(url)
    html = a.text
    c = html.find('<h2><span class="mw-headline" id="Plot">')
    if c != -1:
        temp = html[c + 4:]
        c = temp.find('<h2>')
        temp = temp[:c]
        c = temp.find('<p>')
        temp = temp[c:]
        plot = temp
    else:
        plot = 'not found'
    s = plot
    s = re.sub(r'<(.|\n)*?>', '', s)
    replaced = re.sub(r'[\d]', '', s)
    return replaced

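# A minimal usage sketch for plot(), assuming a Wikipedia-style article whose
# "Plot" section heading matches the searched markup (URL is illustrative):
#   summary = plot("https://en.wikipedia.org/wiki/Inception")
#   print(summary[:200])
# Note that the final re.sub strips all digits as well as tags.
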
def _pastePreFilter(self, html: str, internal: bool) -> str:
    # https://anki.tenderapp.com/discussions/ankidesktop/39543-anki-is-replacing-the-character-by-when-i-exit-the-html-edit-mode-ctrlshiftx
    if html.find(">") < 0:
        return html
    with warnings.catch_warnings() as w:
        warnings.simplefilter("ignore", UserWarning)
        doc = BeautifulSoup(html, "html.parser")
    tag: bs4.element.Tag
    if not internal:
        for tag in self.removeTags:
            for node in doc(tag):
                node.decompose()
        # convert p tags to divs
        for node in doc("p"):
            node.name = "div"
    for tag in doc("img"):
        try:
            src = tag["src"]
        except KeyError:
            # for some bizarre reason, mnemosyne removes src elements
            # from missing media
            continue
        # in internal pastes, rewrite mediasrv references to relative
        if internal:
            m = re.match(r"http://127.0.0.1:\d+/(.*)$", src)
            if m:
                tag["src"] = m.group(1)
        else:
            # in external pastes, download remote media
            if self.isURL(src):
                fname = self._retrieveURL(src)
                if fname:
                    tag["src"] = fname
            elif src.startswith("data:image/"):
                # and convert inlined data
                tag["src"] = self.inlinedImageToFilename(src)
    html = str(doc)
    return html

def fetch_emoji_list(self: 'EmojiExtractor') -> List[Emoji]:
    print('Downloading list of all emojis')
    data = requests.get(
        'https://unicode.org/emoji/charts-13.0/full-emoji-list.html',
        timeout=120)  # type: requests.Response
    html = BeautifulSoup(data.content, 'lxml')
    emojis = []
    for row in html.find('table').find_all('tr'):
        if row.th:
            continue
        emoji = row.find('td', {'class': 'chars'}).string
        description = row.find('td', {'class': 'name'}).string.replace('⊛ ', '')
        emojis.append(Emoji(emoji, description))
    return emojis

def parse_html_value(html):
    return html[html.find('>') + 1:html.rfind('<')]

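# Example: parse_html_value() returns the text between the first '>' and the
# last '<', e.g. for a simple single-element snippet:
print(parse_html_value('<span>hello</span>'))  # hello
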