예제 #1
0
def get_car_vin(html):
    """Return the VIN text from a car detail page, or None when absent.

    ``html`` is a parsed page whose ``find(selector)`` returns a list of
    matched elements (requests-html style) — assumed; confirm with caller.
    """
    if not html:
        return None
    # Long CSS path to the <strong> holding the VIN in the details sidebar.
    selector = ("#body > section.con_top.gray-bg_fin > div:nth-child(1) > "
                "div.row.mt_5 > div.col-md-4.car-details-sidebar > div > ul > "
                "li:nth-child(2) > strong")
    try:
        # Query once; the original ran the same find() twice (length check
        # and value read).
        items = html.find(selector)
        if items:
            return str(items[0].text)
    except Exception as e:
        print('Can\'t get VIN number. Reason %s.' % e)
예제 #2
0
def get_car_title(html):
    """Return the cleaned, English-translated car title, or "" on failure.

    Relies on the project helpers ``rm_new_line``, ``ko_translate`` and
    ``clear_car_name`` defined elsewhere in this module.
    """
    if html:
        try:
            # Single query; the original called find() twice for the same
            # selector.
            items = html.find("h2.tit_style2")
            if items:
                return clear_car_name(ko_translate(rm_new_line(str(items[0].text)), "en"))
        except Exception as e:
            print('Can\'t get title. Reason %s.' % e)
            return ""
예제 #3
0
def get_car_mark(html):
    """Return the car make/brand translated to English, or "" on failure."""
    if html:
        # CSS path to the inspection-table cell holding the brand name.
        selector = ("#body > section.con_top.gray-bg_fin > "
                    "div.container-fluid.wide_area.mt_1.car_view_check_area > "
                    "div > div > div > table > tbody > tr:nth-child(7) > "
                    "td:nth-child(2)")
        try:
            # Single query instead of the original's duplicate find() calls.
            items = html.find(selector)
            if items:
                return ko_translate(rm_new_line(str(items[0].text)), "en")
        except Exception as e:
            print('Can\'t get car mark. Reason %s.' % e)
            return ""
예제 #4
0
def get_car_price(html):
    """Return the price converted from 10k-KRW units to USD, or "" on failure.

    Uses the module-level ``config['SELLCAR']['USD']`` exchange rate.
    """
    if html:
        try:
            items = html.find("strong.i_comm_main_txt2")
            if items:
                # Site shows the price in units of 10,000 KRW with comma
                # separators, e.g. "1,234" -> 12,340,000 KRW.  A plain
                # str.replace() beats the original re.sub with the invalid
                # "\," escape.
                price = int(items[0].text.replace(",", ""))
                return int(price * 10000 / int(config['SELLCAR']['USD']))
        except Exception as e:
            print('Can\'t get car price. Reason %s.' % e)
            return ""
예제 #5
0
파일: parser.py 프로젝트: yagitag/amigos
def getRelatedVideos(html):
  """Yield video ids parsed from the related-videos section of raw HTML.

  ``html`` is page source as a plain string.  Depends on a module-level
  ``search(text, prefix, suffix, start)`` helper defined elsewhere —
  presumably it returns the (start, end) span between *prefix* and
  *suffix*; TODO confirm its exact contract.
  """
  end = 0
  while True:
    # Each related-video entry is marked by this CSS class pair.
    start = html.find('video-list-item related-list-item', end)
    if start == -1: break
    # Narrow to the entry link's href="..." attribute value.
    (start, end) = search(html, 'href="', '"', start)
    # The video id follows the "v=" query parameter; str.index raises
    # ValueError if "v=" is missing from the href.
    start = html.index('v=', start) + len('v=')
    # Trim at "&" so trailing query parameters are excluded from the id.
    amp_idx = html.find('&', start, end)
    if amp_idx != -1: end = amp_idx
    yield html[start:end]
예제 #6
0
    def _get_detailed_page(self, url):
        """Fetch and parse one event detail page; return a JSON string.

        The site's pages are irregular, so GET and parsing are combined
        here.  Returns a JSON array of per-event price details, or None
        when the page cannot be processed.
        """
        # Event id is the last path component of the URL.
        eid = url.split('/')[-1]

        text = self.get_page_by_GET(url)
        html = bs4.BeautifulSoup(text, 'lxml')
        info = html.find('', {'class': 'info'})
        #  title = info.find('', {'class': 'title'}).string.strip()
        address = info.find('', {'class': 'address'}).string.strip()
        buy_button = html.find('', {'class': 'view_sigpnup'}).a
        buy_href = buy_button.attrs['href']
        buy_type = buy_button.string.strip()

        _LOGGER.debug(f'[_get_detailed_page] url: {url}')
        _LOGGER.debug(
            f'[_get_detailed_page] address: {address}, buy_href: {buy_href}, buy_type: {buy_type}, eid: {eid}'
        )
        try:
            urls_and_events = self.__various_get_urls_and_events(
                address, buy_href, buy_type, html, eid)
        except Exception as e:
            _LOGGER.error(e)
            return None

        if urls_and_events is None:
            return None
        urls, events = urls_and_events

        # Fetch the building-info (or price-info) pages concurrently.
        building_pages = utils.async_get_pages_by_GET(urls)

        # Parse each page into price infos (locals renamed from the
        # original misspelled "building_page_lsit"/"page_lsit").
        pages = [
            self.__various_parse_for_price_infos(address, buy_type, page, eid,
                                                 events[i]['event_id'])
            for i, page in enumerate(building_pages)
        ]

        detailed_infos = []
        for i, event in enumerate(events):
            date = event['date']
            time = event['time']
            page = pages[i]
            prices = self.__various_get_prices(address, buy_type, page)
            # Skip events with no price data at all.
            if not prices:
                continue
            [in_sale_prices, sold_out_prices] = prices
            detailed_infos.append({
                'date': date,
                'time': time,
                'in_sale_prices': in_sale_prices,
                'sold_out_prices': sold_out_prices
            })
        return json.dumps(detailed_infos)
예제 #7
0
def get_car_registration(html):
    """Return the first-registration date as "YYYY/MM/DD", or "" on failure."""
    if html:
        try:
            # Query once; the original ran the same find() twice.
            items = html.find("strong.text-right")
            # The registration date lives in the third
            # <strong class="text-right"> element.
            if len(items) > 2:
                res = re.findall(r"[0-9]+", rm_new_line(str(items[2].text)))
                if res:
                    # Prefer the second digit run when present (the first is
                    # typically an unrelated number in the same cell).
                    raw = res[1] if len(res) > 1 else res[0]
                    return str(raw[0:4] + "/" + raw[4:6] + "/" + raw[6:8])
        except Exception as e:
            print('Can\'t get car registration. Reason %s.' % e)
            return ""
예제 #8
0
def get_lot_id(html):
    """Return the auction lot id found in square brackets in the title, or "" on failure."""
    if html:
        try:
            items = html.find("h2.tit_style2")
            if items:
                # The lot id appears as "[12345]" inside the translated
                # title.  Capture the digits directly instead of the
                # original's two extra re.sub passes stripping brackets;
                # raw strings fix the non-raw "\[" escapes and the
                # redundant "{1}" quantifiers.
                match = re.search(r"\[(\d+)\]",
                                  ko_translate(rm_new_line(str(items[0].text)), "en"))
                if match:
                    return match.group(1)
        except Exception as e:
            # Original message said "title" — corrected to name this job.
            print('Can\'t get lot id. Reason %s.' % e)
            return ""
예제 #9
0
    def get_token(self, html):
        """Extract the javascript-token hidden-input value from *html*.

        Shouldn't be called externally other than in testing; callers
        store the result for reuse.  Returns "" when the marker is absent
        (the original added a hard-coded 63 to find()'s -1 and sliced
        from a bogus index).
        """
        js_token_start = '<input id="javascript-token" name="token" ' + \
        'type="hidden" value="'
        marker_index = html.find(js_token_start)
        if marker_index == -1:
            return ''
        # len(js_token_start) replaces the magic constant 63, which would
        # silently break if the marker text ever changed.
        js_token_startindex = marker_index + len(js_token_start)
        # Search from startindex (not +1) so an empty token is handled.
        js_token_endindex = html.find('"', js_token_startindex)
        js_token = html[js_token_startindex:js_token_endindex]
        self.logger.debug(js_token)

        return js_token
예제 #10
0
    def get_token(self, html):
        """Pull the javascript-token value out of the page markup.

        Not meant for external use (outside of tests); the result is kept
        by callers for reuse.
        """
        marker = ('<input id="javascript-token" name="token" '
                  'type="hidden" value="')
        # 63 == len(marker): jump past the marker to the token's first char.
        start = html.find(marker) + 63
        end = html.find('"', start + 1)
        token = html[start:end]
        self.logger.debug(token)

        return token
예제 #11
0
 def scrape_examples(self, page):
     """
     Return a list of examples.

     *page* is raw HTML.  Examples are delimited by the module-level
     TAG_PHRASE_DELIMITER (open, close) pair — presumably HTML markers,
     TODO confirm.  Utils.text_between/remove_tags/remove_spaces are
     project helpers defined elsewhere.
     """
     examples = []
     html = page
     index = html.find(TAG_PHRASE_DELIMITER[0])
     while index > -1:
         # Take the text inside the next delimiter pair, strip markup and
         # normalise whitespace, then collect it.
         example_html = Utils.text_between(html, *TAG_PHRASE_DELIMITER, force_html=True)
         examples += [Utils.remove_spaces(Utils.remove_tags(example_html))]
         # Advance past the opening delimiter and look for the next one.
         html = html[index+len(TAG_PHRASE_DELIMITER[0]):]
         index = html.find(TAG_PHRASE_DELIMITER[0])
     return examples
예제 #12
0
def get_car_id(html):
    """Scan listing tiles for car ids and append each to car_id.txt.

    The id is the "LETTERS+DIGITS" token inside the anchor's onclick
    attribute.  Uses the project helper ``write_car_id``.
    """
    if html:
        try:
            items = html.find("div.car-title")
            for item in items:
                # Query the anchor once — the original re-ran find("a")
                # three times per tile and left debug print()s in place.
                links = item.find("a")
                if links:
                    car_id = re.search(r'[A-Z]+[0-9]+', links[0].attrs['onclick'])
                    if car_id:
                        write_car_id("car_id.txt", car_id.group() + "\n")
        except Exception as e:
            print('Can\'t get car id. Reason %s.' % e)
예제 #13
0
def get_car_year(html):
    """Return the 4-digit model year as an int; "" on error, None when absent."""
    if html:
        try:
            # Query once; the original ran the same find() twice.
            items = html.find("strong.text-right")
            # The year is in the third <strong class="text-right"> element.
            if len(items) > 2:
                res = re.search(r"[0-9]+", str(items[2].text))
                if res:
                    year = res.group()
                    # Some listings fuse extra digits onto the year; keep
                    # the first four either way (the original's two
                    # branches both reduced to this).
                    return int(year[0:4])
        except Exception as e:
            print('Can\'t get car year. Reason %s.' % e)
            return ""
예제 #14
0
    async def lyrics(self, ctx, *, song: str = None):
        """
        Get lyrics for a song.
        Song must be in the format of '{artist} {song}'.
        Currently this isn't very accurate (direct URL call, not a search) and will likely give a 404.
        """
        # Nothing to look up.
        if song is None or len(song) == 0:
            return

        # Genius lyric pages follow the "<artist>-<song>-lyrics" slug scheme.
        song_url = "https://genius.com/{}-lyrics".format(
            song.replace(" ", "-").lower())

        page = requests.get(song_url)
        html = BeautifulSoup(page.text, "html.parser")
        # NOTE(review): find() returns None when the page has no div.lyrics
        # (e.g. on a 404 page), which raises AttributeError here — confirm
        # whether callers rely on that propagating.
        song_lyrics = html.find("div", class_="lyrics").get_text()

        # Long lyrics are sent in 1024-character chunks — presumably to
        # respect the chat platform's message-length limit.
        if len(song_lyrics) <= 1024:
            await ctx.send(song_lyrics)
        else:
            chunks = [
                song_lyrics[i:i + 1024]
                for i in range(0, len(song_lyrics), 1024)
            ]
            for chunk in chunks:
                await ctx.send(chunk)
예제 #15
0
파일: editor.py 프로젝트: yinminshcn/anki
 def _onHtmlEdit(self, field):
     """Open a modal plain-text dialog for editing a note field's raw HTML.

     *field* indexes self.note.fields.  On close, text containing markup
     is round-tripped through BeautifulSoup to drop stray fragments, then
     written back and the note reloaded.
     """
     d = QDialog(self.widget, Qt.Window)
     form = aqt.forms.edithtml.Ui_Dialog()
     form.setupUi(d)
     restoreGeom(d, "htmlEditor")
     qconnect(form.buttonBox.helpRequested,
              lambda: openHelp("editing?id=features"))
     form.textEdit.setPlainText(self.note.fields[field])
     d.show()
     form.textEdit.moveCursor(QTextCursor.End)
     # Block until the dialog is dismissed, then read the edited text back.
     d.exec_()
     html = form.textEdit.toPlainText()
     if html.find(">") > -1:
         # filter html through beautifulsoup so we can strip out things like a
         # leading </div>
         html_escaped = self.mw.col.media.escape_media_filenames(html)
         with warnings.catch_warnings():
             warnings.simplefilter("ignore", UserWarning)
             html_escaped = str(BeautifulSoup(html_escaped, "html.parser"))
             html = self.mw.col.media.escape_media_filenames(html_escaped,
                                                             unescape=True)
     self.note.fields[field] = html
     if not self.addMode:
         self.note.flush()
     self.loadNote(focusTo=field)
     saveGeom(d, "htmlEditor")
예제 #16
0
파일: editor.py 프로젝트: rye761/anki
 def _onHtmlEdit(self, field: int) -> None:
     """Open a modal plain-text dialog for editing a note field's raw HTML.

     *field* indexes self.note.fields.  On close, text containing markup
     is round-tripped through BeautifulSoup to drop stray fragments, then
     saved and the note reloaded.
     """
     d = QDialog(self.widget, Qt.WindowType.Window)
     form = aqt.forms.edithtml.Ui_Dialog()
     form.setupUi(d)
     restoreGeom(d, "htmlEditor")
     disable_help_button(d)
     qconnect(form.buttonBox.helpRequested,
              lambda: openHelp(HelpPage.EDITING_FEATURES))
     # Monospace font makes the raw HTML easier to read.
     font = QFont("Courier")
     font.setStyleHint(QFont.StyleHint.TypeWriter)
     form.textEdit.setFont(font)
     form.textEdit.setPlainText(self.note.fields[field])
     d.show()
     form.textEdit.moveCursor(QTextCursor.MoveOperation.End)
     # Block until the dialog is dismissed, then read the edited text back.
     d.exec()
     html = form.textEdit.toPlainText()
     if html.find(">") > -1:
         # filter html through beautifulsoup so we can strip out things like a
         # leading </div>
         html_escaped = self.mw.col.media.escape_media_filenames(html)
         with warnings.catch_warnings():
             warnings.simplefilter("ignore", UserWarning)
             html_escaped = str(BeautifulSoup(html_escaped, "html.parser"))
             html = self.mw.col.media.escape_media_filenames(html_escaped,
                                                             unescape=True)
     self.note.fields[field] = html
     if not self.addMode:
         self._save_current_note()
     self.loadNote(focusTo=field)
     saveGeom(d, "htmlEditor")
예제 #17
0
def get_distance_driven(html):
    """Return the driven distance as a plain digit string ("" on failure).

    Reads the sixth <strong class="text-right"> element and drops every
    non-digit character (units, thousands separators).
    """
    if html:
        try:
            raw = rm_new_line(str(html.find("strong.text-right")[5].text))
            digits = re.findall("[0-9]+", raw)
            return "".join(digits)
        except Exception as e:
            print('Can\'t get driven distance. Reason %s.' % e)
            return ""
예제 #18
0
def get_car_estimate(html):
    """Return the auction grade letters (e.g. "A") from the estimate table, or None."""
    if html:
        # Table cell holding the condition/estimate grade.
        selector = ("#body > section.con_top.gray-bg_fin > div:nth-child(2) > "
                    "div > div > div > table > tbody > tr:nth-child(1) > td")
        try:
            item = html.find(selector)[0].text
            if item:
                return "".join(re.findall("[A-Z]", str(item)))
        except Exception as e:
            # Original used two %s placeholders with a single argument,
            # which raised TypeError inside the handler itself.
            print('Can\'t get car estimate. Reason: %s' % e)
예제 #19
0
def get_car_displacement(html):
    """Return engine displacement as a digits-only string ("" on failure).

    Reads the seventh <strong class="text-right"> element.
    """
    if html:
        try:
            text = rm_new_line(html.find("strong.text-right")[6].text)
            return "".join(re.findall("[0-9]+", text))
        except Exception as e:
            print('Can\'t get car displacement. Reason %s.' % e)
            return ""
예제 #20
0
def get_transmission(html):
    """Return the transmission type in Russian ("" on failure).

    The ninth <strong class="text-right"> holds the Korean label;
    anything other than automatic is reported as manual.
    """
    if html:
        try:
            label = rm_new_line(str(html.find("strong.text-right")[8].text))
            return "Автомат" if label == "오토" else "Механика"
        except Exception as e:
            print('Can\'t get transmission. Reason %s.' % e)
            return ""
예제 #21
0
파일: dicio.py 프로젝트: renatoviolin/dicio
 def scrape_synonyms(self, page):
     """
     Return list of synonyms.

     *page* is raw HTML.  TAG_SYNONYMS bounds the synonyms section and
     TAG_SYNONYMS_DELIMITER separates entries (module-level constants
     defined elsewhere); self.first_synonym returns one synonym plus the
     remaining HTML to scan.
     """
     synonyms = []
     if page.find(TAG_SYNONYMS[0]) > -1:
         html = Utils.text_between(page, *TAG_SYNONYMS, force_html=True)
         # Consume one synonym per iteration until no delimiter remains.
         while html.find(TAG_SYNONYMS_DELIMITER[0]) > -1:
             synonym, html = self.first_synonym(html)
             synonyms.append(synonym)
     return synonyms
예제 #22
0
def get_fuel(html):
    """Return the fuel type translated to Russian; "" on error, None when absent."""
    if html:
        # Korean site label -> Russian display value.
        fuel_data = {
            "가솔린":"Бензин",
            "휘발유":"Бензин",
            "경유":"Дизель",
            "디젤":"Дизель",
            "LPG":"LPG",
            "하이브리드":"Гибрид",
            "LPI하이브리드":"LPG гибрид",
            "가솔린하이브리드":"Бензиновый гибрид",
            "디젤하이브리드": "Дизельный гибрид",
            "전기":"Электрокар",
            "가솔린/LPG":"Бензин/LPG",
            "겸용":"Комбинированное использование"
        }
        try:
            items = html.find("strong.text-right")
            # The fuel label is the fifth element (index 4); the original
            # guard tested len > 3 — off by one for the [4] access below.
            if len(items) > 4:
                return fuel_data[rm_new_line(str(items[4].text))]
        except Exception as e:
            print('Can\'t get fuel. Reason %s.' % e)
            return ""
예제 #23
0
def extract_from_html(html: BeautifulSoup) -> List[Emoji]:
    """Collect Emoji(char, description) records from the page's first table.

    Rows containing a <th> (headers) are skipped; the "chars" cell holds
    the emoji and the "name" cell its description, with the "new in this
    version" marker removed.
    """
    collected = []
    table = html.find('table')
    for row in table.find_all('tr'):
        if row.th:
            continue
        char_cell = row.find('td', {'class': 'chars'})
        name_cell = row.find('td', {'class': 'name'})
        collected.append(
            Emoji(char_cell.string, name_cell.string.replace('⊛ ', '')))

    return collected
예제 #24
0
def get_car_color(html):
    """Return the body colour translated to Russian; "" on error, None when absent."""
    if html:
        # Korean site label -> Russian display value.
        color_data = {
            "흰색":"Белый",
            "은색":"Серебро",
            "검정":"Черный",
            "기타":"Так далее",
            "빨간":"Красный",
            "보라색":"Фиолетовый",
            "주황색":"Оранжевый",
            "초록":"Зеленый",
            "회색":"Серый",
            "금":"Золото",
            "푸른":"Голубой",
            "베이지":"Бежевый",
            "빨강":"Красный"
        }
        try:
            # Single query; the original ran the same find() twice.
            items = html.find("strong.text-right")
            # Colour is the tenth element (index 9).
            if len(items) > 9:
                # Drop everything after the first whitespace (trailing
                # qualifiers) before the lookup; the raw string fixes the
                # original's invalid "\s" escape.
                label = re.sub(r"\s.+", "", rm_new_line(str(items[9].text)))
                return color_data[label]
        except Exception as e:
            print('Can\'t get car color. Reason %s.' % e)
            return ""
예제 #25
0
def get_car_description(html, link):
    """Assemble the HTML description block for a car listing.

    Best-effort concatenation of optional parts: a VR-360 viewer iframe
    keyed by the id at the end of *link*, the performance-evaluation
    table, the options table and list, and the image gallery from
    get_car_img().  Each part is skipped (with a console message) if its
    source is missing.  Returns the result with newlines stripped.
    """
    description = ""
    if html:
        try:
            # Receive-code for the VR viewer: trailing capitals/digits of the URL.
            vr = re.search("[A-Z0-9]+$", link)
            if vr:
                vr = vr.group()
        except Exception as e:
            print('Can\'t get car ID for VR 360 view auto. Reason %s.' % e)
        try:
            # NOTE(review): if the search above matched nothing, vr is None
            # here and the iframe URL embeds the literal text "None".
            description += '<div class="timeline-heading"><h3>VR360 обзор авто</h3></div>'
            description += '<iframe frameborder="0" height="600" id="ovulyaciya" scrolling="no" src="http://www.sellcarauction.co.kr/newfront/receive/rc/receive_rc_view_vr.do?isLandscapeOpen=Y&amp;isBrowserOpen=Y&amp;receivecd=%s" width="900"></iframe>' % vr
        except Exception as e:
            print('Can\'t get car description "VR 360 view auto". Reason %s.' % e)
        try:
            description += '<div class="timeline-heading"><h3>Информация об оценке производительности</h3></div>'
            # text_len presumably truncates/cleans the text for the "ru"
            # locale — TODO confirm against its definition.
            description += text_len(rm_new_line(str(html.find("#body > section.con_top.gray-bg_fin > div:nth-child(2) > div > div > div > table")[0].html)), "ru")
        except Exception as e:
            print('Can\'t get car description "Performance Evaluation Information". Reason %s.' % e)
        try:
            description += '<div class="timeline-heading"><h3>Информация о вариантах</h3></div>'
            description += text_len(rm_new_line(str(html.find("#body > section.con_top.gray-bg_fin > div:nth-child(3) > div > div > table:nth-child(2)")[0].html)), "ru")
        except Exception as e:
            print('Can\'t get car description "Option information". Reason %s.' % e)
        try:
            description += text_len(rm_new_line(str(html.find("#body > section.con_top.gray-bg_fin > div:nth-child(3) > div > div > ul")[0].html)), "ru")
        except Exception as e:
            print('Can\'t get car descritpion comm list. Reason %s.' % e)
        description += rm_new_line(get_car_img(html))
        # try:
        #     description += '<div class="timeline-heading"><h3>Протокол осмотра</h3></div>'
        #     text = text_len(rm_new_line(str(soup.select("#body > section.con_top.gray-bg_fin > div.container-fluid.wide_area.mt_1.car_view_check_area > div > div > div > table")[0])), "ru")
        #     text = re.sub("\/newfront\/images\/","http://www.sellcarauction.co.kr/newfront/images/",text)
        #     description += text
        # except Exception as e:
        #     print('Can\'t get car description "Inspection protocol". Reason %s.' % e)
        return rm_new_line(description)
예제 #26
0
def get_car_type(html):
    """Return the body type translated to Russian; "" on error, None when absent."""
    if html:
        # Korean capacity label -> Russian body-type name; commented
        # entries are body types with no Korean mapping yet.
        car_type_data = {
            "승합 (6인승)":"Универсал",
            "승용 (7인승)":"Универсал",
            "승용 (11인승)":"Фургон",
            "승합 (3인승)":"Фургон",
            "화물 (3인승)":"Фургон",
            "승합 (15인승)":"Фургон",
            "승합 (11인승)":"Фургон",
            "승합 (12인승)":"Фура",
            # "Трактор",
            "승용 (5인승)":"Седан",
            "승합 (5인승)":"Седан",
            # "Родстер",
            # "Пикап",
            # "Мотоцикл",
            "승합 (25인승)":"Автобус",
            "승합 (9인승)":"Минивен",
            "승용 (9인승)":"Минивен",
            "승용 (6인승)":"Минивен",
            "화물 (6인승)":"Минивен",
            "승용 (4인승)":"Хэтчбек",
            # "Кроссовер",
            "승합 (2인승)":"Купе",
            "화물 (2인승)":"Купе",
            "승용 (2인승)":"Купе",
            "SUV픽업 (5인승)":"Внедорожник пикап"
            # "Кабриолет",
            # "Багги"
        }
        try:
            # Single query; the original ran the same find() twice.
            items = html.find("strong.text-right")
            # Body type is the eleventh element (index 10).
            if len(items) > 10:
                return car_type_data[rm_new_line(str(items[10].text))]
        except Exception as e:
            print('Can\'t get car type. Reason %s.' % e)
            return ""
예제 #27
0
def get_img_src(html):
    """Return full-size image URLs for every thumbnail on the page ("" on failure).

    Thumbnails end in "_S.<ext>"; the full-size URL is the same path
    without the "_S" suffix.
    """
    if html:
        try:
            images = []
            for item in html.find("img.img-fluid"):
                src = item.attrs.get("src")
                if src is None:
                    continue
                # The original pattern put the alternatives inside [...],
                # which is a character class, not alternation — it matched
                # any run of those letters (and '|').  Use a real group.
                if re.search(r'_S\.(?:jpeg|jpg|JPG|JPEG|gif|png)$', src):
                    images.append(src.replace('_S', ''))
            return images
        except Exception as e:
            print('Can\'t get img src. Reason %s.' % e)
            return ""
예제 #28
0
def putCategory():
    """Scrape welfare-info categories from bokjiro.go.kr and POST them to the API.

    Also appends each category id to the module-level ``categoryCode``
    list as a side effect.  Uses the module-level ``API_URL`` and
    ``headers``.
    """
    test_url = "http://www.bokjiro.go.kr/welInfo/retrieveWelInfoBoxList.do?searchIntClId=01&pageUnit=10&pageIndex=1"
    resp = requests.get(test_url)
    html = BeautifulSoup(resp.content, 'html.parser')
    lis = html.find('div', {'class': 'catBoxIn'}).findAll('a')

    request_data = {'categories': []}
    # Build the payload from each category link and send it in one request.
    for li in lis:
        # The category number is embedded in the href; the [29:-7] slice
        # presumably strips a fixed URL prefix/suffix — confirm against
        # the live markup.
        categoryNum = li.get('href')[29:-7]
        name = li.get('title')
        request_data['categories'].append({'id': categoryNum, 'name': name})
        categoryCode.append(categoryNum)

    requests.post(API_URL + 'crawling/category/',
                  data=json.dumps(request_data),
                  headers=headers)
예제 #29
0
    def __fetch_emoji_list(self) -> List[Emoji]:
        """Download the Unicode 14.0 full emoji chart and parse it into Emoji records."""
        response = requests.get(
            'https://unicode.org/emoji/charts-14.0/full-emoji-list.html',
            timeout=120)  # type: requests.Response

        soup = BeautifulSoup(response.text, 'lxml')

        result = []
        for row in soup.find('table').find_all('tr'):
            # Header rows carry a <th>; data rows do not.
            if row.th:
                continue
            chars = row.find('td', {'class': 'chars'}).string
            name = row.find('td', {
                'class': 'name'
            }).string.replace('⊛ ', '')
            result.append(Emoji(chars, name))

        return result
예제 #30
0
def plot(url):

    a = requests.get(url)
    html = a.text
    c = html.find('<h2><span class="mw-headline" id="Plot">')
    if (c != -1):
        temp = html[c + 4:]
        c = temp.find('<h2>')
        temp = temp[:c]
        c = temp.find('<p>')
        temp = temp[c:]
        plot = temp
    else:
        plot = 'not found'
    s = plot
    s = re.sub(r'<(.|\n)*?>', '', s)
    replaced = re.sub(r'&#91;\d&#93;', '', s)
    return replaced
예제 #31
0
파일: editor.py 프로젝트: yinminshcn/anki
    def _pastePreFilter(self, html: str, internal: bool) -> str:
        """Clean up pasted HTML before inserting it into a field.

        Internal pastes (from within the app) keep their markup but have
        mediasrv URLs rewritten to relative paths; external pastes are
        stripped of unwanted tags and have remote/inline images turned
        into local files.  Plain text (no '>') passes through untouched.
        """
        # https://anki.tenderapp.com/discussions/ankidesktop/39543-anki-is-replacing-the-character-by-when-i-exit-the-html-edit-mode-ctrlshiftx
        if html.find(">") < 0:
            return html

        # NOTE(review): catch_warnings() only yields a list with
        # record=True, so "as w" is always None and unused here.
        with warnings.catch_warnings() as w:
            warnings.simplefilter("ignore", UserWarning)
            doc = BeautifulSoup(html, "html.parser")

        tag: bs4.element.Tag
        if not internal:
            # Remove disallowed tags entirely (configured on the editor).
            for tag in self.removeTags:
                for node in doc(tag):
                    node.decompose()

            # convert p tags to divs
            for node in doc("p"):
                node.name = "div"

        for tag in doc("img"):
            try:
                src = tag["src"]
            except KeyError:
                # for some bizarre reason, mnemosyne removes src elements
                # from missing media
                continue

            # in internal pastes, rewrite mediasrv references to relative
            if internal:
                m = re.match(r"http://127.0.0.1:\d+/(.*)$", src)
                if m:
                    tag["src"] = m.group(1)
            else:
                # in external pastes, download remote media
                if self.isURL(src):
                    fname = self._retrieveURL(src)
                    if fname:
                        tag["src"] = fname
                elif src.startswith("data:image/"):
                    # and convert inlined data
                    tag["src"] = self.inlinedImageToFilename(src)

        html = str(doc)
        return html
예제 #32
0
    def fetch_emoji_list(self: 'EmojiExtractor') -> List[Emoji]:
        """Download the Unicode 13.0 full emoji chart and parse it into Emoji records."""
        print('Downloading list of all emojis')

        response = requests.get(
            'https://unicode.org/emoji/charts-13.0/full-emoji-list.html',
            timeout=120
        )  # type: requests.Response

        soup = BeautifulSoup(response.content, 'lxml')

        collected = []
        table_rows = soup.find('table').find_all('tr')
        for row in table_rows:
            # Header rows contain a <th>; only data rows are parsed.
            if not row.th:
                chars = row.find('td', {'class': 'chars'}).string
                name = row.find('td', {'class': 'name'}).string.replace('⊛ ', '')
                collected.append(Emoji(chars, name))

        return collected
예제 #33
0
파일: utils.py 프로젝트: chengjun/Research
def parse_html_value(html):
    """Return the text between the first '>' and the last '<' of *html*.

    Typical input is a single element like '<a ...>text</a>'.  No guard
    is made for inputs lacking either bracket, matching the original.
    """
    open_end = html.find('>') + 1
    close_start = html.rfind('<')
    return html[open_end:close_start]