Example #1
class ParserBS(AbstractParser):
    """
    The custom parser over BeautifulSoup
    """
    def __init__(self, html_raw: str, parser_bs_type: str = "html.parser"):
        self.html_parsed = BeautifulSoup(html_raw, parser_bs_type)

    @property
    def html_raw(self) -> str:
        return self.html_parsed.__str__()

    @cached_property
    def title(self) -> str:
        title = self.html_parsed.find("title")
        return title and title.text or ""

    @cached_property
    def anchor_nodes(self) -> Iterable[ResultSet]:
        return self.html_parsed.find_all("a", attrs={"href": True})

    def get_related_anchors_href(self) -> Iterable[str]:
        collection: Set[str] = set()

        for node in self.anchor_nodes:
            href: str = node.attrs.get("href")
            if not ParserBS._is_href_url_related(href):
                continue
            collection.add(href)

        return collection

    def __repr__(self):
        return self.html_parsed.__repr__()
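
The excerpt calls ParserBS._is_href_url_related(), which is not shown. A minimal sketch of what that helper might look like inside the class, assuming "related" means a relative or same-site link (both the interpretation and the skip-list are assumptions, not the original implementation):

    @staticmethod
    def _is_href_url_related(href: str) -> bool:
        # Hypothetical helper: treat scheme-less, host-less hrefs (e.g. "/docs/a.html")
        # as links related to the current site; skip fragments and non-HTTP schemes.
        from urllib.parse import urlparse
        if not href or href.startswith(("#", "mailto:", "javascript:")):
            return False
        parsed = urlparse(href)
        return not parsed.scheme and not parsed.netloc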
Example #2
def catFromContrib(username):
    url = r'https://bn.wikipedia.org/w/index.php?title=%E0%A6%AC%E0%A6%BF%E0%A6%B6%E0%A7%87%E0%A6%B7:%E0%A6%85%E0%A6%AC%E0%A6%A6%E0%A6%BE%E0%A6%A8/' + username + '&offset=&limit=500'
    response = requests.get(url)
    div = BeautifulSoup(unescape(response.text), "html.parser").find_all(
        "ul", {"class": "mw-contributions-list"})
    pagenamelist = re.findall(
        r'<a.*?class="mw-contributions-title".*?>(.*?)</a>', div.__str__())
    pagenamelist = [page for page in pagenamelist if isMainspace(page)]
    pagenamelist = list(dict.fromkeys(pagenamelist))
    catlist = []

    for page in pagenamelist:
        response = requests.get('https://bn.wikipedia.org/w/api.php',
                                params={
                                    'action': 'query',
                                    'format': 'json',
                                    'titles': page,
                                    'prop': 'revisions',
                                    'rvprop': 'content'
                                }).json()
        data = next(iter(
            response['query']['pages'].values()))['revisions'][0]['*']
        l = re.findall(r'\[\[বিষয়শ্রেণী:(.*?)\]\]', data)
        catlist = catlist + l
    catlist = list(dict.fromkeys(catlist))
    return catlist
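
isMainspace() is referenced but not included in the snippet. A plausible stand-in, assuming main-namespace titles are those without a "Namespace:" prefix (this heuristic is an assumption, not the original implementation):

def isMainspace(page):
    # Hypothetical helper: namespaced titles ("Talk:...", "Template:...") contain a colon,
    # plain article titles normally do not.
    return ':' not in page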
Example #3
def get_detail_data(url_detail):
    time.sleep(2)
    page = urlopen(url_detail)
    soup = BeautifulSoup(page, 'html.parser')

    company_info = soup.findAll("div", {"class": "detail_intro"}).__str__()

    mobile = has_inside(re.findall(r'Mobile.+"([0-9\s]+)"', company_info))
    fax = has_inside(re.findall(r'Fax.+"([0-9\s]+)"', company_info))

    services = list_of_services(
        soup.findAll("a", {"class": "servicesglossary"}))

    # type of surveyor extraction
    tos = soup.findAll("div", {"class": "rCol"})[0].__str__().replace('\n', '')
    buss_type = has_inside(re.findall(r'<h4>Business type<\/h4><p>(.+?)<',
                                      tos))

    tos = soup.__str__().replace('\n', '')
    type_of_srv = type_of_surveyor(
        has_inside(re.findall(r'<h4>Type of surveyor<\/h4><p>(.+?)<', tos)))

    # managers
    tos = soup.findAll("p")
    for t in tos:
        if 'Mr' in t.getText() or 'Mrs' in t.getText():
            mng_list = list(mng.rstrip().lstrip()
                            for mng in t.getText().split('•'))
            for _ in range(5 - len(mng_list)):
                mng_list.append('')
            break
        else:
            mng_list = []
    contact1, contact2, contact3, contact4, contact5 = mng_list[:5]
    return mobile, fax, services, buss_type, type_of_srv, contact1, contact2, contact3, contact4, contact5
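
has_inside(), list_of_services() and type_of_surveyor() are helpers defined elsewhere; judging from how has_inside() is applied to re.findall() results, it most likely just unwraps the first match. A hypothetical version:

def has_inside(matches):
    # Hypothetical helper: return the first regex match if the list is non-empty, else ''.
    return matches[0] if matches else ''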
Example #4
    def crawl_article_content(url, title):
        res = requests.get(url)  # fetch the article content
        bs_obj = BeautifulSoup(res.content, "html.parser")
        while bs_obj.find("img", id="seccodeImage") is not None:
            WechatArticleCrawler.headers["Cookie"] = input("输入新的Cookie")
            bs_obj = BeautifulSoup(res.content, "html.parser")
            return ""
        if res.status_code == 404:
            print(url)
            return ""
        js_content = bs_obj.find("div", id="js_content")
        if js_content is None:
            return ""
        # WechatArticleCrawler.save_file(bs_obj.__str__(), title + ".html")  # save the page source code
        parse_cnt = WechatArticleCrawler.parse_js_content(
            js_content)  # get the converted form

        # WechatArticleCrawler.save_file(parse_cnt, url[28:])
        cnt_file_name = url.replace("/", "").replace(":", "")
        WechatArticleCrawler.save_file(parse_cnt, cnt_file_name)
        abstract_cnt = js_content.get_text()[:51].replace("\n", "")  # extract the article abstract and save it
        # WechatArticleCrawler.save_file(abstract_cnt, url[28:] + "_abstract")
        WechatArticleCrawler.save_file(abstract_cnt,
                                       cnt_file_name + "_abstract")
        image_url = re.search(WechatArticleCrawler.pattern,
                              bs_obj.__str__()).group(1)
        return image_url
Example #5
def clean(raw):
    t = BeautifulSoup(raw, "lxml").find('article')
    t.find('h2').decompose()
    [x.decompose() for x in t.find_all('a')]
    [x.decompose() for x in t.find_all('div', style=lambda v: v)]
    [x.decompose() for x in t.find_all('div', {'class': 'spacer'})]
    [x.decompose() for x in t.find_all('nav')]
    [x.decompose() for x in t.find_all('div', {'id': 'comments'})]
    [x.decompose() for x in t.find_all('div', {'class': 'helpers'})]
    [x.decompose() for x in t.find_all('div', {'class': 'cat'})]
    [x.decompose() for x in t.find_all('div', {'class': 'com'})]
    t = t.__str__().replace('\n\n\n\n', '').replace('<p>',
                                                    '').replace('</p>', '')
    t = t.replace('</article>', '').replace('</div>', '').replace('\r', '')
    t = t.replace('<small>', '').replace('</small>', '')
    t = t.replace('<em>', '').replace('</em>', '')
    t = t.replace('<sup>', '').replace('</sup>', '')
    t = t.replace('<br style="clear:both;"/>', '')
    # special tokens
    t = t.replace('<article>', '[A]')
    t = t.replace('<div class="n">', '[N]')
    t = t.replace('<div class="b">', '[B]')
    t = t.replace('<div class="m1">', '[M1]')
    t = t.replace('<div class="m2">', '[M2]')
    t = t.replace('<div class="b2">', '[B2]')
    return t
Example #6
def removeHtml(string):
    result = BeautifulSoup(string).findAll(text=True)
    strResult = str(result.__str__())
    strResult = strResult.strip("[u'")
    strResult = strResult.strip("']")
    strResult = strResult.strip("\n")
    strResult = strResult.strip("u'")
    return strResult
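
For comparison, the same HTML-stripping result is usually obtained without string surgery by letting BeautifulSoup join the text nodes itself; a minimal alternative sketch:

from bs4 import BeautifulSoup

def remove_html_alt(string):
    # get_text() concatenates all text nodes, so no "[u'...']" trimming is needed.
    return BeautifulSoup(string, "html.parser").get_text().strip()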
Example #7
def get_content(url):
    r = requests.get(url).content
    str_content = r.decode('utf-8')
    soup = BeautifulSoup(str_content, 'html.parser')
    # content = soup.select('p')
    fp = open("contents.txt", "w", encoding='utf-8')
    # for c in content:
    fp.write(soup.__str__())
    fp.close()
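
A slightly tidier variant of the same routine (a sketch; the output file name and the UTF-8 assumption are unchanged):

import requests
from bs4 import BeautifulSoup

def get_content(url):
    # Let requests decode the body and let the context manager close the file.
    resp = requests.get(url)
    resp.encoding = 'utf-8'
    soup = BeautifulSoup(resp.text, 'html.parser')
    with open("contents.txt", "w", encoding='utf-8') as fp:
        fp.write(soup.__str__())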
Example #8
    def parse(self, code, filter=None):

        if filter is None:
            filter = BasicFilter()

        soup = BeautifulSoup(code, "html.parser")
        filter.clean(soup)
        # return soup.prettify(encoding=None);
        return soup.__str__()
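
BasicFilter is not shown in this excerpt. A hypothetical stand-in with the clean(soup) interface that parse() expects, assuming the filter's job is to drop script and style tags:

class BasicFilter:
    def clean(self, soup):
        # Hypothetical default behaviour: strip <script> and <style> tags in place.
        for tag in soup.find_all(["script", "style"]):
            tag.decompose()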
Example #9
def find_answer(name, keywords, path):
    logger.info("Start to find sentence.")
    f = open(path, 'r+', encoding='utf8')
    lines = f.read()
    soup = BeautifulSoup(lines)
    for s in soup('script'):
        s.extract()
    for s in soup('style'):
        s.extract()
    lines = soup.__str__()
    reg2 = re.compile('<[^>]*>')
    lines = reg2.sub('', lines)
    reg3 = re.compile('-->')
    lines = reg3.sub('', lines)
    reg4 = re.compile('&(\S)?gt')
    lines = reg4.sub('', lines)
    reg5 = re.compile('New!')
    lines = reg5.sub('', lines)
    reg6 = re.compile('  ')
    lines = reg6.sub('', lines)
    ans = lines.split("\n")
    content = []
    for sentence in ans:
        if not sentence == '':
            content.append(sentence)
        if sentence == '法律声明':
            break
    # print(content)

    num = 0
    sentence_num = 0
    aim_sentence = content[0]
    flag = 0
    for sentence in reversed(content):
        sentence_num += 1
        if sentence_num % 100 == 0:
            logger.info("Find {0:d} sentence.".format(sentence_num))
        if sentence == name:  # skip sentences identical to the node name
            flag = 1
            continue

        # if flag == 1:
        #     aim_sentence = sentence
        #     break

        match_key = jieba.analyse.extract_tags(sentence,
                                               topK=10,
                                               withWeight=True)  # extract keywords from the sentence
        match_key = [word for word in match_key
                     if word not in stoplist]  # drop stop words

        score = calculate_sentence_vector(keywords, match_key)  # compute once, reuse below
        if score > num:
            num = score
            aim_sentence = sentence
    logger.info("Sentence has already been found.")
    return aim_sentence
Example #10
    def process(self):
        self.cursor1.execute(
            """select page_id, text.old_text from page 
			join text on text.old_id = page.page_latest 
			where page_id = 18938265 OR page_id = 3732122"""
        )
        row1 = self.cursor1.fetchone()
        counter = 0
        parsed_content = creole2html(row1[1].decode("utf-8"))
        parsed_content = parsed_content.replace("&lt;", "<")
        parsed_content = parsed_content.replace("&gt;", ">")
        soup = BeautifulSoup(parsed_content)
        raw = nltk.clean_html(soup.__str__())
        row1 = self.cursor1.fetchone()
        parsed_content = creole2html(row1[1].decode("utf-8"))
        parsed_content = parsed_content.replace("&lt;", "<")
        parsed_content = parsed_content.replace("&gt;", ">")
        soup = BeautifulSoup(parsed_content)
        raw2 = nltk.clean_html(soup.__str__())
        # lda.print_lda([raw],[raw2])
        print lda.get_similarity(raw, raw2)
        counter += 1
        self.cursor1.close()
        self.db.close()
Example #11
def parse_article(article_content):
    article_soup = BeautifulSoup(article_content, 'html.parser')

    for table in article_soup.find_all('table', {'class': 'infobox'}):
        table.decompose()
    [
        a.decompose()
        for a in article_soup.find_all('a', {'class': 'mw-jump-link'})
    ]
    [a.decompose() for a in article_soup.find_all('a', {'class': 'image'})]
    article_soup.find('div', id='catlinks').decompose()

    html_test = os.path.join('/tmp', 'test.html')
    with open(html_test, 'w') as f:
        f.write(article_soup.__str__())
Example #12
def recent_stats(category: str, page_num: int) -> str:
    """
    1. DataCollection/TwitterStatsBatch/var/UraakaPickUp/recent.htmlからデータをロードする
    2. コンテンツを画面にフィットさせるためにJSが必要でそれを読み込むために、soupも必要
    """
    if isinstance(page_num, str):
        page_num = int(page_num)
    with open(
            f'{TOP_DIR}/DataCollection/TwitterStatsBatch/var/UraakaPickUp/recents/recent_{category}_50000_{page_num}.html'
    ) as fp:
        html = fp.read()
    soup = BeautifulSoup(html, "lxml")
    # print(BeautifulSoup(ResponsibleDevices.responsible_devices(), "lxml"))
    soup.find("body").insert(
        0, BeautifulSoup(ResponsibleDevices.responsible_devices(), "lxml"))
    return soup.__str__()
Example #13
def inCategory(categoryname):
    http = re.findall(r'https://', categoryname)
    if http:
        url = categoryname
    else:
        url = 'https://bn.wikipedia.org/wiki/' + categoryname
    response = requests.get(url)
    l = BeautifulSoup(unescape(response.text),
                      "html.parser").find_all("div", {"id": "mw-pages"})
    pages = re.findall('<li><a.*?>(.*?)</a></li>', l.__str__())
    next = [
        i.get("href") for i in BeautifulSoup(unescape(response.text),
                                             "html.parser").find_all("a")
        if i.text == "পরবর্তী পাতা"
    ]
    if next:
        pages = pages + inCategory('https://bn.wikipedia.org/' + next[0])
    return pages
Example #14
def load_player_stat(url):
    # PhantomJS files have different extensions
    # under different operating systems
    if platform.system() == 'Windows':
        PHANTOMJS_PATH = './phantomjs.exe'
    else:
        PHANTOMJS_PATH = './phantomjs'

    # here we'll use pseudo browser PhantomJS,
    # but browser can be replaced with browser = webdriver.FireFox(),
    # which is good for debugging.
    browser = webdriver.PhantomJS(PHANTOMJS_PATH)
    # browser.get('http://www.basketball-reference.com/leagues/NBA_2017.html#team-stats-base::none')
    browser.get(url)

    # let's parse our html
    soup = BeautifulSoup(browser.page_source, "html.parser")
    browser.quit()

    ps_df = pd.read_html(soup.__str__())[0]

    # use first row as header
    ps_df.columns = ps_df.iloc[0]
    ps_df = ps_df.reindex(ps_df.index.drop(0))

    # remove special characters in player name
    ps_df['Player'] = ps_df['Player'].str.replace("[',.-]", '')

    # normalize names
    ps_df['Player'] = ps_df['Player'].str.replace('Jose Juan Barea',
                                                  'JJ Barea')
    ps_df['Player'] = ps_df['Player'].str.replace('Glenn Robinson',
                                                  'Glenn Robinson III')
    ps_df['Player'] = ps_df['Player'].str.replace('Kelly Oubre',
                                                  'Kelly Oubre Jr')
    ps_df['Player'] = ps_df['Player'].str.replace('Nene', 'Nene Hilario')
    ps_df['Player'] = ps_df['Player'].str.replace('Juan Hernangomez',
                                                  'Juancho Hernangomez')
    ps_df['Player'] = ps_df['Player'].str.replace('Willy Hernangomez',
                                                  'Guillermo Hernangomez')
    ps_df['Player'] = ps_df['Player'].str.replace('Luc Mbah a Moute',
                                                  'Luc Richard Mbah a Moute')

    return ps_df
Example #15
def load_player_stat(url):
    # PhantomJS files have different extensions
    # under different operating systems
    if platform.system() == 'Windows':
        PHANTOMJS_PATH = './phantomjs.exe'
    else:
        PHANTOMJS_PATH = './phantomjs'

    # here we'll use pseudo browser PhantomJS,
    # but browser can be replaced with browser = webdriver.FireFox(),
    # which is good for debugging.
    browser = webdriver.PhantomJS(PHANTOMJS_PATH)
    # browser.get('http://www.basketball-reference.com/leagues/NBA_2017.html#team-stats-base::none')
    browser.get(url)

    # let's parse our html
    soup = BeautifulSoup(browser.page_source, "html.parser")
    browser.quit()

    ps_df = pd.read_html(soup.__str__())[0]

    # use first row as header
    ps_df.columns = ps_df.iloc[0]
    ps_df = ps_df.reindex(ps_df.index.drop(0))

    # remove special characters in player name
    ps_df['Player'] = ps_df['Player'].str.replace("[',.-]", '')

    # normalize names
    ps_df['Player'] = ps_df['Player'].str.replace('Jose Juan Barea', 'JJ Barea')
    ps_df['Player'] = ps_df['Player'].str.replace('Glenn Robinson', 'Glenn Robinson III')
    ps_df['Player'] = ps_df['Player'].str.replace('Kelly Oubre', 'Kelly Oubre Jr')
    ps_df['Player'] = ps_df['Player'].str.replace('Nene', 'Nene Hilario')
    ps_df['Player'] = ps_df['Player'].str.replace('Juan Hernangomez', 'Juancho Hernangomez')
    ps_df['Player'] = ps_df['Player'].str.replace('Willy Hernangomez', 'Guillermo Hernangomez')
    ps_df['Player'] = ps_df['Player'].str.replace('Luc Mbah a Moute', 'Luc Richard Mbah a Moute')

    return ps_df
Example #16
    def cleanup(self, data):
        data = data.replace('</uniqueindex1>', '</uniqueindex>')
        data = data.replace('</fulltextindex2>', '</fulltextindex>')
        data = data.replace('</index3>', '</index>')
        data = data.replace('<uniqueindex1 ', '<uniqueindex ')
        data = data.replace('<fulltextindex2 ', '<fulltextindex ')
        data = data.replace('<index3 ', '<index ')
        data = data.replace(strComma, ",")
        data = data.replace("\n>", ">")
        data = data.replace(strReferenced_tbl_End, "")

        soup = BeautifulSoup(data)
        soup.prettify()
        data = soup.__str__()

        data = data.replace("<property", "\t<property")
        data = data.replace("<primarykey", "\t<primarykey")
        data = data.replace("</primarykey", "\t</primarykey")
        data = data.replace("<index", "\t<index")
        data = data.replace("</index", "\t</index")
        data = data.replace("<column", "\t\t<column")
        data = data.replace("<constraint", "\t<constraint")
        data = data.replace("</constraint", "\t</constraint")
        data = data.replace("<reference", "\t\t<reference")
        data = data.replace("></entity>", ">\n</entity>")
        data = data.replace("<uniqueindex", "\t<uniqueindex")
        data = data.replace("</uniqueindex", "\t</uniqueindex")
        data = data.replace(",=\"\"", "")
        data = data.replace(",=\"\" ", "")
        data = data.replace("'=\"\" ", "")


        newdata = data.replace("\n\n", "\n")
        while data != newdata:
            data = newdata
            newdata = data.replace("\n\n", "\n")

        return data
Example #17
def handler_audio(act1):
    #query = input('What are you searching for:   ')
    url = 'http://www.google.com/search?q='
    page = requests.get(url + str(act1.name))
    soup = BeautifulSoup(page.text, 'html.parser')
    h3 = soup.find_all("h3", class_="r")
    for elem in h3:
        pb['value'] = 100
        time.sleep(1.5)
        elem = elem.contents[0]
        link = ("https://www.google.com" + elem["href"])
        if link.find('music.yandex.ru') != -1:
            print('Его нельзя: ' + link)
        elif link.find('youtube') != -1:
            print('Его нельзя: ' + link)
        elif link.find('text-lyrics.ru') != -1 or link.find(
                'genius.com') != -1:
            f = open('pir.txt', 'a')
            f.write(link + '\n')
            f.close()
            print('Яма: ' + link)
        else:
            print(link)
            response = requests.get(link)
            soup = BeautifulSoup(response.text,
                                 'html.parser').find('div', class_='download')
            print(soup)
            if soup != None:
                soup = soup.__str__()
                for i in BeautifulSoup(soup,
                                       'html.parser').find_all('a', href=True):
                    wget.download(i['href'], act1.name + '_test.mp3')
                    audio = MP3(act1.name)
                    print("Track: " + audio.get("TIT2").text[0])

                    print('Lenght: ' + str(audio.info.length))
                    print('Info: ' + audio.info.pprint())

                    audio2 = MP3(act1.name + "_test.mp3")
                    print('Info: ' + audio2.info.pprint())
                    if audio2.get("TIT2") == audio.get(
                            "TIT2"
                    ) and audio2.info.length == audio.info.length and audio2.info.pprint(
                    ) == audio.info.pprint():
                        print("Это подлинный")
                        label['text'] = "Это подлинный"
                    else:
                        print('Пиратская копия')
                        label['text'] = 'Пиратская копия'
                        f = open('pir.txt', 'a')
                        f.write(link + '\n')
                        f.close()
                    print(i['href'])
    window = Tk()
    window.title("СПИСОК САЙТОВ С ПИРАТСКИМ КОНТЕНТОМ")
    window.geometry("600x150")
    window.resizable(False, False)
    f = open('pir.txt', 'r')
    for line in f.readlines():
        print(line)
    f.close()
    window.mainloop()
Example #18
# print(5//2)
import requests
from bs4 import BeautifulSoup

res = requests.get(r'https://sspai.com/post/23631')
soup = BeautifulSoup(res.text, 'lxml')
print(soup.__str__())
Example #19
with open('/tmp/prince.input.html', 'w') as f:
    f.write(des.format(title=title, content=content))

options = Options()
profile = webdriver.FirefoxProfile()
profile.set_preference('permissions.default.image', 2)
profile.set_preference('dom.ipc.plugins.enabled.libflashplayer.so', 'false')

ff = webdriver.Firefox(options=options, firefox_profile=profile)
ff.get('file:///tmp/prince.input.html')

while True:
    time.sleep(0.1)
    if ff.execute_script('return document.readyState') == 'complete':
        break

soup = BeautifulSoup(ff.page_source, 'lxml')
ff.quit()

scripts = soup.find_all('script')
for elem in scripts:
    elem.extract()

with open('output.html', 'w') as f:
    f.write(soup.__str__())

subprocess.call(['prince', 'output.html'])
os.remove('output.html')
os.remove('geckodriver.log')
Example #20
def yj_html_replace(html: str, digest: str) -> str:
    """
    必要なフォーマットに変換して、htmlを返却
    """
    soup = BeautifulSoup(html, 'html5lib')
    try:
        for a in soup.find('head').find_all('script'):
            a.decompose()
        for a in soup.find('body').find_all('script'):
            a.decompose()
        for a in soup.find_all('iframe'):
            a.decompose()
        """ 2020/06/09追加 """
        if soup.find(attrs={"id":"msthd"}):
            soup.find(attrs={"id":"msthd"}).decompose()
        if soup.find(attrs={"id":"yjnHeader_nav"}):
            soup.find(attrs={"id":"yjnHeader_nav"}).decompose()
        if soup.find(attrs={"id":"uamods-also_read"}):
            soup.find(attrs={"id":"uamods-also_read"}).decompose()
        if soup.find(attrs={"id":"newsFeed"}):
            soup.find(attrs={"id":"newsFeed"}).decompose()
        if soup.find(attrs={"id":"yjSLink"}):
            soup.find(attrs={"id":"yjSLink"}).decompose()
        """ sectionの中に’関連記事’の文字列が含まれていたら削除 """
        for section in soup.find_all("section"):
            if "【関連記事】" in section.__str__():
                section.decompose()
        """ comment disable button """
        if soup.find(attrs={"class":"checkbox"}):
            soup.find(attrs={"class":"checkbox"}).decompose()
        """ 2020/06 古い削除ルールセット """   
        for key, value in [("class", "listPaneltype"), 
                            ("class", "mainYdn"), 
                            ("id", "timeline"), 
                            ("id", "yjSLink"), 
                            ("class", "ynDetailRelArticle"), 
                            ("class", "commentBox"),
                            ("id", "contentsFooter"),
                            ("id", "footer"),
                            ("id", "stream_title"), 
                            ("id", "contentsHeader"), 
                            ("id", "yjnFooter")]:
            if soup.find(attrs={key:value}):
                soup.find(attrs={key: value}).decompose()

        """ 画像の説明のhrefを消す """
        if soup.find(attrs={"class": "photoOffer"}):
            del soup.find(attrs={"class": "photoOffer"}).find("a")["href"]
        """ contents中のリンクを削除 """
        for key, value in [("id", "uamods"), ("id", "paragraph")]:
            paragraph = soup.find(attrs={key: value})
            if paragraph is None:
                continue
            for a in paragraph.find_all("a"):
                # if a.get("href"):
                #    del a["href"]
                """ a -> spanに変更 """
                a.name = "span"
        """ テキストリンクの装飾を消す """
        for a in soup.find_all(attrs={"class": "yjDirectSLinkHl"}):
            del a["class"]
        """ fontをWeb Fontの明朝に変更"""
        soup.find("head").insert(-1, BeautifulSoup('<link href="https://fonts.googleapis.com/css?family=Noto+Serif+JP:400,700&display=swap&subset=japanese" rel="stylesheet">', 'lxml'))
        soup.find("body")["style"] = "font-family: 'Noto Serif JP' !important;"

        """ javascriptによるクリック発火を抑制 """
        for a in soup.find_all("a", {"onmousedown": True}):
            del a["onmousedown"]

        """ stylesheetの一部を削除 """
        # soup.find(attrs={"href": "https://s.yimg.jp/images/jpnews/cre/article/pc/css/article_pc_v7.0.css"}).decompose()

        """ 次のページをパースして統合 """
        next_page_li = soup.find("li", attrs={"class": "next"})
        if next_page_li is None:
            next_page_li = soup.find("li", attrs={"class": "pagination_item pagination_item-next"})
        if next_page_li and next_page_li.find("span"):
            next_paragraphs: List[BeautifulSoup] = []
            get_nexts(next_page_li.find("span").get("href"), next_paragraphs)
            print("total page size", len(next_paragraphs))
            for idx, next_paragraph in enumerate(next_paragraphs):
                if soup.find(attrs={"class": "articleMain"}):
                    soup.find(attrs={"class": "articleMain"}).insert(-1, next_paragraph)
                    soup.find(attrs={"class": "articleMain"}).insert(-1, BeautifulSoup(f"""<p align="center"> Page {idx+2} </p>""", "lxml"))

                elif soup.find(attrs={"id": "uamods"}):
                    soup.find(attrs={"id": "uamods"}).insert(-1, next_paragraph)
                    soup.find(attrs={"id": "uamods"}).insert(-1, BeautifulSoup(f"""<p align="center"> Page {idx+2} </p>""", "lxml"))
                # print(next_paragraph)

            """ pageを示すフッターを消す """
            # soup.find(attrs={"class": "marT10"}).decompose()
            # soup.find(attrs={"class": "fdFt"}).decompose()
            """ page送りを消す """
            if soup.find(attrs={"class": "pagination_items"}):
                for pagination_item in soup.find_all(attrs={"class": "pagination_items"}):
                    pagination_item.decompose()
            """ footerを最後以外のものを消す """
            footers = soup.find_all("footer")
            if footers.__len__() >= 2:
                for footer in footers[:-1]:
                    footer.decompose()
            """ 次ページは:の文字を消す """
            for a in soup.find_all("a", attrs={"class": re.compile("sc-.*?")}):
                if "次ページは:" in a.__str__():
                    a.decompose()
            """ remove headers without head """
            for header in soup.find_all("header")[2:]:
                header.decompose()
        """ もとURLを挿入 """
        original_url = soup.find("meta", attrs={"property": "og:url"}).get("content")
        if soup.find(attrs={"class": "contentsWrap"}):
            soup.find(attrs={"class": "contentsWrap"}).insert(-1, BeautifulSoup(f"""<a href="{original_url}"><p align="center">オリジナルURL</p></a>""", "lxml"))
        # paragraph.find("a", {"class":None, "href":True}).decompose()
    except Exception as exc:
        tb_lineno = sys.exc_info()[2].tb_lineno
        print(f'[{FILE}] decompose error, exc = {exc}, tb_lineno = {tb_lineno}', file=sys.stderr)

    print(f'[{FILE}] accessing to {TOP_DIR}/var/YJ/comments/{digest}', file=sys.stdout)
    comment_html = ''
    comment_html_below = ''
    fns = sorted(glob.glob(f'{TOP_DIR}/var/YJ/comments/{digest}/*.pkl'))
    if len(fns) == 0:
        comment_html = '誰もコメントしていません'
    else:
        # last is the latest comment
        fn = fns[-1]
        with open(fn, 'rb') as fp:
            try:
                comments: YJComment = pickle.load(fp)
            except EOFError as exc:
                tb_lineno = sys.exc_info()[2].tb_lineno
                print(f"[{FILE}] exc = {exc}, tb_lineno = {tb_lineno}", file=sys.stderr)
                Path(fn).unlink()
                comments = []
        for comment in list(reversed(sorted(comments, key=lambda x: x.ts)))[:20]:
            tmp = f'''<div class="comment">
                        <div class="username">😃{comment.username}</div>
                        <div class="text">{comment.comment}</div>
                        <div class="ts-view" style="font-size:xx-small;text-align:right;">{comment.ts}</div>
                        <div class="good-bad">👍x{comment.good} 👎x{comment.bad}</div>
                    </div><br>'''
            comment_html += tmp

        for comment in list(reversed(sorted(comments, key=lambda x: x.ts)))[20:]:
            tmp = f'''<div class="comment">
                        <div class="username">😃{comment.username}</div>
                        <div class="text">{comment.comment}</div>
                        <div class="ts-view" style="font-size:xx-small;text-align:right;">{comment.ts}</div>
                        <div class="good-bad">👍x{comment.good} 👎x{comment.bad}</div>
                    </div><br>'''
            comment_html_below += tmp
    """
    1. ログインしたユーザのコメント等も乗せる
    2. コメント欄も表示
    3. {TOP_DIR}/var/YJ/YJComment/{digest} に時系列の名前で、json形式で入っている
    """
    this_site_comments = ""
    for fn in reversed(sorted(glob.glob(f"{TOP_DIR}/var/YJ/YJComment/{digest}/*"))):
        obj = json.load(open(fn))
        tmp = f'''<div class="comment">
                    <div class="username">😃{obj["screen_name"]}</div>
                    <div class="text">{obj["YJComment"]}</div>
                    <div class="ts-view" style="font-size:xx-small;text-align:right;">{obj["datetime"]}</div>
                    <div class="good-bad">👍x{0} 👎x{0}</div>
                </div><br>'''
        this_site_comments += tmp

    try:
        # print(soup)
        if soup.find("div", {"id":"sub"}) is not None:
            target_id = "sub"
        else:
            target_id = "yjnSub"
        with open(f"{HOME}/tmp", "w") as fp:
            fp.write(soup.__str__())
        soup.find('div', {'id': target_id}).string = ''
        soup.find('div', {'id': target_id}).insert(1, BeautifulSoup(comment_html, 'html5lib'))
        
        if soup.find(attrs={"id": "contentsWrap"}):
            target_id = "contentsWrap"
        else:
            target_id = "main"
        soup.find('div', {'id': target_id}).append(BeautifulSoup(get_form_html(digest), 'html5lib'))
        soup.find('div', {'id': target_id}).append(BeautifulSoup(this_site_comments, 'html5lib'))
        soup.find('div', {'id': target_id}).append(BeautifulSoup(comment_html_below, 'html5lib'))
    except Exception as exc:
        tb_lineno = sys.exc_info()[2].tb_lineno
        print(f'[{FILE}] exc = {exc}, tb_lineno = {tb_lineno}', file=sys.stderr)
        return f"[{FILE}] Cannnot handle this page, exc = {exc}, tb_lineno = {tb_lineno}"
    return str(soup)
Example #21
    def get_links_from_manga_page(self):
        if not os.path.exists("Links"):
            os.mkdir("Links")

        if self.__manga_page_link is None:
            raise ValueError(
                "panggil set_manga_page_link(manga_page_link) dulu")

        r = requests.get(self.__manga_page_link,
                         headers=header,
                         timeout=10,
                         stream=True)
        content = r.text
        banner_link_image = ''
        links = {}
        if "komikcast" in self.__manga_page_link:
            span_elements = BS(content,
                               'html.parser').findAll('span',
                                                      {'class': 'leftoff'})
            a_elements = BS(span_elements.__str__(),
                            'html.parser').find_all('a')
            for a in a_elements:
                if 'end' in a.text.lower():
                    links[a.text + ' END'] = a.attrs['href']
                else:
                    links[a.text] = a.attrs['href']
            banner_link_image = BS(content, 'html.parser').find(
                'img', {
                    'class': 'attachment-post-thumbnail'
                }).attrs['src']
        elif "komikgue" in self.__manga_page_link:
            a_elements = BS(content, 'html.parser').findAll(
                'a', {'style': 'text-decoration:none;'})
            span_elements = BS(a_elements.__str__(),
                               'html.parser').find_all('span')
            for a, span in zip(a_elements, span_elements):
                if 'end' in a.text.lower():
                    links["Chapter {} END".format(span.text)] = a.attrs['href']
                else:
                    links["Chapter {}".format(span.text)] = a.attrs['href']
            banner_link_image = \
                BS(content, 'html.parser').find('img', {'class': 'img-responsive', 'itemprop': 'image'}).attrs['src']
        elif "komikone" in self.__manga_page_link:
            span_elements = BS(content,
                               'html.parser').findAll('span',
                                                      {'class': 'lchx'})
            a_elements = BS(span_elements.__str__(),
                            'html.parser').find_all('a')
            for a in a_elements:
                if 'end' in a.text.lower():
                    links[a.text + ' END'] = a.attrs['href']
                else:
                    links[a.text] = a.attrs['href']
            banner_link_image = BS(content, 'html.parser').find(
                'img', {
                    'class': 'attachment-post-thumbnail'
                }).attrs['src']
        elif "mangazuki" or "yomanga" in self.__manga_page_link:
            li_elements = BS(content, 'html.parser').find_all(
                'li', {'class': 'wp-manga-chapter'})
            a_elements = BS(li_elements.__str__(), 'html.parser').find_all('a')
            a_elements = [a for a in a_elements if a.text]
            for a in a_elements:
                chapter_number = re.search(regex_number, a.text).group(0)
                lin = a.attrs['href']
                if "style=list" not in lin:
                    lin += "?style=list"

                if 'end' in a.text.lower():
                    links["Chapter {} END".format(chapter_number)] = lin
                else:
                    links["Chapter {}".format(chapter_number)] = lin
            try:
                banner_link_image = BS(content, 'html.parser').find(
                    'div', {
                        'class': 'summary_image'
                    }).__str__()
                banner_link_image = BS(banner_link_image,
                                       'html.parser').find('img')
                if banner_link_image is not None:
                    banner_link_image = banner_link_image.attrs['data-src']
                else:
                    banner_link_image = BS(content, 'html.parser').find(
                        'div', {'class': 'c-blog__thumbnail'})
                    banner_link_image = banner_link_image.find('img')
                    banner_link_image = banner_link_image.attrs['data-src']
            except Exception:
                pass
        else:
            raise RuntimeError("Sumber manga tidak didukung")

        if banner_link_image:
            if not os.path.exists(self.__main_dir + self.__manga_name):
                os.mkdir(self.__main_dir + self.__manga_name)
            file_banner = self.__main_dir + self.__manga_name + "/1." + self.__manga_name + ".jpg"
            if not os.path.isfile(file_banner):
                result, url_image_banner, content = urlretrieve(
                    banner_link_image)
                if result:
                    with open(file_banner, 'wb') as f:
                        f.write(content)
            # im = PIL.Image.open(file_banner+".jpg")
            # icon_sizes = [(16, 16), (24, 24), (32, 32), (48, 48), (64, 64), (128, 128), (255, 255)]
            # im.save(file_banner+".ico", sizes=icon_sizes)

        links = sorted(links.items(), key=cmp_to_key(sort))
        self.__link = links
        with open(self.manga_dir + "chapters.json", 'w') as f:
            json.dump(links, f, indent=4)
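
The urlretrieve() used for the banner image is not urllib's (it is unpacked as a (result, url, content) triple); a hypothetical requests-based equivalent matching that interface:

import requests

def urlretrieve(url):
    # Hypothetical helper matching the (result, url_image_banner, content) unpacking above.
    try:
        r = requests.get(url, timeout=10)
        return r.ok, r.url, r.content
    except requests.RequestException:
        return False, url, b''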
Example #22
        log_error('Error during requests to {0} : {1}'.format(url, str(e)))
        return None


def is_good_response(resp):
    """
    Returns True if the response seems to be HTML, False otherwise.
    """
    content_type = resp.headers['Content-Type'].lower()
    return (resp.status_code == 200 and content_type is not None
            and content_type.find('html') > -1)


def log_error(e):
    """
    It is always a good idea to log errors. 
    This function just prints them, but you can
    make it do anything.
    """
    print(e)


raw_html = simple_get('https://www.davidjaybuckley.com')
# print(raw_html)
html = BeautifulSoup(raw_html, 'html.parser')
f = open("output.html", "w")
f.write(html.__str__())
# print(html)
for i, row in enumerate(html.select('.students div.row')):
    print(i, row.text)
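
simple_get() is called near the bottom of this example, but only its exception branch (the log_error + return None fragment at the top) made it into the excerpt; a plausible reconstruction in the same style, reusing the is_good_response() and log_error() helpers shown above:

from contextlib import closing
import requests

def simple_get(url):
    # Fetch url and return the raw HTML body, or None on any failure.
    try:
        with closing(requests.get(url, stream=True)) as resp:
            if is_good_response(resp):
                return resp.content
            return None
    except requests.exceptions.RequestException as e:
        log_error('Error during requests to {0} : {1}'.format(url, str(e)))
        return None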
Example #23
import urllib.parse
import urllib.request
from bs4 import BeautifulSoup

scrapeUrl = "https://weixin.sogou.com/weixin?p=01030402&query=%E6%A5%BC%E5%B8%82%E5%8F%82%E8%80%83&type=2&ie=utf8"
req = urllib.request.Request(scrapeUrl)
req.add_header('User-Agent', 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)')
response = urllib.request.urlopen(req)
html = response.read()

bsObj = BeautifulSoup(html, "html.parser")
print(bsObj.__str__())
Example #24
 
 # set the oldest page number #1
 page_S = div[0].select('a')[0].get('href').strip()[div[0].select('a')[0].get('href').find('index') +5: div[0].select('a')[0].get('href').find('.')]
 # set the second-newest page number
 page_E = div[0].select('a')[1].get('href').strip()[div[0].select('a')[1].get('href').find('index') +5: div[0].select('a')[1].get('href').find('.')]
 print (page_S)
 category_end = 0
 
 for tnum in range(int(page_E)+1,int(page_S)-1,-1):
     print ("頁數: "+str(tnum))
     res = rs.get('https://www.ptt.cc/bbs/'+website+'/index'+str(tnum)+'.html')
     outer_soup = BeautifulSoup(res.text, 'html.parser')
     div = outer_soup.findAll('div', {'class': 'r-ent'})
     
     # exclude the hot posts pinned below the list separator from the count
     page_span = re.sub('<div class=.r-list-sep.>(.*\n.*)*', '', outer_soup.__str__())
     gerneral_tile_num = page_span.count('r-ent')
     
     for item in div:
         print (gerneral_tile_num)
         # only the pinned hot posts remain; they are not scraped from the index page
         if gerneral_tile_num == 0:
             break
         gerneral_tile_num = gerneral_tile_num - 1
         
         # skip posts that have been deleted
         if u'刪除)' in item.findAll('div', {'class': 'title'})[0].text or '-' == item('div', {'class': 'author'})[0].text:
             #u'[公告]' in item.findAll('div', {'class': 'title'})[0].text or\
             #u'[公告]' in item.findAll('div', {'class': 'title'})[0].text or\
             continue
         
Example #25
def tweet_hyoron_(day_name: str, digest: str) -> str:
    """
    Args:
        - day_name: digestは<day_name>のフォルダごとに分類されている(冗長かもしれない)
        - digest: Twitterの評論対象のdigest
    Returns:
        - html: HTML
    POSTs:
        - TweetComment: str
    """
    if request.method == 'POST':
        obj = request.form
        if obj.get("TweetComment"):
            TweetComment = obj["TweetComment"]
            out_dir = f"{TOP_DIR}/var/Twitter/TweetComment/{digest}"
            Path(out_dir).mkdir(exist_ok=True, parents=True)
            now = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            with open(f"{out_dir}/{now}", "w") as fp:
                if twitter.authorized:
                    json.dump({"TweetComment": TweetComment, "datetime": now, "screen_name": twitter.token["screen_name"]}, fp, ensure_ascii=False)
                else:
                    json.dump({"TweetComment": TweetComment, "datetime": now, "screen_name": "名無しちゃん"}, fp, ensure_ascii=False)
    head = '<html><head><title>Twitter評論</title></head><body>'
    body = ''
    with open(f'{TOP_DIR}/var/Twitter/tweet/{day_name}/{digest}') as fp:
        html = fp.read()
    soup = BeautifulSoup(html, features='lxml')
    div = soup.find('body').find('div')
    if div.find(attrs={'class': 'EmbeddedTweet'}):
        div.find(attrs={'class': 'EmbeddedTweet'})["style"] = "margin: 0 auto; margin-top: 30px;"
    imagegrids = soup.find_all('a', {'class': 'ImageGrid-image'})
    for imagegrid in imagegrids:
        src = imagegrid.find('img').get('src')
        imagegrid['href'] = src
    mediaassets = soup.find_all('a', {'class': 'MediaCard-mediaAsset'})
    for mediaasset in mediaassets:
        if mediaasset.find('img') and mediaasset.find('img').get('alt') != 'Embedded video':
            mediaasset['href'] = mediaasset.find('img').get('src')

    tweetCommentSubmitContainer = BeautifulSoup(soup.find(attrs={"class": "SandboxRoot"}).__str__(), "lxml")
    tweetCommentSubmitContainer.find(attrs={"class": "Tweet-body"}).clear()
    tweetCommentSubmitContainer.find(attrs={"class": "CallToAction"}).clear()
    comment_html = f"""
    <form action="/TweetHyoron/{day_name}/{digest}" class="form" method="post" style="position: relative;"><textarea value="コメント" name="TweetComment" rows="5" id="TweetComment" style="width: 90%; margin: 0 auto; margin-left:5%; margin-top: 5px;" ></textarea><br/>
    <input type="submit" name="TweetSubmit" value="Submit" style="-webkit-appearance: none;-webkit-border-radius: 4px;-moz-border-radius: 4px;-ms-border-radius: 4px;-o-border-radius: 4px;border-radius: 4px;-webkit-background-clip: padding;-moz-background-clip: padding;margin: 0;padding: 3px 10px;text-shadow: white 0 1px 1px;text-decoration: none;vertical-align: top;width: auto; margin-right: 10%; margin-left:80%;">
    </form>
    """
    tweetCommentSubmitContainer.find(attrs={"class": "EmbeddedTweet-tweetContainer"}).insert(-1, BeautifulSoup(comment_html, "lxml"))
    tweetCommentSubmitContainer.find(attrs={"class": "TweetAuthor-name"}).string = f"コメントする"
    tweetCommentSubmitContainer.find(attrs={"class": "TweetAuthor-screenName"}).string = f"@concertion"
    for a in tweetCommentSubmitContainer.find_all("a", {"class": "Tweet-header", "href": True}):
        del a["href"]
        a.name = "p"
    tweetCommentSubmitContainer.find(attrs={"class": "Avatar"})["src"] = "https://pm1.narvii.com/6923/51394fd5f6e385f59bb51efa0f409e253e718a69r1-2048-1536v2_00.jpg"


    buzz_css = soup.find('body').find('style').__str__() if soup.find('body').find('style') else ""
    """
    Tweetのコメントをパース
    TODO: 要デザイン
    TODO: 要外だし
    """
    comments = []
    for fn in reversed(sorted(glob.glob(f'{TOP_DIR}/var/Twitter/TweetComment/{digest}/*'))):
        try:
            obj = json.load(open(fn))
            comment = f'''<div class="TweetComment">
                <p>{obj["screen_name"]}</p><br/>
                <p>{obj["datetime"]}</p><br/>
                <p>{obj["TweetComment"]}</p><br/>
            </div>'''
            comments.append(comment)
        except Exception as exc:
            print(f"[{FILE}] exc = {exc}", file=sys.stderr)
            Path(fn).unlink()
    other_comments_html = "".join(comments)
    body += div.__str__() + buzz_css + tweetCommentSubmitContainer.__str__() + buzz_css + other_comments_html
    tail = '</body></html>'
    html = head + body + tail
    return html
Example #26
    elem = elem.contents[0]
    link = ("https://www.google.com" + elem["href"])
    if link.find('music.yandex.ru') != -1:
        print('Его нельзя: ' + link)
    elif link.find('youtube') != -1:
        print('Его нельзя: ' + link)
    elif link.find('text-lyrics.ru') != -1:
        print('Яма: ' + link)
    else:
        print(link)
        response = requests.get(link)
        soup = BeautifulSoup(response.text,
                             'html.parser').find('div', class_='download')
        print(soup)
        if soup != None:
            soup = soup.__str__()
            for i in BeautifulSoup(soup, 'html.parser').find_all('a',
                                                                 href=True):
                wget.download(i['href'], 'Oxxymiron_where_test.mp3')
                audio = MP3("Oxxymiron_where_test.mp3")
                print("Track: " + audio.get("TIT2").text[0])

                #try:print("Text: " + audio.get("USLT"))
                #except AttributeError: print('Нет текста')
                print('Lenght: ' + str(audio.info.length))
                print('Info: ' + audio.info.pprint())

                audio2 = MP3("Oxxymiron_where.mp3")
                if audio2.get("TIT2") == audio.get(
                        "TIT2"
                ) and audio2.info.length == audio.info.length and audio2.info.pprint(
Example #27
def reflect_html(key: int, day: str, digest: str) -> Union[None, bool]:
    from selenium import webdriver
    from selenium.webdriver.common.keys import Keys
    from selenium.webdriver.chrome.options import Options
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.ui import WebDriverWait
    """
    1. すでに処理したファイルが存在していたらスキップ
    """
    out_filename = f"{TOP_DIR}/var/Twitter/tweet/{day}/{digest}"
    if Path(out_filename).exists():
        return True
    options = Options()
    options.add_argument("--headless")
    options.add_argument("--disable-gpu")
    options.add_argument("window-size=1024x1024")
    options.add_argument(
        f"user-data-dir=/tmp/{FILE.replace('.py', '')}_{key:06d}")
    options.binary_location = shutil.which("google-chrome")
    try:
        driver = webdriver.Chrome(executable_path=shutil.which("chromedriver"),
                                  options=options)
        driver.get(f"http://localhost/twitter/input/{day}/{digest}")
        print('debug', f"http://localhost/twitter/input/{day}/{digest}")
        html = driver.page_source
        time.sleep(5)
        html = driver.page_source
        driver.save_screenshot(f"/home/gimpei/{digest}.png")
        driver.switch_to.frame(driver.find_element_by_tag_name("iframe"))
        # elm = driver.find_element_by_xpath("/html")
        time.sleep(1)
        inner_html = driver.page_source
        # print("inner", inner_html)

        # inner_html = driver.page_source
        # print(html)
        """get shadow-root"""
        # elm = driver.execute_script("""return document.querySelector("twitter-widget").shadowRoot""")
        # elm = driver.execute_script("""return document.querySelector("twitter-widget").shadowRoot""")
        # inner_html = elm.get_attribute("innerHTML")
        cleaner = Cleaner(style=True,
                          links=True,
                          add_nofollow=True,
                          page_structure=False,
                          safe_attrs_only=False)
        # print(inner_html)
        soup = BeautifulSoup(inner_html, "lxml")
        imported_csses = [
            el for el in soup.find_all("style", {"type": "text/css"})
        ]

        # replace css text to local css
        for css in imported_csses:
            if "@import url" in css.text:
                css_url = re.search(r'url\("(.*?)"\)', css.text).group(1)
                css_digest = GetDigest.get_digest(css_url)
                # print(css_url, css_digest)
                with requests.get(css_url) as r:
                    css_text = r.text
                Path(f"{TOP_DIR}/var/Twitter/css").mkdir(exist_ok=True,
                                                         parents=True)
                with open(f"{TOP_DIR}/var/Twitter/css/{css_digest}",
                          "w") as fp:
                    fp.write(css_text)
                css.string = f'@import url("/twitter/css/{css_digest}")'

        # replace image src
        for img in soup.find_all(attrs={"src": True}):
            url = img.get("src")
            o = urlparse(url)
            if o.scheme == "":
                o = o._replace(scheme="https")
            url = o.geturl()

            url_digest = GetDigest.get_digest(url)
            if "format=jpg" in url or re.search(".jpg$", url) or re.search(
                    ".jpeg$", url) or re.search(".JPG$", url):
                with requests.get(url, timeout=30) as r:
                    binary = r.content
                Path(f"{TOP_DIR}/mnt/twitter_jpgs").mkdir(exist_ok=True,
                                                          parents=True)
                with open(f"{TOP_DIR}/mnt/twitter_jpgs/{url_digest}",
                          "wb") as fp:
                    fp.write(binary)
                # print(f"downloaded! {TOP_DIR}/mnt/twitter_jpgs/{url_digest}")
                img["src"] = f"/twitter/jpgs/{url_digest}"
            elif "format=png" in url or re.search(".png$", url):
                with requests.get(url, timeout=30) as r:
                    binary = r.content
                Path(f"{TOP_DIR}/var/Twitter/pngs").mkdir(exist_ok=True,
                                                          parents=True)
                with open(f"{TOP_DIR}/var/Twitter/pngs/{url_digest}",
                          "wb") as fp:
                    fp.write(binary)
                img["src"] = f"/twitter/pngs/{url_digest}"
            elif "normal" in url or ".js" in url or ".svg" in url:
                continue
            else:
                continue
                # raise Exception(f"unsupported image! url={url}")
        """adhoc style edit"""
        if soup.find(attrs={"class": "EmbeddedTweet"}):
            soup.find(attrs={"class": "EmbeddedTweet"
                             })["style"] = "margin: 0 auto; margin-top: 150px;"

        out_dir = f"{TOP_DIR}/var/Twitter/tweet/{day}"
        Path(out_dir).mkdir(exist_ok=True, parents=True)
        with open(f"{out_dir}/{digest}", "w") as fp:
            fp.write(soup.__str__())
        driver.close()
        # if E.get("DEBUG"):
        print(
            f"[{NAME}] ordinally done, day = {day} digest = {digest}, filename = {out_dir}/{digest}"
        )
    except Exception as exc:
        tb_lineno = sys.exc_info()[2].tb_lineno
        print(
            f"[{NAME}] exc = {exc}, tb_lineno = {tb_lineno}, day = {day}, digest = {digest}, filename = {out_filename}",
            file=sys.stderr)
        out_filename = f"{TOP_DIR}/var/Twitter/tweet/{day}/{digest}"
        Path(f"{TOP_DIR}/var/Twitter/tweet/{day}").mkdir(exist_ok=True,
                                                         parents=True)
        # enable this to ignore items that failed to parse
        # Path(out_filename).touch()
        time.sleep(5)
        return None
    return f"/twitter/tweet/{day}/{digest}"
Example #28
from bs4 import BeautifulSoup
from urllib.request import urlopen as ureq
import sys

# Wikipedia
url = "https://en.wikipedia.org/wiki/Main_Page"

#sending request to wikipedia

client = ureq(url)
page = client.read()
client.close()  #connection closed

#parsing the page
page_soup = BeautifulSoup(page, 'html.parser')
_page_str_ = page_soup.__str__()
print(len(_page_str_))
count = 0
final_str = ''
for c in _page_str_:
    try:
        print(c)
        final_str = final_str + c
    except UnicodeEncodeError:
        print(" Cannot Print this  ")
    count = count + 1
print(final_str)
Example #29
    pex12.insert_after(pex13)
    pex13.insert_after(pex14)
    pex14.insert_after(pex15)
    pex15.insert_after(pex16)
    pex16.insert_after(pex17)
    pex17.insert_after(pex18)
    pex18.insert_after(pex19)
    pex19.insert_after(pex20)
    pex20.insert_after(pex21)
    pex21.insert_after(pex22)
    pex22.insert_after(pex23)
    pex4.insert_after(pex24)

    rm_me = soup.find("div", {"class": "loading-indicator"})
    rm_me.extract()

    # output_html = soup.prettify()
    output_html = soup.__str__()

# Create unique ID
full_identifier_string = "{}_{}".format(
    os.path.basename(orig_html_file).split('.')[0],
    datetime.now().strftime("%d/%m/%Y-%H:%M:%S"))
unique_id = hashlib.md5(full_identifier_string.encode('utf-8')).hexdigest()

# Write files
with open(os.path.join(orig_html_dir, 'index.html'), 'w') as ofile:
    ofile.write(output_html)
with open(os.path.join(orig_html_dir, 'unique_id.txt'), 'w') as ofile:
    ofile.write(unique_id)
Example #30
path = 'C:\\Users\\User\\Desktop\\webservv0'
# Get the GET parameter nick passed from the HTML form
form = cgi.FieldStorage()
password = form.getfirst("password", "123123")
#password = '******'
if password == '123123':

    a = list()
    n = 0
    login = '******'
    user_secret = ''
    # org_id = ''
    auth_html = s.get('' + login + '&pass='******'html.parser')
    #token = ''
    token = token.__str__().replace('"', '')
    print('token = ', token)
    f = open(path + '\\cgi-bin\\mats\\tokenlogs.txt', 'a', encoding='UTF-8')
    f.write(str(token) + '\n')
    f.close()
    auth_html = s.get('' + token)
    soupez = BS(auth_html.text, 'html.parser')
    # print('Employees = ', soupez)
    f = open(path + '\\cgi-bin\\mats\\sotrudniki.xml', 'w',
             encoding='UTF-8')  # pull data from the employee list
    f.write(str(soupez))  # the waiter's name and id
    f.close()
    f = open(path + '\\cgi-bin\\mats\\db.xml', 'w', encoding='UTF-8')
    f.write('<?xml version="1.0" encoding="UTF-8"?>\n' + '<r>')
    tree = ET.parse(path + '\\cgi-bin\\mats\\sotrudniki.xml')
    root = tree.getroot()
Example #31
class PageWork:
    def __init__(self):
        # grab the current screen (apparently the common approach: dump it to the phone first, then pull and parse it)
        self.update()

    def tapNode(self, text='', name='node', attrs={}):
        cooridinate = self.getPos(text, name, attrs)
        if cooridinate:
            cmd = 'adb shell input tap ' + str(cooridinate[0]) + ' ' + str(
                cooridinate[1])
            os.system(cmd)
        else:
            raise RuntimeError('未能找到指定元素')
        self.update()

    def slide(self, direct=1, len=100, time=50):
        wsize = self.getWindowSize()
        midx = wsize[0] // 2
        midy = wsize[1] // 2
        if direct == 1:
            nextx, nexty = str(midx), str(midy + len)
        elif direct == 2:
            nextx, nexty = str(midx), str(midy - len)
        elif direct == 3:
            nextx, nexty = str(midx - len), str(midy)
        elif direct == 4:
            nextx, nexty = str(midx + len), str(midy)
        else:
            nextx, nexty = str(midx), str(midy)
        cmd = 'adb shell input swipe {} {} {} {} {}'.format(
            str(midx), str(midy), nextx, nexty, time)
        os.system(cmd)

    def getPos(self, text='', name='node', attrs={}):
        attrs['text'] = text
        cell = self.content.find(name, attrs=attrs)
        if cell:
            posstr = cell.attrs.get('bounds')
            match = re.search(
                r'\[(?P<lt0>\d+),(?P<lt1>\d+)\]\[(?P<rb0>\d+),(?P<rb1>\d+)\]',
                posstr)
            point_lt = [int(match.group('lt0')), int(match.group('lt1'))]
            point_rb = [int(match.group('rb0')), int(match.group('rb1'))]
            centerpos = [(point_lt[0] + point_rb[0]) // 2,
                         (point_lt[1] + point_rb[1]) // 2]
            return centerpos
        else:
            return None

    def update(self):
        os.popen('adb shell uiautomator dump /sdcard/ui.xml').read()
        # popen is asynchronous: the code below starts while it runs, which can break anything that depends on its result
        # read() runs in the main process and blocks, so execution only continues once popen has finished, keeping the code below safe
        # os.system('adb shell uiautomator dump /sdcard/ui.xml')
        os.popen(
            r'adb pull /sdcard/ui.xml E:\Workplace\Workplace_Python\wp_project\pyGreat\application\ctrlmobile'
        ).read()
        with open(
                r'E:\Workplace\Workplace_Python\wp_project\pyGreat\application\ctrlmobile\ui.xml',
                encoding='utf8') as f:
            self.content = BeautifulSoup(f.read(), 'lxml')

    def getWindowSize(self):
        cmd = 'adb shell wm size'
        res_size = os.popen(cmd).read()
        match = re.search(r'size: (?P<width>\d+)x(?P<height>\d+)', res_size)
        return (int(match.group('width')), int(match.group('height')))

    def save(self, path):
        with open(path, 'w') as f:
            f.write(self.content.__str__())
Example #32
def extractBios():
    global job_list
    global user_bio
    global job_list_lock
    global user_lock_list
    global user_lock_list_lock

    while True:
        time.sleep(random.randint(100,300))
        
        job_list_lock.acquire()
        if len(job_list) == 0:
            job_list_lock.release()
            return
        category, url = job_list.pop()
        job_list_lock.release()
        logging.info('grilling category-url pair: %s %s', category, url)

        base_url = url
        # Now go through all the users of the category and Store for each user their twitter handle and bio
        try:
            while(True):
                time.sleep(random.randint(0,50))
                # Get the web page
                # headers = {'User-agent':'Mozilla/5.0'}
                r = requests.get(url, timeout=20)
                logging.debug("Url request successful for url: %s", url)
                # Extract information from webpage. All the users are in
                # the div with class 'search-cat-user', and the first <a>
                # tag in the div has the twitter user handle as the href
                # field. Add tags and super-tags accordingly (super-tags
                # for each category are stored in the global dictionary).
                print r.status_code
                soup = BeautifulSoup(r.text)  
                all_users = soup.find_all(attrs={"class":"search-cat-user"})                            
                for user in all_users:                                                                  
                    try:                                                                                
                        user_id = user.find('a')['href'][1:]                                            
                        if (user_id == '') or (user_id in user_bio):                                                               
                            continue                                                                    
                        logging.debug('Operating on user: %s', user_id)
                        # Get user bio now
                        user_soup = BeautifulSoup(user.__str__(), 'html.parser')
                        [s.extract() for s in user_soup.find_all(attrs={'class': 'search-cat-user-name'})]
                        found = user_soup.find(attrs={'class': 'search-cat-user-bio'})
                        found = BeautifulSoup(found.__str__(), 'html.parser')
                        bio = re.compile('\n').sub('', found.string)

                        if user_id not in user_lock_list:
                            user_lock_list_lock.acquire()
                            # Test again if it hasn't already been added by another thread
                            if user_id not in user_lock_list:
                                user_lock_list[user_id] = threading.Lock()
                                logging.debug("Creating lock for user: %s", user_id)
                            user_lock_list_lock.release()

                        user_lock_list[user_id].acquire()
                        user_bio[user_id] = bio.strip().encode('ascii', 'ignore')
                        logging.info('user_id:%s bio:%s', user_id, bio.strip().encode('ascii', 'ignore'))
                        print('user_id:', user_id, 'bio', bio.strip().encode('ascii', 'ignore'))
                        user_lock_list[user_id].release()

                    except:                              
                        logging.error("Error in: %s", user.find('a'))                                              
                        logging.error(traceback.format_exc())
                        # Release any locks still held: releasing an already-released lock throws an exception, hence the try/except blocks.
                        try:
                            user_lock_list_lock.release()
                        except:
                            pass
                        try:
                            user_lock_list[user_id].release()
                        except:
                            pass
        
                # Once all users on a page are done, move on to the next page until the
                # last page is reached, which is verified from the pagination info at the
                # bottom of the page.
                pagination = soup.find(attrs={"class":"pagination"})
                last_page = int(pagination.find(attrs={'title':'Last Page'})['href'].split('/')[-1])
                current_page = int(pagination.find(attrs={'class':'current'})['href'].split('/')[-1])
                logging.debug("Finished operating on url: %s", url)
                if current_page == last_page:
                    logging.info("Last page operation finished for CATEGORY: %s", category)
                    break
                url = base_url+ '/page/' + str(current_page+1)
                logging.debug("Starting on the next page url: %s", url)
        except:
            logging.error(traceback.format_exc())
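
The per-user lock handling in extractBios is a double-checked pattern: test the key without the registry lock, then re-test while holding it so two threads never create a lock for the same user. A minimal standalone sketch of that registry (names are hypothetical):

import threading

_registry_lock = threading.Lock()
_locks = {}

def lock_for(key):
    # The first check is lock-free; the second runs inside the critical
    # section so only one thread can create the lock for a given key.
    if key not in _locks:
        with _registry_lock:
            if key not in _locks:
                _locks[key] = threading.Lock()
    return _locks[key]

# Usage: serialise writes to a shared dict, one lock per user id.
shared_bios = {}
with lock_for('alice'):
    shared_bios['alice'] = 'example bio'
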
Пример #33
0
path = '../../data/data/'
out_file = open('../../data/corpus.txt', 'a', encoding='utf8')
filelist = os.listdir(path)  # every entry in this folder (including sub-folders)
print(filelist)
for files in filelist:  # iterate over all files
    if not fnmatch.fnmatch(files, '*.html'):
        continue
    print(path + files)
    f = open(path + files, 'r+', encoding='utf8')
    txt = f.read()
    soup = BeautifulSoup(txt, 'html.parser')
    for s in soup('script'):
        s.extract()
    for s in soup('style'):
        s.extract()
    txt = soup.__str__()
    reg2 = re.compile('<[^>]*>')
    txt = reg2.sub('', txt)
    reg3 = re.compile('-->')
    txt = reg3.sub('', txt)
    reg4 = re.compile(r'&(\S)?gt')
    txt = reg4.sub('', txt)
    reg5 = re.compile('New!')
    txt = reg5.sub('', txt)
    ans = txt.split("\n")
    print(ans)
    for word in ans:
        if not word == '':
            out_file.write(word)
            out_file.write('\n')
        if word == '法律声明':
Пример #34
0
 def saveBlog(self, destHtml, destPdf, realNamePdf, realNameHtml,
              artical_url):
     htmlContent = self.fixSynaxHighLighter(
         self.login(artical_url, referer=artical_url))
     if htmlContent == False:  # skip pages whose hidden content is no longer available
         return
     soup = BeautifulSoup(htmlContent, features='html5lib')
     self.print("处理图片中......")
     # Convert images to Base64 data URIs
     imgSoup = soup.findAll(name="img")
     id = 0
     for img in imgSoup:
         if 'file:' not in str(img['src']):
             if str(img['src']).startswith('//'):
                 img['src'] = 'https' + img['src']
             id = id + 1
             self.login(img['src'],
                        isImage=True,
                        imageId=str(self.identifier) + '_' + str(id))
             img['src'] = "data:image/jpeg;base64," + self.get_image_file_as_base64_data(
                 self.blogDir + self.bName + self.blogImage +
                 str(self.identifier) + '_' + str(id) +
                 '.jpg').decode('UTF-8')
             try:
                 del (img['width'])
                 del (img['height'])
             except:
                 pass
     # Convert formulas to Base64 images
     self.print("处理公式中......")
     ind = 0
     for (typename, typefsize) in {
             'math/tex': '16px',
             'math/tex; mode=display': '26px'
     }.items():
         latexsoup = soup.findAll(name="script", attrs={"type": typename})
         for l in latexsoup:
             ind = ind + 1
             if ind % 10 == 0:
                 time.sleep(10)
             formulatext = "$$" + str(l.text) + "$$"
             formulatext = formulatext.replace(' ', '')
             all_url = 'http://quicklatex.com/latex3.f'
             Para = {
                 'formula': formulatext,
                 'fsize': typefsize,
                 'fcolor': '000000',
                 'mode': '0',
                 'out': '1',
                 'remhost': 'quicklatex.com',
                 'rnd': random.uniform(0, 100)
             }
             start_html = requests.post(all_url, data=Para)
             img_url = start_html.text.replace("\r\n", " ")
             img_url = img_url.split(' ')
             img_url = img_url[1]
             time.sleep(1.0)
             img = requests.get(img_url)
             f = open(
                 self.blogDir + self.bName + "images\\" + 'formula_' +
                 str(self.identifier) + '_' + str(ind) + '.png', 'wb')
             f.write(img.content)
             f.close()
             latex_img = "<span style=\"position: relative;\"><img style=\"MARGIN:0; PADDING:0\" src=\"data:image/png;base64," + self.get_image_file_as_base64_data(
                 self.blogDir + self.bName + "images\\" + 'formula_' +
                 str(self.identifier) + '_' + str(ind) +
                 '.png').decode('UTF-8') + "\"/></span>"
             l.insert_after(BeautifulSoup(latex_img, features='html5lib'))
     try:
         hidinginfosoup = soup.findAll(
             name="div",
             attrs={
                 "class": "hide-article-box hide-article-pos text-center"
             })[0]
         hidinginfosoup['style'] = 'display: none;'
     except:
         pass
     htmlContent = soup.__str__()
     with open(destHtml, 'w', encoding='utf-8') as f:
         f.write(htmlContent)
     self.print("正在转换为PDF......")
     # Collect the page cookies
     http_cookie = list()
     cj = self.session.cookies.get_dict()
     for (k, v) in cj.items():
         if '%' in v:
             v = urllib.parse.unquote(v)
         http_cookie.append((k, v))
     pdfkit.from_file(
         destHtml,
         destPdf,
         options={
             'custom-header':
             [('Origin', 'https://blog.csdn.net'),
              ('Referer', 'https://blog.csdn.net'),
              ('User-Agent',
               'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:30.0) Gecko/20100101 Firefox/30.0'
               )],
             'cookie':
             http_cookie,
             'enable-local-file-access':
             '',
             'images':
             ''
         })
     while True:
         isExist = os.path.exists(destPdf)
         if isExist:
             break
         time.sleep(1)
     isExist = os.path.exists(realNamePdf)
     if isExist:
         realNamePdf = realNamePdf[:-4] + str(int(time.time())) + '.pdf'
         realNameHtml = realNameHtml[:-5] + str(int(time.time())) + '.html'
     os.rename(destPdf, realNamePdf)
     os.rename(destHtml, realNameHtml)
     self.print("已保存网页: " + realNameHtml)
     time.sleep(2)
     self.print("已保存PDF: " + realNamePdf)