Example #1
 def getNewlyBug(self) -> dict:
     result = {}
     productInfos = self.getProductInfo()
     param = {"charts[]": "openedBugsPerDay"}
     for id, name in productInfos.items():
         date_list = []
         value_list = []
         self.opener.open(self.bugBrowseTemp % id)
         response = self.opener.open(
             self.bugReportTemp % id,
             urllib.parse.urlencode(param).encode('utf-8'))
         soup = BeautifulSoup(response.read(),
                              "html.parser",
                              from_encoding="utf8")
         date_data = soup.find_all(name="td",
                                   attrs={'class': 'chart-label'})
         for item in date_data:
             if len(item.contents) == 1:
                 date = str(item.contents[0])
                 date_list.append(date)
         value_data = soup.find_all(name="td",
                                    attrs={'class': 'chart-value'})
         for item in value_data:
             if len(item.contents) == 1:
                 value_list.append(int(item.contents[0]))
         # date_list = date_list[-60:]
         # value_list = value_list[-60:]
         bugs_dict = dict(zip(date_list, value_list))
         result[name] = bugs_dict
         soup.clear()
     return result
Example #2
def get_webflow(response):
    '''Get the lt and execution parameters required by the course-selection page'''
    soup = BeautifulSoup(response.text, 'html.parser')
    lt = soup.find('input', {'name': 'lt'})['value']
    execution = soup.find('input', {'name': 'execution'})['value']
    soup.clear()
    return (lt, execution)
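
A rough, generalized sketch of the same idea (not part of the original snippet; the URL and function name are placeholders): collect every hidden form field of a login page into a dict in one pass, then clear the soup.

from bs4 import BeautifulSoup
import requests

def get_hidden_fields(url):
    # Placeholder URL; any page whose login form carries hidden inputs (such as lt/execution) works.
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    # Map every <input type="hidden"> to name -> value.
    fields = {tag['name']: tag.get('value', '')
              for tag in soup.find_all('input', {'type': 'hidden'})
              if tag.has_attr('name')}
    soup.clear()
    return fields
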
Example #3
def get_pois(page_url):
    try:
        poi_request = requests.get(page_url)
        poi_soup = BeautifulSoup(poi_request.text)
        tb_div = poi_soup.find_all('table', class_='table table-bordered table-striped table-hover data-table')
        # Table header cells
        thead_ele = tb_div[0].find_all('th')
        # Table rows
        rows = tb_div[0].find_all('tr')

        data_path = r'D:\Code\gis-poi\data\poi.txt'
        data = open(data_path, 'a', encoding='utf-8')

        for row in rows:
            cells = row.find_all('td')
            # Special characters must be escaped before insertion; this is incomplete and only escapes single quotes
            sql = "insert into " \
                  "tb_poi(f_name,f_pname,f_cname,f_dname,f_dcode,f_tel,f_area,f_address,f_b,f_s,f_x,f_y) " \
                  "values('{0}','{1}','{2}','{3}','{4}','{5}','{6}','{7}','{8}','{9}',{10},{11});".format(
                cells[1].text.replace("'", "\\'"), cells[2].text.replace("'", "\\'"),
                cells[3].text.replace("'", "\\'"), cells[4].text.replace("'", "\\'"),
                cells[5].text.replace("'", "\\'"), cells[6].text.replace("'", "\\'"),
                cells[7].text.replace("'", "\\'"), cells[8].text.replace("'", "\\'"),
                cells[9].text.replace("'", "\\'"), cells[10].text.replace("'", "\\'"),
                cells[11].text.replace("'", "\\'"), cells[12].text.replace("'", "\\'"))
            data.write(sql + "\n")
        data.close()

        poi_soup.clear()
        poi_request.close()
    except Exception as e:
        print(e)
Example #4
class BS4Parser:
    def __init__(self, *args, **kwargs):
        # list type param of "feature" arg is not currently correctly tested by bs4 (r353)
        # so for now, adjust param to provide possible values until the issue is addressed
        kwargs_new = {}
        for k, v in kwargs.items():
            if 'features' in k and isinstance(v, list):
                v = [
                    item for item in v if item in
                    ['html5lib', 'html.parser', 'html', 'lxml', 'xml']
                ][0]

            kwargs_new[k] = v

        tag, attr = [
            x in kwargs_new and kwargs_new.pop(x) or y
            for (x, y) in [('tag', 'table'), ('attr', '')]
        ]
        if attr:
            args = (re.sub(
                r'(?is).*(<%(tag)s[^>]+%(attr)s[^>]*>.*</%(tag)s>).*' % {
                    'tag': tag,
                    'attr': attr
                }, r'<html><head></head><body>\1</body></html>',
                args[0]).strip(), ) + args[1:]

        self.soup = BeautifulSoup(*args, **kwargs_new)

    def __enter__(self):
        return self.soup

    def __exit__(self, exc_ty, exc_val, tb):
        self.soup.clear(True)
        self.soup = None
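
A minimal usage sketch for the context manager above (the HTML string is invented for illustration): the markup is parsed on entry and the tree is cleared with clear(True) on exit.

html = "<html><body><table><tr><td>cell</td></tr></table></body></html>"

with BS4Parser(html, features="html.parser") as soup:
    # Work with the parsed tree inside the block.
    rows = soup.find_all("tr")
    print(len(rows))
# On exit, __exit__ calls soup.clear(True), decomposing the tree and freeing memory.
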
Example #5
def post(request):
    ###########################################################
    ##### New users: build the DIR first, then use that DIR for whatever needs to be crawled #####
    ##### because we want a single interface to deal with this constantly changing e-learning platform #####
    ###########################################################
    if request.method == "POST":  # the crawler code starts here
        cID = request.POST['stuID']
        cPassword = request.POST['pwd']
        url = "http://ecampus.nqu.edu.tw/eCampus3P/Learn/LoginPage2/product_login.aspx"  #ecampus的login url
        headers = {
            'user-agent':
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'
        }
        # not sure yet how to capture the headers, but they should all be about the same
        resp = requests.post(
            url,
            headers=headers,
            data={
                '__EVENTTARGET': '',
                '__EVENTARGUMENT': '',
                '__VIEWSTATE':
                ' /wEPDwUKMjAzODk5NzA3Mg8WAh4EX2N0bAUMYnRuTG9naW5IZWxwFgICAw9kFiYCAQ8WAh4KYmFja2dyb3VuZAUWaW1hZ2VzL3poLVRXL2xvZ2luLmdpZhYMAgEPFgIeBXN0eWxlBRpwb3NpdGlvbjpyZWxhdGl2ZTtsZWZ0OjBweBYCAgEPDxYCHghJbWFnZVVybAUTaW1hZ2VzL3poLVRXL2lkLmdpZmRkAgMPFgIfAgUacG9zaXRpb246cmVsYXRpdmU7bGVmdDowcHhkAgUPFgIfAgUacG9zaXRpb246cmVsYXRpdmU7bGVmdDowcHgWAmYPZBYCAgEPDxYCHwMFGWltYWdlcy96aC1UVy9wYXNzd29yZC5naWZkZAIHDxYCHwIFGnBvc2l0aW9uOnJlbGF0aXZlO2xlZnQ6MHB4ZAIJD2QWCAIBDw8WBh4IQ3NzQ2xhc3MFC21lbnVfdGV4dDAyHgRUZXh0BQ5b5b+Y6KiY5a+G56K8XR4EXyFTQgICZGQCAw8PFgYfBAUQbWVudV90ZXh0MDJfb190dx8FBQ5b55m75YWl6Kqq5piOXR8GAgJkZAIFDw8WBh8EBQttZW51X3RleHQwMh8FBQ5b6Kiq5a6i5Y+D6KeAXR8GAgJkZAIHDw8WCB8EBQttZW51X3RleHQwMh8FBQ5b5Y+D6KeA6Kqy56iLXR8GAgIeB1Zpc2libGVoZGQCCw8PFgIfAwUcaW1hZ2VzL3poLVRXL2xvZ2luIEVudGVyLmpwZxYEHgtvbm1vdXNlb3ZlcgU4amF2YXNjcmlwdDp0aGlzLnNyYz0naW1hZ2VzL3poLVRXL2xvZ2luIEVudGVyX292ZHcuanBnJzseCm9ubW91c2VvdXQFM2phdmFzY3JpcHQ6dGhpcy5zcmM9J2ltYWdlcy96aC1UVy9sb2dpbiBFbnRlci5qcGcnO2QCAw8PFgIfAwUTaW1hZ2VzL3poLVRXL0dCLmdpZmRkAgQPDxYCHwMFE2ltYWdlcy96aC1UVy9Fbi5naWZkZAIGDw8WAh8DBRZpbWFnZXMvemgtVFcvdGl0ZWwuanBnZGQCCA8PFgYfBAULbWVudV90ZXh0MDIfBQUOW+ebuOmXnOmAo+e1kF0fBgICZGQCCg8PFgYfBAULbWVudV90ZXh0MDIfBQUOW+W5s+WPsOS7i+e0uV0fBgICZGQCDA8PFgYfBAULbWVudV90ZXh0MDIfBQUOW+W4uOimi+WVj+mhjF0fBgICZGQCDg8PFgYfBAULbWVudV90ZXh0MDIfBQUOW+mAo+e1oeaIkeWAkV0fBgICZGQCEA8PFggfBAULbWVudV90ZXh0MDIfBQUOW+eUs+iri+W4s+iZn10fBgICHwdoZGQCFA8PFgIfAwUdaW1hZ2VzL3poLVRXL21haW4gcGljdHVyZS5qcGdkZAIWDxYCHwEFH2ltYWdlcy96aC1UVy9sb2dpbiB0ZXh0IHBhbi5qcGdkAhgPDxYCHwMFFWltYWdlcy96aC1UVy9uZXdzLmpwZ2RkAhwPDxYCHwMFGmltYWdlcy96aC1UVy9mcmFtZV90b3AuZ2lmZGQCHg8WAh8BBR9pbWFnZXMvemgtVFcvbG9naW4gdGV4dCBwYW4uanBnZAIgDxYEHgZoZWlnaHQFBTI0MHB4HgNzcmMFFy4uL2xvZ2luX0hlbHBJbmRleC5hc3B4ZAIiDxYCHwEFGGltYWdlcy96aC1UVy9mcmFtZV9SLmdpZmQCJA8PFgIfAwUaaW1hZ2VzL3poLVRXL2ZyYW1lX2Rvdy5naWZkZAIoDxYEHwUFHGVDYW1wdXMgSUlJIHYxLjYuMDkxOTguMDEwNDAfB2dkAi4PDxYCHwMFH2ltYWdlcy96aC1UVy9sb2dvIG9mIDNwcm9iZS5naWZkZBgBBR5fX0NvbnRyb2xzUmVxdWlyZVBvc3RCYWNrS2V5X18WAwUIYnRuTG9naW4FCmJ0bkNoaW5lc2UFCmJ0bkVuZ2xpc2hqzHG9hdaHqyty7OyKa8boh3mpUA==',
                '__VIEWSTATEGENERATOR': '8B4B7C2A',
                'txtLoginId': cID,  # studentID
                'txtLoginPwd': cPassword,  # password
                'btnLogin.x': '44',  # the exact value probably doesn't matter
                'btnLogin.y': '25',  # the exact value probably doesn't matter
            })
        soup = BeautifulSoup(resp.text, "lxml")
        course_Name = []
        course_URL = []
        for i in soup.find_all(id=re.compile("CourseName")):
            course_Name.append(i.string)
        for i in soup.find_all(
                "input", {'url': re.compile('stu_course_default.aspx?')}):
            course_URL.append(i["url"][33:69])
        cName = soup.find(id="ctl00_lbAccount").string
        user_Dict = dict(zip(course_Name, course_URL))
        if len(user_Dict) == 0:
            return render(request, "error_login.html",
                          eooro_login=True)  #回傳到index做登入錯誤
        soup.decompose = True
        soup.clear()
        try:
            unit = userData.objects.get(cID=cID, cPassword=cPassword)
            print("有人登入拉")
        except:
            cCurrAccID = find_CurrAccID(
                "http://ecampus.nqu.edu.tw/eCampus3P/Learn/stu_course_default.aspx?CourseId="
                + course_URL[0] + "&Role=Student", cID, cPassword)
            unit = userData.objects.create(cID=cID,
                                           cPassword=cPassword,
                                           cCurrAccID=cCurrAccID[10:],
                                           cName=cName)
            unit.save()
            print("有人創帳號嘍")
        request.session["user_id"] = cID  #設定本站的session
        return render(request, "course/course.html",
                      locals())  # renders the course.html template
Example #6
 def parseHtml(self, html):
     items = list()
     bs = BeautifulSoup(html, 'html.parser')
     for rootelement in bs.find('ul', attrs={'id': 's-results-list-atf'}):
         for ref in rootelement.find('a'):
             items.append(item(ref['alt'], ref.parent['href']))
     bs.clear(decompose=True)
     return items
Example #7
def convertHTML2PDF(html_files):
    # Call sorting function
    ordered_html_files = filterAndSortHTMLfiles(html_files)

    for file in ordered_html_files:

        print(file)

        file_text = codecs.open(path + '/' + file, 'r')
        html_text = file_text.read()

        soup = BeautifulSoup(html_text, 'lxml')
        div_fhwrapper = soup.find("div", {"class": "fh-wrapper"})  # "header", "class": "border-vertical"})

        div_fhwrapper.find('div', {"class": "header"}).decompose()
        div_fhwrapper.find('div', {"class": "blank-class-outer footer visible-md visible-lg"}).decompose()
        div_fhwrapper.find('div', {"class": "footer-small visible-xs visible-sm"}).decompose()

        # adds # before link
        changeLinksInDIV(div_fhwrapper)

        div_breadcrumbs = soup.find("div", {"class": "breadcrumbs"})

        # sets id name to div
        # it is necessary for anchor links
        div_breadcrumbs.attrs['id'] = str(file)

        # decreases padding
        div_jumbotron = soup.find("div", {"class": "jumbotron"})
        div_jumbotron.attrs[
            'style'] = "padding-top: 5px !important; padding-bottom: 10px !important; margin-bottom: 0px !important;"

        # deletes empty bullets in list
        for x in div_fhwrapper.findAll('li'):
            # print(x)
            if len(x.get_text(strip=True)) == 0:
                x.extract()

        get_image_file_as_base64_data(div_fhwrapper)

        content = str(div_fhwrapper)
        outputfile.write(content)

        soup.clear()
        # print(content)

        # Close read file in this iteration
        file_text.close()

    meshfree_file = codecs.open(path + '/MESHFREE.html', 'r')
    meshfree_text = meshfree_file.read()

    meshfree_soup = BeautifulSoup(meshfree_text, 'lxml')
    div_footer = meshfree_soup.find("div", {"class": "footer-small visible-xs visible-sm"})
    changeLinksInDIV(div_footer)
    content_footer = str(div_footer)
    meshfree_file.close()
    outputfile.write(content_footer)
Example #8
def RaspUFG(pag, nArq):

    arq = open('UFG-ValorMensal' + str(nArq) + '.txt', 'w')

    # Loop over all 374 pages listing UFG employees
    for j in range(pag, min(374, pag + 100)):
        # Build the "soup" from the current page of the UFG transparency portal
        html = urllib2.urlopen(
            'http://www.portaldatransparencia.gov.br/servidores/OrgaoLotacao-ListaServidores.asp?CodOS=15000&DescOS=MINISTERIO%20DA%20EDUCACAO&CodOrg=26235&DescOrg=UNIVERSIDADE%20FEDERAL%20DE%20GOIAS&Pagina='
            + str(j))
        bsObj = BeautifulSoup(html, "html.parser")

        # Get the table elements containing the employees' data
        tabela = bsObj.find(
            "table", {"summary": "Lista de servidores lotados por órgão"})

        teto = 20000.00

        # Extract the data from the table elements
        if tabela != None:
            tds = tabela.findAll("a")
            for link in tds:
                link2 = link.get('href')
                nome = link.contents[0]
                idServidor = link2[44:51]

                if idServidor != "":
                    html = urllib2.urlopen(
                        'http://www.portaldatransparencia.gov.br/servidores/Servidor-DetalhaRemuneracao.asp?Op=3&IdServidor='
                        + str(idServidor) +
                        '&CodOS=15000&CodOrgao=26235&bInformacaoFinanceira=True'
                    )
                    bsObj = BeautifulSoup(html, "html.parser")

                    # Get the table elements with the employee's data
                    salario = bsObj.find(
                        "tr", {"class": "remuneracaolinhatotalliquida"})
                    if salario != None:
                        salarioValor = salario.find("td", {
                            "class": "colunaValor"
                        }).get_text()
                    else:
                        salarioValor = ""

                    textoVM = []

                    if salarioValor != "":
                        if (float(salarioValor.strip().replace(
                                '.', '').replace(',', '.')) >= teto):
                            print nome
                            print salarioValor

                            textoVM.append('Nome:' + nome + '\n')
                            textoVM.append('Salario:' + salarioValor + '\n')
                            arq.writelines(textoVM)

        # Clear the "soup"
        bsObj.clear()
Example #9
    def create_raw_descs(link_inside):
        def _remove_all_attrs(text):  # removing tag attributes
            for tag in text.find_all(True):
                tag.attrs = {}
            return text

        # FORMING THE DESCRIPTIONS

        page = requests.get(link_inside)  # getting the object from url
        soup = BeautifulSoup(page.content,
                             'html.parser')  # loading it into the soup
        desc_divs = []  # a list for all descs' divs

        # the code below is a sample, do NOT paste it in your project as is
        """
        main_heading = soup.find("h1")
        main_heading.name = "div"  # change the name for uniformity
        if main_heading:
            desc_divs.append(main_heading)
        else:
            pass

        main_desc = soup.find("div", class_="product-main-text")  # main desc
        if main_desc:
            desc_divs.append(main_desc)
        else:
            pass

        features_table_desc = soup.find("div", class_="title")  # features (table heading)
        if features_table_desc:
            desc_divs.append(features_table_desc)
        else:
            pass

        features_table = soup.find("table", class_="table-striped")  # features table
        if features_table:
            desc_divs.append(features_table)
        else:
            pass

        catalog_detail_block = soup.find("div", class_="catalog_detail_info")  # text after features table
        if catalog_detail_block:
            desc_divs.append(catalog_detail_block)
        else:
            pass
        """

        soup.clear()  # clearing the old soup

        for desc_div in desc_divs:  # loading the new soup with objects from the list
            soup.append(desc_div)

        soup_without_attrs = _remove_all_attrs(
            soup)  # removing all unnecessary attrs

        return soup_without_attrs
Example #10
class BS4Parser:
    def __init__(self, *args, **kwargs):
        self.soup = BeautifulSoup(*args, **kwargs)

    def __enter__(self):
        return self.soup

    def __exit__(self, exc_ty, exc_val, tb):
        self.soup.clear(True)
        self.soup = None
Example #11
class BS4Parser(object):
    def __init__(self, *args, **kwargs):
        self.soup = BeautifulSoup(*args, **kwargs)

    def __enter__(self):
        return self.soup

    def __exit__(self, exc_ty, exc_val, tb):
        _ = exc_ty, exc_val, tb  # Throw away unused values
        self.soup.clear(True)
        self.soup = None
Example #13
def data_search(r_text, s_text):
    from bs4 import BeautifulSoup
    soup = BeautifulSoup(r_text, "html.parser")
    # read the data
    dirty_desc = soup.find_all("div", class_="pxc-prod-detail-txt")
    dirty_short_desc = soup.h1

    # check for a short description; if there is none, return '0', since on the Phoenix site that means no data
    if dirty_short_desc is None:
        return '0'

    dirty_tech_data = soup.find_all("table", class_="pxc-tbl")
    # len(soup.find_all("table", class_="pxc-tbl")) can count num of tables, and split them
    # test = soup.find_all("table", class_="pxc-tbl")[0]
    soup.clear()
    # extract the text
    desc = str(dirty_desc)
    soup = BeautifulSoup(desc, "html.parser")
    desc = soup.get_text()
    desc = desc.split("\n")
    # extract the text
    short_desc = str(dirty_short_desc)
    soup = BeautifulSoup(short_desc, "html.parser")
    short_desc = soup.get_text()
    # extract the text
    tech_data = str(dirty_tech_data)
    soup = BeautifulSoup(tech_data, "html.parser")
    tech_data = soup.get_text()
    tech_data = tech_data.split("\n")
    tech_data1 = []
    # reassemble the text data
    for a in tech_data:
        if a != '' and a != '[' and a != ']' and a != ', ':
            tech_data1.append(a)

    # load the data into the parser
    soup = BeautifulSoup(s_text, "html.parser")
    # read the data
    dirty_comm_data = soup.find("table", class_="pxc-tbl")
    soup.clear()
    # extract the text
    comm_data = str(dirty_comm_data)
    soup = BeautifulSoup(comm_data, "html.parser")
    comm_data = soup.get_text()
    comm_data = comm_data.split("\n")
    comm_data1 = []
    # reassemble the text data
    for a in comm_data:
        if a != '':
            comm_data1.append(a)

    return short_desc, desc[1], tech_data1, comm_data1
Example #14
def convertHTML2PDF(html_files):
    # Call sorting function
    ordered_html_files = filterAndSortHTMLfiles(html_files)

    meshfree_file = codecs.open(path + '/MESHFREE.html', 'r')
    meshfree_text = meshfree_file.read()

    meshfree_soup = BeautifulSoup(meshfree_text, 'lxml')
    div_footer = meshfree_soup.find(
        "div", {"class": "blank-class-outer footer visible-md visible-lg"})
    changeLinksInDIV(div_footer)
    content_footer = str(div_footer)
    meshfree_file.close()

    for file in ordered_html_files:
        print(file)

        file_text = codecs.open(path + '/' + file, 'r')
        html_text = file_text.read()

        soup = BeautifulSoup(html_text, 'lxml')
        div_fhwrapper = soup.find(
            "div",
            {"class": "fh-wrapper"})  # "header", "class": "border-vertical"})

        div_fhwrapper.find('div', {"class": "header"}).decompose()
        div_fhwrapper.find(
            'div', {
                "class": "blank-class-outer footer visible-md visible-lg"
            }).decompose()

        # adds # before link
        changeLinksInDIV(div_fhwrapper)

        div_breadcrumbs = soup.find("div", {"class": "breadcrumbs"})

        # sets id name to div
        # it is necessary for anchor links
        div_breadcrumbs.attrs['id'] = str(file)

        # style = "display:block; clear:both; page-break-after:always;"

        content = str(div_fhwrapper)
        outputfile.write(content)

        soup.clear()
        # print(content)

        # Close read file in this iteration
        file_text.close()

    outputfile.write(content_footer)
Example #15
def find_byid(id):
    print('start to spider to find tweets')
    print('id is ' + id)
    URL = 'https://twitter.com/%s' % id
    print(URL)
    URL_MOBILE = 'https://mobile.twitter.com'
    f = request.urlopen(URL)
    # html = f.read().encode('utf-8')
    soup = BeautifulSoup(f, 'lxml')
    find = soup.find_all(
        class_='tweet-timestamp js-permalink js-nav js-tooltip', limit=5)
    print(find)
    re_url = []
    re_text = []
    for tag in find:
        dic = tag.attrs
        url = URL_MOBILE + dic['href']
        re_url.append(url)
        # print(dic['href'][1:] +' this is ' + dic['title'])
    # print(soup)
    find = soup.find_all(
        class_='TweetTextSize TweetTextSize--normal js-tweet-text tweet-text',
        limit=5)
    # print(find)
    for tag in find:
        # print(type(tag))
        # print(tag)
        try:
            str = tag.string
            if str == None:
                str = ''
                try:
                    for intag in tag:
                        # intag =tag[0]
                        if not intag.string == None:
                            str += intag.string
                    print(str)
                    re_text.append(str)
                except:
                    print('error to read a string in tag')
            else:
                re_text.append(str)
        except:
            print('error to read the string')
    print(re_text)
    print(re_url)
    f.close()
    soup.clear()
    re = {'pic': re_url, 'text': re_text}
    return re
Example #16
 def get_webflow(self):
     """
     Fetch the webflow serial number. Visiting any CSDN page that links to the login page
     returns this data; since it changes dynamically, fetch it first and keep it for later use.
     :return:
     """
     url = 'https://passport.csdn.net/account/login?ref=toolbar'
     response = self.session.get(url=url, headers=self.headers)
     soup = BeautifulSoup(response.text, 'html.parser')
     lt = soup.find('input', {'name': 'lt'})['value']
     execution = soup.find('input', {'name': 'execution'})['value']
     # Release objects that are no longer needed
     soup.clear()
     return (lt, execution)
Example #18
def fill_cupple_tr(tr):
     for td in tr.find_all('td'):
          if is_title_td(td):
               k = td.next_sibling.next_sibling
               if k and (not is_title_td(k)):
                    if get_title_td(td).find('意见') != -1:
                         new_tag = BeautifulSoup().new_tag('textarea',id=get_title_pinyin_td(td))
                         new_tag['rows'] = 20
                         new_tag.clear()
                    else:
                         new_tag = BeautifulSoup().new_tag('input',id=get_title_pinyin_td(td))
                         new_tag['value'] = new_tag['id']
                    if k.input:
                         k.input.replace_with(new_tag)  
                    else:
                         k.append(new_tag)
Example #19
def _trySuggestions(platform: str, region: int, soup: bs4.BeautifulSoup) -> bs4.BeautifulSoup:
    """
    Goes through the list of games and tries to find one that matches the platform
    :param platform: The platform we're looking for
    :param region: Which region. 0: NTSC (JP), 1: NTSC (NA), 2: PAL
    :param soup: BeautifulSoup object
    :return: BeautifulSoup object for new page if found, else a NoneType BeautifulSoup object
    """

    logger.info("Couldn't find game at url. Trying alternatives...")

    titleUrlRegex = re.compile(r'href=\".*?\"')
    titles = soup.find_all("td", {"class": "title"})
    consoles = soup.find_all("td", {"class": "console"})
    url = ""

    for title, console in zip(titles, consoles):
        if console.text.lower().replace(" ", "-") == _platforms[platform][region]:
            url = titleUrlRegex.findall(title.decode()).pop()[5:].strip('"')
            break

    if len(url) > 0:
        logger.info(f"New url found: {url}")
        res = requests.get(url)
        soup = bs4.BeautifulSoup(res.text, "html.parser")
        return soup

    logger.info("Couldn't find title in alternate urls.")
    return soup.clear()
Example #20
def get_pages(url_path, city_url):
    try:
        city_request = requests.get(city_url)
        city_soup = BeautifulSoup(city_request.text)
        # Get the pagination list element
        page_ele = city_soup.find_all('ul', class_='pagination pagination-sm mar-t5')
        page_hrefs = page_ele[0].find_all('a')
        for page_href in page_hrefs:
            if str.strip(page_href.get('href')) == '':
                continue
            page_url = url_path + page_href.get('href')
            get_pois(page_url)
        city_soup.clear()
        city_request.close()
    except Exception as e:
        print(e)
Example #21
def get_catogrys(url_path):
    try:
        url_request = requests.get(url_path)
        url_soup = BeautifulSoup(url_request.text)
        # Get the category elements: <div class="catgory"></div>
        catgory_eles = url_soup.find_all('div', class_='catgory')
        if catgory_eles is None or len(catgory_eles) == 0:
            raise Exception('failed to read the category information')
        for catgory_ele in catgory_eles:
            # Get the href attribute of the <a> tag inside the category element
            catgory_url = url_path + catgory_ele.find_all('a')[0].get('href')
            get_citys(url_path, catgory_url)
        # Clear the objects after use to free memory
        url_soup.clear()
        url_request.close()
    except Exception as e:
        print(e)
Example #22
def fill_cupple_tr(tr):
    for td in tr.find_all('td'):
        if is_title_td(td):
            k = td.next_sibling.next_sibling
            if k and (not is_title_td(k)):
                if get_title_td(td).find('意见') != -1:
                    new_tag = BeautifulSoup().new_tag(
                        'textarea', id=get_title_pinyin_td(td))
                    new_tag['rows'] = 20
                    new_tag.clear()
                else:
                    new_tag = BeautifulSoup().new_tag(
                        'input', id=get_title_pinyin_td(td))
                    new_tag['value'] = new_tag['id']
                if k.input:
                    k.input.replace_with(new_tag)
                else:
                    k.append(new_tag)
Example #23
def do_glosary_desc_page(url, download_dir):
    content_type = site_urls[url]['content-type']
    page_file_name = url_to_file_name(url, content_type)
    input_file_path = download_dir + page_file_name

    with open(input_file_path, 'r') as f:
        html = f.read()

    page = BeautifulSoup(html, "html5lib")
    glossary_detail = page.find("div", class_="main-glossary-detail-container")
    #page = BeautifulSoup(html, "html.parser")
    head_lines_text = '''
    <html>
    <head>
    <link href="/assets/style.css" rel="stylesheet">
    <link href="/assets/style-override.css" rel="stylesheet">
    <style>
    #currentGlossaryText
        {font-weight: 700!important;
        font-size: 18px!important;
        display: block!important;
    }
    .hidden {
        display: block!important;
    }
    </style>
    </head>
    <body>
    '''
    head_lines = BeautifulSoup(head_lines_text, 'html.parser')
    bottom_lines_text = '''
    </body>
    </html>
    '''
    bottom_lines = BeautifulSoup(bottom_lines_text, 'html.parser')
    page.clear()
    page.append(head_lines)
    page.append(glossary_detail)
    page.append(bottom_lines)

    page = fix_links(page, url)

    return page, page_file_name
Example #24
    def test_search():  # pylint: disable=too-many-locals
        """
        Test searching
        """
        url = "http://kickass.to/"
        search_url = (
            "http://kickass.to/usearch/American%20Dad%21%20S08%20-S08E%20category%3Atv/?field=seeders&sorder=desc"
        )

        html = getURL(search_url, session=requests.Session())
        if not html:
            return

        soup = BeautifulSoup(html, "html5lib")

        torrent_table = soup.find("table", attrs={"class": "data"})
        torrent_rows = torrent_table.find_all("tr") if torrent_table else []

        # cleanup memory
        soup.clear(True)

        # Continue only if one Release is found
        if len(torrent_rows) < 2:
            print "The data returned does not contain any torrents"
            return

        for row in torrent_rows[1:]:
            try:
                link = urlparse.urljoin(url, (row.find("div", {"class": "torrentname"}).find_all("a")[1])["href"])
                _id = row.get("id")[-7:]
                title = (row.find("div", {"class": "torrentname"}).find_all("a")[1]).text or (
                    row.find("div", {"class": "torrentname"}).find_all("a")[2]
                ).text
                url = row.find("a", "imagnet")["href"]
                verified = True if row.find("a", "iverify") else False
                trusted = True if row.find("img", {"alt": "verified"}) else False
                seeders = int(row.find_all("td")[-2].text)
                leechers = int(row.find_all("td")[-1].text)
                _ = link, _id, verified, trusted, seeders, leechers
            except (AttributeError, TypeError):
                continue

            print title
Example #25
    def test_search():  # pylint: disable=too-many-locals
        """
        Test searching
        """
        url = 'http://kickass.to/'
        search_url = 'http://kickass.to/usearch/American%20Dad%21%20S08%20-S08E%20category%3Atv/?field=seeders&sorder=desc'

        html = getURL(search_url, session=make_session(), returns='text')
        if not html:
            return

        soup = BeautifulSoup(html, 'html5lib')

        torrent_table = soup.find('table', attrs={'class': 'data'})
        torrent_rows = torrent_table.find_all('tr') if torrent_table else []

        # cleanup memory
        soup.clear(True)

        # Continue only if one Release is found
        if len(torrent_rows) < 2:
            print "The data returned does not contain any torrents"
            return

        for row in torrent_rows[1:]:
            try:
                link = urlparse.urljoin(url, (row.find('div', {
                    'class': 'torrentname'
                }).find_all('a')[1])['href'])
                _id = row.get('id')[-7:]
                title = (row.find('div', {'class': 'torrentname'}).find_all('a')[1]).text \
                    or (row.find('div', {'class': 'torrentname'}).find_all('a')[2]).text
                url = row.find('a', 'imagnet')['href']
                verified = True if row.find('a', 'iverify') else False
                trusted = True if row.find('img',
                                           {'alt': 'verified'}) else False
                seeders = int(row.find_all('td')[-2].text)
                leechers = int(row.find_all('td')[-1].text)
                _ = link, _id, verified, trusted, seeders, leechers
            except (AttributeError, TypeError):
                continue

            print title
Example #26
def rasparUSP(pag, nArq):
    # Open, in write mode, the output file for above-cap salaries based on the monthly amount
    arq = open('USP-ValorMensal' + str(nArq) + '.txt', 'w')

    # Loop over all 895 pages listing USP employees
    for j in range(pag, min(896, pag + 100)):
        # Build the "soup" from the current page of the USP transparency portal
        html = urlopen(
            'https://uspdigital.usp.br/portaltransparencia/portaltransparenciaListar?paginar=s&dtainictc=01%2F12%2F2016&nompes=&nomundorg=&nomdepset=&tipcon=&tipcla=&nomabvfnc=&Submit=Solicitar+pesquisa&reload=buscar&imagem=S&print=true&chars=21ni&pag='
            + str(j))
        bsObj = BeautifulSoup(html, "html.parser")

        # Get the table with the employees' data
        tabela = bsObj.find("table", {"class": "table_list"})
        # Extract the cells from the table
        tds = tabela.findAll("td")

        # Salary cap for the state of São Paulo
        teto = 21631.05
        # List used to collect the data of interest
        textoVM = []

        # Loop over all rows, skipping the header
        for i in range(14, len(tds), 14):
            # If the monthly amount exceeds the cap, add the employee's data to the list
            if (float(tds[i + 12].getText().strip().replace('.', '').replace(
                    ',', '.')) >= teto):
                textoVM.append('Nome:' + tds[i].getText() + '\n')
                textoVM.append('Instituto:' + tds[i + 2].getText() + '\n')
                textoVM.append('Função:' + tds[i + 8].getText() + '\n')
                textoVM.append('Salário:' + tds[i + 12].getText() + '\n')
                textoVM.append('\n\n')

        # Write the formatted data of the above-cap employee to the file
        arq.writelines(textoVM)
        # Clear the "soup"
        bsObj.clear()
        # Print which page was scraped, for the user's reference
        print(j)

    # Close the output file
    arq.close()
Example #27
 def __class_finder(self, session_code):
     self.session.post(OpenClassSearcher.URL, data=self.clg_trm_dict)
     self.cls_details_dict["class_session"] = session_code
     soup = BeautifulSoup(
         self.session.post(OpenClassSearcher.URL,
                           data=self.cls_details_dict).content,
         'html.parser')
     results = soup.find_all("td", {"class": "cunylite_LEVEL3GRIDROW"})
     i = 0
     for elem in results:
         val = elem.text.strip()
         if match("^\\d+$", val) and self.class_num_5_digit == int(val):
             self.found = True
             if elem.find_next("img")["title"] == "Open":
                 self.status = True
                 self.session.close()
             break
         i = i + 1
     soup.clear(decompose=True)
     return self
Example #28
def get_citys(url_path, catogry):
    try:
        catgory_request = requests.get(catogry)
        catgory_soup = BeautifulSoup(catgory_request.text)
        # Read the city elements
        city_eles = catgory_soup.find_all('div', class_='col-xs-10')
        for city_ele in city_eles:
            city_eles = city_ele.find_all('a')
            for city_ele in city_eles:
                # The href in the city element is only the tail of the full URL, so join it with the base path
                city_url = url_path + city_ele.get('href')
                if city_url.find('北京') == -1:
                    continue
                print(city_url)
                get_pages(url_path, city_url)
        # Clean up the objects when done
        catgory_soup.clear()
        catgory_request.close()
    except Exception as e:
        print(e)
Example #29
    def test_search(self):
        self.url = 'http://kickass.to/'
        searchURL = 'http://kickass.to/usearch/American%20Dad%21%20S08%20-S08E%20category%3Atv/?field=seeders&sorder=desc'

        html = getURL(searchURL, session=requests.Session())
        if not html:
            return

        soup = BeautifulSoup(html, features=["html5lib", "permissive"])

        torrent_table = soup.find('table', attrs={'class': 'data'})
        torrent_rows = torrent_table.find_all('tr') if torrent_table else []

        # cleanup memory
        soup.clear(True)

        #Continue only if one Release is found
        if len(torrent_rows) < 2:
            print(u"The data returned does not contain any torrents")
            return

        for tr in torrent_rows[1:]:

            try:
                link = urlparse.urljoin(self.url,
                                        (tr.find('div', {
                                            'class': 'torrentname'
                                        }).find_all('a')[1])['href'])
                id = tr.get('id')[-7:]
                title = (tr.find('div', {'class': 'torrentname'}).find_all('a')[1]).text \
                    or (tr.find('div', {'class': 'torrentname'}).find_all('a')[2]).text
                url = tr.find('a', 'imagnet')['href']
                verified = True if tr.find('a', 'iverify') else False
                trusted = True if tr.find('img',
                                          {'alt': 'verified'}) else False
                seeders = int(tr.find_all('td')[-2].text)
                leechers = int(tr.find_all('td')[-1].text)
            except (AttributeError, TypeError):
                continue

            print title
Example #30
    def test_search():  # pylint: disable=too-many-locals
        """
        Test searching
        """
        url = 'http://kickass.to/'
        search_url = 'http://kickass.to/usearch/American%20Dad%21%20S08%20-S08E%20category%3Atv/?field=seeders&sorder=desc'

        html = getURL(search_url, session=requests.Session(), returns='text')
        if not html:
            return

        soup = BeautifulSoup(html, 'html5lib')

        torrent_table = soup.find('table', attrs={'class': 'data'})
        torrent_rows = torrent_table.find_all('tr') if torrent_table else []

        # cleanup memory
        soup.clear(True)

        # Continue only if one Release is found
        if len(torrent_rows) < 2:
            print "The data returned does not contain any torrents"
            return

        for row in torrent_rows[1:]:
            try:
                link = urlparse.urljoin(url, (row.find('div', {'class': 'torrentname'}).find_all('a')[1])['href'])
                _id = row.get('id')[-7:]
                title = (row.find('div', {'class': 'torrentname'}).find_all('a')[1]).text \
                    or (row.find('div', {'class': 'torrentname'}).find_all('a')[2]).text
                url = row.find('a', 'imagnet')['href']
                verified = True if row.find('a', 'iverify') else False
                trusted = True if row.find('img', {'alt': 'verified'}) else False
                seeders = int(row.find_all('td')[-2].text)
                leechers = int(row.find_all('td')[-1].text)
                _ = link, _id, verified, trusted, seeders, leechers
            except (AttributeError, TypeError):
                continue

            print title
Example #31
class BS4Parser:
    def __init__(self, *args, **kwargs):
        # list type param of "feature" arg is not currently correctly tested by bs4 (r353)
        # so for now, adjust param to provide possible values until the issue is addressed
        kwargs_new = {}
        for k, v in kwargs.items():
            if 'features' in k and isinstance(v, list):
                v = [
                    item for item in v if item in
                    ['html5lib', 'html.parser', 'html', 'lxml', 'xml']
                ][0]

            kwargs_new[k] = v

        self.soup = BeautifulSoup(*args, **kwargs_new)

    def __enter__(self):
        return self.soup

    def __exit__(self, exc_ty, exc_val, tb):
        self.soup.clear(True)
        self.soup = None
Example #32
    def test_search(self):
        self.url = 'http://kickass.to/'
        searchURL = 'http://kickass.to/usearch/American%20Dad%21%20S08%20-S08E%20category%3Atv/?field=seeders&sorder=desc'

        html = getURL(searchURL, session=requests.Session())
        if not html:
            return

        soup = BeautifulSoup(html, features=["html5lib", "permissive"])

        torrent_table = soup.find('table', attrs={'class': 'data'})
        torrent_rows = torrent_table.find_all('tr') if torrent_table else []

        # cleanup memory
        soup.clear(True)

        #Continue only if one Release is found
        if len(torrent_rows) < 2:
            print(u"The data returned does not contain any torrents")
            return

        for tr in torrent_rows[1:]:

            try:
                link = urlparse.urljoin(self.url, (tr.find('div', {'class': 'torrentname'}).find_all('a')[1])['href'])
                id = tr.get('id')[-7:]
                title = (tr.find('div', {'class': 'torrentname'}).find_all('a')[1]).text \
                    or (tr.find('div', {'class': 'torrentname'}).find_all('a')[2]).text
                url = tr.find('a', 'imagnet')['href']
                verified = True if tr.find('a', 'iverify') else False
                trusted = True if tr.find('img', {'alt': 'verified'}) else False
                seeders = int(tr.find_all('td')[-2].text)
                leechers = int(tr.find_all('td')[-1].text)
            except (AttributeError, TypeError):
                continue

            print title
Example #33
def extract(html, start, end):
    """Extract a snippet out of an HTML document.

    Locations are computed over UTF-8 bytes and don't count HTML tags.

    Extraction is aware of tags, so:

    >>> '<p><u>Hello</u> there <i>World</i></p>'[17:27]
    'here <i>Wo'
    >>> extract('<p><u>Hello</u> there <i>World</i></p>', 7, 14)
    '<p>here <i>Wo</i></p>'
    """
    soup = BeautifulSoup(html, 'html5lib')

    # Trim the right side first, because that doesn't mess our start position
    if end is not None:
        e = find_pos(soup, end, False)
        e[0].replace_with(NavigableString(split_utf8(e[0].string, e[1])[0]))
        delete_right(soup, e[2])

    # Trim the left side
    if start is not None:
        s = find_pos(soup, start, True)
        s[0].replace_with(NavigableString(split_utf8(s[0].string, s[1])[1]))
        delete_left(soup, s[2])

    # Remove everything but body
    body = soup.body
    soup.clear()
    soup.append(body)

    # Remove the body tag itself to only have the contents
    soup.body.unwrap()

    # Back to text
    return str(soup)
Example #34
class BS4Parser:
    def __init__(self, *args, **kwargs):
        # list type param of "feature" arg is not currently correctly tested by bs4 (r353)
        # so for now, adjust param to provide possible values until the issue is addressed
        kwargs_new = {}
        for k, v in kwargs.items():
            if 'features' in k and isinstance(v, list):
                v = [item for item in v if item in ['html5lib', 'html.parser', 'html', 'lxml', 'xml']][0]

            kwargs_new[k] = v

        tag, attr = [x in kwargs_new and kwargs_new.pop(x) or y for (x, y) in [('tag', 'table'), ('attr', '')]]
        if attr:
            args = (re.sub(r'(?is).*(<%(tag)s[^>]+%(attr)s[^>]*>.*</%(tag)s>).*' % {'tag': tag, 'attr': attr},
                           r'<html><head></head><body>\1</body></html>', args[0]).strip(),) + args[1:]

        self.soup = BeautifulSoup(*args, **kwargs_new)

    def __enter__(self):
        return self.soup

    def __exit__(self, exc_ty, exc_val, tb):
        self.soup.clear(True)
        self.soup = None
Example #35
def convertHTML2PDF(html_files):

    # Get sorted list of html files from Web Documentation folder
    #ordered_html_files = filterAndSortHTMLfiles(html_files)
    ordered_html_files = html_files

    # Add Header
    createHeader()
    #header_text = createHeader()
    #outputfile.write(header_text)

    # Add Table of Contents
    outline_text = createOutline(html_files)
    outputfile.write(outline_text)

    outputfile.write("""<div class="my-fh-wrapper"> """)

    ChapterNum = ''
    ChapterNum1 = 1
    ChapterNum2 = 0
    ChapterNum3 = 0
    ChapterNum4 = 0

    # After the header, append each file's content to the prepared combined html file
    for file in ordered_html_files:

        print(file)

        ####
        level = 1

        if file == 'MESHFREE.html':
            level = 1
            prev_splitted_file = file.split(".")
        else:
            #print(prev_splitted_file)

            splitted_file = file.split(".")
            #print(splitted_file)

            num_level = min(len(prev_splitted_file), len(splitted_file))

            for lev in range(num_level):
                if splitted_file[lev] != prev_splitted_file[lev]:
                    break
                else:
                    level = level + 1
            """
            if level == 1:
                ChapterNum1=ChapterNum1+1
                ChapterNum2=0
                ChapterNum3=0
                ChapterNum4=0
                ChapterNum = str(ChapterNum1) 
            """

            if level == 2:
                ChapterNum2 = ChapterNum2 + 1
                ChapterNum3 = 0
                ChapterNum4 = 0
                ChapterNum = str(ChapterNum2) + '. '
            if level == 3:
                ChapterNum3 = ChapterNum3 + 1
                ChapterNum4 = 0
                ChapterNum = str(ChapterNum2) + '.' + str(ChapterNum3) + '. '
            if level == 4:
                ChapterNum4 = ChapterNum4 + 1
                ChapterNum = str(ChapterNum2) + '.' + str(
                    ChapterNum3) + '.' + str(ChapterNum4) + '. '

            prev_splitted_file = splitted_file

        if level > 4:
            level = -1

        #if file.split(".")[0] == 'Index':
        #    level=-1

        ####

        file_text = codecs.open(path + '/' + file, 'r')
        html_text = file_text.read()

        soup = BeautifulSoup(html_text, 'lxml')
        div_fhwrapper = soup.find(
            "div",
            {"class": "fh-wrapper"})  # "header", "class": "border-vertical"})

        div_fhwrapper.find('div', {"class": "header"}).decompose()

        # Delete footer
        #if div_fhwrapper.find('div', {"class": "blank-class-outer-top footer"}):
        #    div_fhwrapper.find('div', {"class": "blank-class-outer-top footer"}).decompose()

        # Change link in DOWNLOAD COMPREHENSIVE EXAMPLE to download files from svn
        if div_fhwrapper.find('div',
                              {"class": "blank-class-outer-top footer"}):
            div_footer_download = div_fhwrapper.find(
                'div', {"class": "blank-class-outer-top footer"})
            changeLinksInDIV(div_footer_download)

        if div_fhwrapper.find(
                'div',
            {"class": "blank-class-outer footer visible-md visible-lg"}):
            div_fhwrapper.find(
                'div', {
                    "class": "blank-class-outer footer visible-md visible-lg"
                }).decompose()
        if div_fhwrapper.find('div',
                              {"class": "footer-small visible-xs visible-sm"}):
            div_fhwrapper.find('div', {
                "class": "footer-small visible-xs visible-sm"
            }).decompose()

        # Change style of div "fh-wrapper"
        div_fhwrapper.attrs[
            'style'] = "margin-left: 2.5rem; margin-right: 2.5rem; display: block;"

        # adds # before link, to navigate inside pdf
        changeLinksInDIV(div_fhwrapper)

        #rename_h1_h2_h3_to_p(div_fhwrapper)

        # Delete tables with header text : 'This item referenced in:'
        if div_fhwrapper.find('div', {"class": "blank-class-outer-top"}):
            #div_blank_class_outer_top = div_fhwrapper.find('div', {"class": "blank-class-outer-top"})
            #deleteThisItemReferencedIN(div_blank_class_outer_top)
            for div_blank_class_outer_top in div_fhwrapper.findAll(
                    'div', {"class": "blank-class-outer-top"}):
                # be careful not to change .pdf files links
                deleteThisItemReferencedIN(div_blank_class_outer_top)

        if not div_fhwrapper.find('div', {"class": "blank-class-outer-top"}):
            div_description = soup.find("div", {"class": "description"})
            if str(div_description.text).strip() == "":
                continue

        # Deletes empty <li> tag in this file
        for x in div_fhwrapper.findAll('li'):
            #print(str(x.get_text(strip=True)))
            if len(x.get_text(strip=True)) == 0:
                x.extract()
            for p in x.findAll('p'):
                p.replaceWithChildren()

        # Find div breadcrumbs
        div_breadcrumbs = soup.find("div", {"class": "breadcrumbs"})

        # sets id name to div
        # it is necessary for anchor links
        div_breadcrumbs.attrs['id'] = str(file)

        # Change styles
        div_bordervertical = div_fhwrapper.find("div",
                                                {"class": "border-vertical"})
        if div_bordervertical:
            div_bordervertical.attrs[
                'style'] = 'padding-left: 10px; padding-right: 10px;'

        div_jumbotron = soup.find("div", {"class": "jumbotron"})
        div_jumbotron.attrs[
            'style'] = "padding-top: 5px !important; padding-bottom: 10px !important; margin-bottom: 0px !important;"

        rename_h1_h2_h3_to_p(div_jumbotron, level, ChapterNum)

        # Renames div 'jumbotron' to 'my-jumbotron'
        # That is needed not to inherit many default styles
        for div in soup.find_all('div', class_='jumbotron'):
            pos = div.attrs['class'].index('jumbotron')
            div.attrs['class'][pos] = 'my-jumbotron'

        div_description = soup.find("div", {"class": "description"})
        rename_h1_h2_h3_to_p(div_description)

        for div in div_description.find_all('table', {"id": "customTable"}):
            spanRowsofTable(div)
        """
        Deletes extra break lines before the text
        """
        strip_text(div_description, file)

        resize_img_responsive(div_description)

        # Replaces image to base64 format
        get_image_file_as_base64_data(div_fhwrapper)

        # Renames div 'fh-wrapper' to 'my-fh-wrapper'
        # That is needed not to inherit many default styles
        for div in soup.find_all('div', class_='fh-wrapper'):
            pos = div.attrs['class'].index('fh-wrapper')
            div.attrs['class'][pos] = 'my-fh-wrapper'

        #for div in soup.find_all('div', class_='code'):
        #    pos = div.attrs['class'].index('code')
        #    div.attrs['class'][pos] = 'my-code'

        # Trims break lines and spaces of code at the start and end
        for div in soup.findAll("div", {"class": "note"}):
            divcode = div.find("div", {"class": "code"})
            if divcode:
                divcode.string = divcode.get_text().strip()

        for div in soup.find_all('div', class_='note'):
            pos = div.attrs['class'].index('note')
            div.attrs['class'][pos] = 'my-note'

        if file == 'MESHFREE.InstallationGuide.Execute.CommandLine.html':
            for div in soup.find_all("div", {"class": "my-note"}):
                div.attrs['style'] = 'white-space: pre-wrap;'

        # Write to prepared html file content of div 'my-fh-wrapper'
        div_myfhwrapper = soup.find("div", {"class": "my-fh-wrapper"})
        #content = str(div_myfhwrapper)

        for text in div_myfhwrapper.find_all(recursive=False):
            #print(j)
            #find_all(recursive=False)
            outputfile.write(str(text))

        #outputfile.write(content)

        soup.clear()
        # print(content)

        # Close read file in this iteration
        file_text.close()

    outputfile.write("""</div>""")
    """
    # Add footer to file
    meshfree_file = codecs.open(path + '/MESHFREE.html', 'r')
    meshfree_text = meshfree_file.read()

    meshfree_soup = BeautifulSoup(meshfree_text, 'lxml')
    # Find footer
    div_footer = meshfree_soup.find("div", {"class": "footer-small visible-xs visible-sm"})
    # Change links if necessary
    changeLinksInDIV(div_footer)
    content_footer = str(div_footer)
    meshfree_file.close()
    # write to file
    outputfile.write(content_footer)
    """

    content_footer = """
    <div class="footer-small visible-xs visible-sm">
        <div class="blank-class-outer-bottom">
            <div class="blank-class-inner">
                <div class="row ">
                    <div class="col-md-12">
                        <a href="#MESHFREE.Releases.html" target="_blank">Releases</a>
                    </div>
                </div>
            </div>
        </div>
        <div class="blank-class-outer-bottom">
            <div class="blank-class-inner">
                <div class="row ">
                    <div class="col-md-12">
                        <a href="https://svn.itwm.fraunhofer.de/svn/MESHFREEdocu/Executables/" target="_blank">Executables</a>
                    </div>
                </div>
            </div>
        </div>
        <div class="blank-class-outer-left-right">
            <div class="blank-class-inner">
                <div class="row ">
                    <div class="col-md-12">
                        <a href="http://itwm.fraunhofer.de" target="_blank"> &#169; 2020 Fraunhofer Institute for Industrial Mathematics ITWM</a>
                    </div>
                </div>
            </div>
        </div>
    </div>

    """
    outputfile.write(content_footer)
Example #36
        data = table[row].findChildren(name="td")

        # Getting name and id of the player card from the <a> element on futbin
        player_a_element = data[0].findChild(name="a", attrs={"class": "player_name_players_table"})
        player_url = player_a_element['href']
        player_id = player_url.split("/")[3]
        player_name = player_a_element.get_text()
        player_data.append(player_id)
        player_data.append(player_name)

        # Getting overall rating of player
        rating = data[1].findChild(name="span").get_text()
        player_data.append(rating)

        # Getting the stats of the player
        # pace: 8, shooting: 9, passing: 10, dribbling: 11, defending: 12, physicality: 13
        pace = 8
        physicality = 13
        for stat_num in range(pace, physicality):
            stat = data[stat_num].findChild(name="span").get_text()
            player_data.append(stat)

    
        player_string = ",".join(player_data)
        file.write(player_string + "\n")

    soup.clear()
    page.close()

file.close()
Example #37
def highlight(html, highlights, show_tags=False):
    """Highlight part of an HTML documents.

    :param highlights: Iterable of (start, end, tags) triples, which are
        computed over UTF-8 bytes and don't count HTML tags
    :param show_tags: Whether to show the tag names within brackets after each
        highlight
    """
    # Build a list of starting points and ending points
    starts = []
    ends = []
    for hl in highlights:
        starts.append((hl[0], 'start', []))
        if len(hl) == 2:
            ends.append((hl[1], 'end', []))
        else:
            ends.append((hl[1], 'end', hl[2]))
    # This relies on the fact that 'end' < 'start'
    events = sorted(ends + starts)

    events = iter(events)
    soup = BeautifulSoup(html, 'html5lib')

    pos = 0
    node = soup
    highlighting = 0
    try:
        event_pos, event_type, tags = next(events)
    except StopIteration:
        event_pos = event_type = tags = None

    while node is not None:
        if getattr(node, 'contents', None):
            # Move down
            node = node.contents[0]
            continue

        if isinstance(node, NavigableString):
            # Move through text
            nb = len(node.string.encode('utf-8'))
            while event_pos is not None:
                if event_pos == pos and event_type == 'start':
                    # Start highlighting at beginning of text node
                    highlighting += 1
                    try:
                        event_pos, event_type, tags = next(events)
                    except StopIteration:
                        event_pos = None
                elif pos + nb > event_pos:
                    # Next event falls inside of this text node
                    if event_type == 'start' and highlighting:
                        # Keep highlighting (can't highlight *more*)
                        highlighting += 1
                    elif (
                        event_type == 'end'
                        and not show_tags
                        and highlighting > 1
                    ):
                        # Keep highlighting (no need to put labels)
                        highlighting -= 1
                    else:  # 'end' and (show_tags or highlighting becomes 0)
                        # Split it
                        char_idx = byte_to_str_index(
                            node.string,
                            event_pos - pos,
                        )
                        left = node.string[:char_idx]
                        right = node.string[char_idx:]

                        # Left part
                        newnode = NavigableString(left)
                        if highlighting:
                            # Optionally highlight left part
                            span = soup.new_tag(
                                'span',
                                attrs={'class': 'highlight'},
                            )
                            span.append(newnode)
                            newnode = span
                        node.replace_with(newnode)
                        node = newnode

                        if event_type == 'start':
                            highlighting += 1
                        else:
                            highlighting -= 1
                            if show_tags:
                                # Add tag labels
                                comment = soup.new_tag(
                                    'span',
                                    attrs={'class': 'taglist'},
                                )
                                comment.string = ' [%s]' % ', '.join(tags)
                                node.insert_after(comment)
                                node = comment

                        # Right part
                        newnode = NavigableString(right)
                        node.insert_after(newnode)
                        node = newnode
                        nb -= event_pos - pos
                        pos = event_pos
                        # Next loop will highlight right part if needed

                    try:
                        event_pos, event_type, tags = next(events)
                    except StopIteration:
                        event_pos = None
                elif highlighting:  # and pos + nb <= event_pos:
                    # Highlight whole text node
                    newnode = soup.new_tag(
                        'span',
                        attrs={'class': 'highlight'},
                    )
                    node.replace_with(newnode)
                    newnode.append(node)
                    node = newnode
                    if pos + nb == event_pos and event_type == 'end':
                        if show_tags:
                            comment = soup.new_tag(
                                'span',
                                attrs={'class': 'taglist'},
                            )
                            comment.string = ' [%s]' % ', '.join(tags)
                            newnode.insert_after(comment)
                            node = comment
                        highlighting -= 1
                        try:
                            event_pos, event_type, tags = next(events)
                        except StopIteration:
                            event_pos = None
                    break
                else:  # not highlighting and pos + nb <= event_pos
                    # Skip whole text node
                    break

            pos += nb

        # Move up until there's a sibling
        while not node.next_sibling and node.parent:
            node = node.parent
        if not node.parent:
            break
        # Move to next node
        node = node.next_sibling

    # Remove everything but body
    body = soup.body
    soup.clear()
    soup.append(body)

    # Remove the body tag itself to only have the contents
    soup.body.unwrap()

    # Back to text
    return str(soup)
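
A hedged usage sketch for the function above, assuming highlight and its helpers (e.g. byte_to_str_index) are importable from the same module; offsets count text bytes only, so (0, 5) covers the word "Hello".

snippet = highlight('<p>Hello World</p>', [(0, 5)])
print(snippet)
# Expected to be roughly: <p><span class="highlight">Hello</span> World</p>
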
Example #38
        if "Киләһе бит" in nav_tag.string:
            next_page_link = HOST_URL + nav_tag.get('href')

    # getting info from found pages
    for link in to_parse:
        webpage = urllib.request.urlopen(link)
        soup = BeautifulSoup(webpage)
        link.encode('utf-8')
        name = soup.find('title').string
        for tag_refl in soup.find_all('ol', class_="references"):
            amount = 0
            for tag_link in tag_refl.contents:
                if tag_link.name == 'li':
                    amount += 1
            results.append((name, link, amount))
        soup.clear()        
        """
        looking through tag_refl.contents
        """
        
# end of reading Wiki

# sorting what we have
results.sort(key=lambda x: -x[2])
results = results[:limit + 1]

# Jinja code interpreting the results
result_page = Template(u'''\
<html>
<head><title>Results of searching</title></head>
<body>