Example #1
def rando():
    # clip off the last semicolon, then split up the separate queries
    reqs = request.query_string.decode('utf-8')[:-1].split(";")
    querry = []
    # make a list of queries to quinterest
    for r in reqs:
        querry.append(formatreq(r))
    if(len(querry) > 25):
        querry = querry[:25]
    questions = []
    for q in querry:
        out = get("http://quinterest.org{}".format(q)).text
        out = BeautifulSoup(out, 'html.parser').find_all(attrs={"class":"row"})
        out.pop(0)
        for e in out:
            #insert the query and replace button at the end of the question
            querystr = q[23:]
            querystr = sub('amount=[0-9]+','amount=1',querystr)
            span = BeautifulSoup('<span class="subjTag" style="display:none"></span>', 'html.parser').span
            repbutton = BeautifulSoup('<button class="btn repbutton" onclick="replaceQuestion($(this))">Replace This Question</button>', 'html.parser').button
            span.string = querystr
            e.div.append(span)
            e.div.append(repbutton)
            questions.append(str(e))
    questions = processQuestions(questions)
    return ("<br>".join(questions))
Example #2
def CleanFile(document):
    regex = re.compile(r'\d{2}/\d{2}/\d{4},.\d{2}:\d{2}')
    CleanStartTime = time.time()
    cleaned = BeautifulSoup(document, "lxml").get_text()
    cleaned = cleaned.split(" ")
    if (len(cleaned) < 290):
        print(
            "Wrong type of file, please choose a facebook messenger history file."
        )
        quit()
    else:
        for i in range(290):
            cleaned.pop(0)
    cleaned = " ".join(cleaned)
    cleanedwregex = re.split(regex, cleaned)
    listofdates = re.findall(regex, cleaned)
    CleanEndTime = time.time()
    print("HTML cleaned in " + str("%.2f" % (CleanEndTime - CleanStartTime)) +
          "seconds")

    PrintStartTime = time.time()
    gucciString = ""
    for i in range(len(cleanedwregex) - 1):
        gucciString += listofdates[i] + " | " + cleanedwregex[i] + "\n"

    print(gucciString)
    PrintEndTime = time.time()
    print("Printed to console in " +
          str("%.2f" % (PrintEndTime - PrintStartTime)) + "seconds")

    return gucciString
Example #4
    def getMoodle(self, name, id, paths, scriptDesc, sheetDesc):
        print("Checke %s" % (name))
        
        loginurl = 'https://elearning2.uni-heidelberg.de/login/index.php'
        loginpayload = self.moodlePayload
        url = 'https://elearning2.uni-heidelberg.de/course/view.php?id=%s' % (id)
        scriptPath = paths['scripts']
        excercisePath = paths['sheets']
        miscPath = paths['misc']

        self.s.post(loginurl, loginpayload)

        weeks = BeautifulSoup(self.s.get(url).content, "html.parser").find("ul", {"class": "weeks"}).findAll("div", {"class": "content"})
        weeks.pop(0)
        
        for section in weeks:
            for part in section.findAll("li", {"class": "activity resource modtype_resource "}):
                text = part.find("span", {"class": "instancename"}).text
                if(text == ""):
                    break
                if(text.endswith(" Datei")):
                    text = text[:-6]
                r = self.s.get(part.find("a").get("href"))
                if(text.startswith(scriptDesc)):
                    self.download_pdf(r.url, scriptPath, text)
                elif(text.startswith(sheetDesc)):
                    self.download_pdf(r.url, excercisePath, text)
                else:
                    self.download_pdf(r.url, miscPath, text)
Example #5
    def parse_items(self, page):
        match = re.findall('</table>(<table.+?<th>Item.+?</table>)', page, flags=re.S)
        if not match:
            return None
        rows = BeautifulSoup(match[0], 'html.parser').find_all('tr')
        rows.pop(0) # first header row
        rows.pop(0) # second header row
        i = ""
        while rows:
            data = rows.pop(0).find_all('td')
            item_number = int(data[0].get_text(strip=True))
            item_description = data[1].get_text(' ', strip=True)
            item_catalog_family = data[2].get_text(' ', strip=True)
            item_quantity = float(data[3].get_text(strip=True).replace(',',''))
            item_uom = data[4].get_text(' ', strip=True)
            try:
                item_unit_price = float(data[5].get_text(strip=True).replace(',',''))
            except:
                item_unit_price = None
            i += item_description + " " + item_catalog_family + "\n"
            self.items.append(dict(item_number=item_number, item_description=item_description,
                item_catalog_family=item_catalog_family, item_quantity=item_quantity,
                item_uom=item_uom, item_unit_price=item_unit_price))
            if rows: # sometimes item tables don't have a hanging last row
                rows.pop(0) # every second row is useless
#        print items
        self.items_text += i
Example #6
def get_options_dates():
    html = get_data('http://www.nseindia.com/live_market/dynaContent/live_watch/fxTracker/optChainDataByExpDates.jsp?symbol=USDINR&instrument=OPTCUR')
    optDates = SoupStrainer('select', {'id': 'expirydate'})
    data = BeautifulSoup(html, 'html.parser', parse_only=optDates).find_all("option")
    data.pop(0)
    dates = []
    for i in data:
        dates.append(i.get_text())
    return json.dumps({'options_dates':dates})
Example #7
 def _handle_screening_questions(self,
                                 answer_questions: bool,
                                 collect_q_and_a: bool,
                                 wait=10) -> None:
     for _ in range(10):
         try:
             self._select_resume()
             if collect_q_and_a or answer_questions:
                 questions = BeautifulSoup(
                     self._browser.page_source,
                     'lxml').findAll(class_=compile_regex('Questions'))
                 if questions:
                     questions.pop(0)
                     for div in questions:
                         labels = div.findAll('label')
                         if not labels:
                             self._select_continue(wait)
                             continue
                         question_found = labels.pop(0).get_text()\
                             .replace('(optional)', '').strip()
                         if not question_found:
                             self._select_continue(wait)
                             continue
                         select = div.findAll('select')
                         if select:
                             for element in select:
                                 labels = element.findAll('option')
                                 answers_found = self._get_answers_set(
                                     labels)
                                 if not answers_found:
                                     self._select_continue(wait)
                                     break
                                 if answer_questions:
                                     self._answer_question(
                                         div, question_found, answers_found)
                         else:
                             answers_found = self._get_answers_set(labels)
                             if not answers_found:
                                 self._select_continue(wait)
                             if answer_questions:
                                 self._answer_question(
                                     div, question_found, answers_found)
                         if collect_q_and_a:
                             if question_found in self._q_and_a:
                                 self._q_and_a[question_found].update(
                                     answers_found)
                             else:
                                 self._q_and_a[
                                     question_found] = answers_found
             self._select_continue(wait)
         except TimeoutException:
             break
         except NoSuchElementException:
             print('NoSuchElementException encountered!')
             break
     return None
Example #8
 def all_url(self, url):
     html = requests.get(url)
     all_a = BeautifulSoup(html.text,
                           'lxml').find('div', class_='all').find_all('a')
     all_a.pop(0)
     for a in all_a:
         title = a.get_text()
         print('开始保存:', title)
         path = str(title).replace('?', '_')
         self.mkdir(path)
         href = a['href']
         self.html(href)
Example #9
def download_table(table_url):
    """Returns a BeautifulSoup ResultSet with all tables from the WikiCommons
    Hanzi decomposition project. Each table is contained in a <pre> tag."""
    # get a list of all <pre>-elements
    print('Downloading from {}...'.format(table_url), end='')
    decomp_html = download(table_url).text
    print('Done.')
    decomp_soup = BeautifulSoup(decomp_html, 'html.parser').find_all('pre')
    # remove first part that describes the table
    if re.search(r'1\.[^2]+2\.', decomp_soup[0].string, re.DOTALL):
        decomp_soup.pop(0)

    return decomp_soup
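
A minimal usage sketch for the function above; the Wikimedia Commons URL and the download() helper are assumptions (download() is treated here as a thin wrapper around requests.get), not part of the original example:

# Hypothetical driver for download_table(); the URL and download() are assumptions.
import re
import requests
from bs4 import BeautifulSoup  # needed by download_table() above

def download(url):
    # stand-in for the helper used by download_table(): a plain GET request
    return requests.get(url)

if __name__ == '__main__':
    # assumed location of the WikiCommons Hanzi decomposition tables
    url = 'https://commons.wikimedia.org/wiki/Commons:Chinese_characters_decomposition'
    for pre_tag in download_table(url):
        # each <pre> tag holds one plain-text decomposition table
        print(pre_tag.get_text()[:200])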
Example #10
 def all_url(self, url):
     html = request.get(url, 3)
     # html = Download.get(self, url 3)
     all_a = BeautifulSoup(html.text,
                           'lxml').find('div', class_='all').find_all('a')
     all_a.pop(0)
     for a in all_a:
         title = a.get_text()
         print(u'开始保存: ', title)
         path = str(title).replace("?", '_')
         self.mkdirs(path)
         href = a['href']
         self.html(href)
Example #11
 def all_url(self, url):
     html = self.request(url)  ## call the request function with the album URL; it returns a response
     all_a = BeautifulSoup(html.text,
                           'lxml').find('div', class_='all').find_all('a')
     # the page changed and gained an extra "early images" entry that needs to be removed (feel free to try handling that page yourself)
     all_a.pop(0)
     # the line above removes the first element of the list
     for a in all_a:
         title = a.get_text()
         print(u'开始保存:', title)  ## print a little progress message so it is not too dull
         path = str(title).replace("?", '_')  ## some titles contain "?", which Windows cannot use in folder names, so replace it
         self.mkdir(path)  ## call mkdir to create the folder; note that path here is the title
         href = a['href']
         self.html(href)  ## call the html function and pass href along; href is the album address
Example #12
def upcoming():
    # curl -d ajax=true -d mod=queue http://www.animenfo.com/radio/nowplaying.php
    page = requests.post(API_URL, data={"ajax": "true", "mod": "queue", "togglefull": "true"})
    results = BeautifulSoup(page.text).findAll("tr")

    results.pop()

    songs = []
    for row in results:
        row = "".join(row.findAll(text=True))
        if row.strip() == "":
            continue
        row = BeautifulStoneSoup(row, convertEntities=BeautifulStoneSoup.HTML_ENTITIES)
        row = row.__str__().strip()
        row = re.sub("\s+", " ", row)
        songs.append(row)
    return songs
Example #13
    def parse_actions(self, page):
        match = re.findall('<th>Otras Acciones de la Convocatoria(.*?)</table>', page, flags=re.S)
        if not match:
            return None
        self.action_html = match[0]
        rows = BeautifulSoup(match[0], 'html.parser').find_all('td')
        self.actions = []
        self.last_action = ''
        self.last_action_on = None
        n = 0
        while rows:
            n += 1
            row = rows.pop(0)
            if row.span:
                row.span.extract()
            a = row.u.extract().get_text() if row.u else None
            row_text = row.get_text(' ', strip=True)
            row_text = re.sub('informado el d.a', '', row_text)
            row_text = re.sub(r'el (\d+/\d+/\d+) a las (\d+:\d+)', r'\1 \2', row_text)
            match = re.findall('(.*?)(\d+/\d+/\d\d\d\d \d+:\d+)(.*)', row_text, flags=re.S)
            if match:
                try:
                    d = datetime.datetime.strptime(match[0][1], '%d/%m/%Y %H:%M')
                except ValueError:
                    d = None
                t = (match[0][0].strip(' ,') + ' ' + match[0][2].strip(' ,')).strip()
                if not self.last_action_on or d > self.last_action_on:
                    self.last_action_on = d
                    self.last_action = a + " " + t
                self.actions.append(dict(action_number=n, action_date=d, action_name=a, action_text=t))
            else:
                match = re.findall('(NOTIFICACI.N ELECTR.NICA) (.*)', row_text)
                if match:
                    d = None
                    try:
#                        print "parse_actions(): row.a['onclick'] =", row.a['onclick'], "type", type(row.a['onclick'])
#                        print "parse_actions(): entering match"
                        m = re.search('(\d+).+?(\d+)', row.a['onclick'])
#                        print "parse_actions(): after match"
#                        print "Found notificacion with", m.group(1), "and", m.group(2)
                        uri = SEACE.build_notificacion_uri(m.group(1), m.group(2))
#                        print "parse_actions(): get uri", uri
                        doc = SEACE.get_page(uri)
                        if doc:
                            ds = re.search('(\d+/\d+/\d+ \d+:\d+)', doc)
#                            print "parse_actions(): matched", ds.group(1)
                            if ds:
                                d = datetime.datetime.strptime(ds.group(1), '%d/%m/%Y %H:%M')
                    except:
                        pass
                    a = match[0][0]
                    t = match[0][1]
                    if not self.last_action_on or d > self.last_action_on:
                        self.last_action_on = d
                        self.last_action = a + " " + t
                    self.actions.append(dict(action_number=n, action_date=d, action_name=a, action_text=t))
                else:
                    self.actions.append(dict(action_number=n, action_date=None, action_name=a, action_text=row_text))
Example #14
 def all_url(self, url):
     # html = self.request(url)
     html = request.get(url, 3)
     all_a = BeautifulSoup(html.text, 'lxml').find('div', attrs={'class': 'all'}).find_all('a')
     # the page now shows an extra early-images entry, so it gets removed
     all_a.pop(0)
     for a in all_a:
         title = a.get_text()
         print('开始保存: ', title)
         self.title = title
         path = str(title).replace(":", "")
         self.mkdir(path)
         href = a['href']
         self.url = href  # save the page URL in self.url
         if self.meizitu_collection.find_one({'主题页面': href}):
             print(u'这个页面已经爬取过了')
         else:
             self.html(href)
Example #15
    def list_of_study(self)->List[List[str]]:
        '''Collect the information about each Project.
        The output has the following form:
        --------------------------------------
        SRPxxxxxx
        PRJNAxxxxxxx
        Short Introduction about this project!
        Abstract: about this project!
        --------------------------------------

        Returns
        -------
        List[List[str]]
            The information for each Project is kept as a string in the
            format above, and each such entry becomes one element of the
            returned list.

            Note:
            the list holds the information for every Project found via
            self._bio_pjt_url.

        '''

        ht = requests.get(self._bio_pjt_url)
        soup = BeautifulSoup(ht.content,"html.parser").find_all("tr")
        soup.pop(0) #skip header

        srp = ""   #type:str
        bpjid = "" #type:str
        title = "" #type:str
        abst = ""  #type:str
        list = []  #List[List[str]]
        
        for i in soup:
            title  = (i.find_all("td")[2]).text.strip()
            href = i.find("a").get("href") #href = "?study=SRP~"
            srp = href.split('=')[1]    
            bpj = BioProject(srp)       #type:BioProject       
            abst = bpj.abstract()            
            bpjid = bpj.bioproject_id() 
            list.append([srp,bpjid,title,abst])

        return list
Example #16
def get_uzh_menu():
    # UZH URL actually has the weekday in german
    locale.setlocale(locale.LC_ALL, "de_CH.utf-8")
    curr_day = str(calendar.day_name[(int(NOW.strftime("%w")) + 6) %
                                     7]).lower()

    if is_lunchtime():
        url = "http://www.mensa.uzh.ch/de/menueplaene/zentrum-mensa/{}.html"
    else:
        url = "http://www.mensa.uzh.ch/de/menueplaene/zentrum-mercato-abend/{}.html"
    r = requests.get(url.format(curr_day))

    if UZH_MENSA_NOMEAL_STR not in r.text:
        return "*Cheap mensa:*\nNo UZH menu available for this day!\n\n"

    menu_div = BeautifulSoup(r.text,
                             "html.parser").findAll("div",
                                                    {"class": "text-basics"})
    menu_div.pop(0)

    return "*Cheap mensa:*\n" + uzh_parse_table(menu_div)
Example #17
def get_recipe_list_by_ids(ids):
    """ return a list of dict objects, each a drink recipe with its details"""

    recipes = []

    for drink_id in ids:
        recipe = urlopen(id_lookup_page + str(drink_id))
        recipe = BeautifulSoup(recipe, "html5lib")
        recipe = loads(recipe.body.string)['drinks'][0]

        print("scraped recipe for drink: " + recipe['strDrink'])

        # remove unnecessary keys
        recipe.pop('strVideo')
        recipe.pop('strIBA')
        recipe.pop("strGlass")
        # iterate over a shallow copy because you can't delete while iterating
        for key, val in list(recipe.items()):
            if ("Ingredient" in key or "Measure"
                    in key) and recipe[key] == "":  # delete empty entries
                del recipe[key]

        recipes.append(recipe)

    return recipes
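
A minimal sketch of how get_recipe_list_by_ids() might be driven; the id_lookup_page endpoint and the drink ids below are assumptions chosen to match the TheCocktailDB-style JSON keys the code reads:

# Hypothetical driver for get_recipe_list_by_ids(); the endpoint and ids are assumptions.
from json import loads
from urllib.request import urlopen
from bs4 import BeautifulSoup  # needed by get_recipe_list_by_ids() above

# assumed lookup endpoint returning {"drinks": [...]} JSON for a drink id
id_lookup_page = "https://www.thecocktaildb.com/api/json/v1/1/lookup.php?i="

if __name__ == "__main__":
    for recipe in get_recipe_list_by_ids([11007, 11118]):
        # list the non-empty ingredient entries that survived the cleanup above
        ingredients = [v for k, v in recipe.items() if "Ingredient" in k]
        print(recipe["strDrink"], "->", ingredients)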
Example #18
def get_submits(problem_code, contest_code, page_no="1"):
    # Returns the submissions as a list of dicts with user, score/time, mem, lang and solution fields
    site_root = "https://www.codechef.com"
    try:
        print_info("Trying connection")
        raw = urllib.request.urlopen(site_root +
                            "/ssubmission/prob?page=" + page_no + 
                                            "&pcode=" + problem_code +
                                            "&ccode=" + contest_code).read()
    except urllib.error.HTTPError as e:
        #We retry forever in case of 503
        print_error("HTTPError encountered with status code " + str(e.code))
        if e.code == 503:
            return get_submits(problem_code, contest_code, page_no)
        else:
            print_error("""Check problem_code, contest_code or your internet connection""")
            return {}

    print_info("Connection successful")
    raw = raw.decode('utf-8')
    submission_html = json.loads(raw)["content"]
    submit_list = BeautifulSoup(submission_html, "html.parser").findAll("tr")
    submit_list.pop(0)
    final_data = []
    for sub in submit_list:
        sub_dict = {}
        sub_tag = sub.findAll("td")
        if len(sub_tag) != 5:
             print_error("""Check problem_code, contest_code or your internet connection""")
             break
        sub_dict["user"]        = sub_tag[0].text
        sub_dict["score/time"]  = sub_tag[1].text
        sub_dict["mem"]         = sub_tag[2].text
        sub_dict["lang"]        = sub_tag[3].text
        sub_dict["solution"]    = site_root + sub_tag[4].findAll("a")[0]["href"]
        final_data.append(sub_dict)
    return final_data
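
A quick sketch of calling get_submits(); the problem and contest codes are only examples, and print_info/print_error are assumed to be simple logging helpers defined elsewhere in the original module:

# Hypothetical driver for get_submits(); the codes and logging helpers are assumptions.
import json
import urllib.request
import urllib.error
from bs4 import BeautifulSoup  # needed by get_submits() above

def print_info(msg):
    print("[INFO]", msg)

def print_error(msg):
    print("[ERROR]", msg)

if __name__ == "__main__":
    for sub in get_submits("FLOW001", "PRACTICE"):
        print(sub["user"], sub["score/time"], sub["lang"], sub["solution"])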
Example #19
def renew_proxy_list():
    proxies_page = requests.get('https://free-proxy-list.net/').text
    trs_list = BeautifulSoup(proxies_page, 'lxml').find(
        'table', id='proxylisttable').find('tbody').find_all('tr')
    proxies_list = ""

    for i in range(10):
        ip = trs_list.pop(0).find('td')
        port = ip.find_next_sibling()
        proxy = 'http://' + ip.text + ':' + port.text + '\n'

        proxies_list += proxy

    proxies_list = proxies_list[:-1]  # drop the trailing newline

    with open('proxies.txt', 'w') as proxies:
        proxies.write(proxies_list)
Example #20
    def parse_calendar(self, page):
#        print "Parsing calendar...",
        self.events = []
        match = re.findall(r'colspan=3>Calendario\s+(.+?)</table>', page, flags=re.S)
        if not match:
#            print "not found."
            return False
        rows = BeautifulSoup(match[0], 'html.parser').find_all('tr')
        headers = rows.pop(0).find_all('th')
        headers.pop(0) # no use for the 1st column of the 1st row
        for r in rows:
            rowdata = r.find_all('td')
            tipo = rowdata.pop(0).string # 1st column of row contains event type
            for h in headers:
                hito = re.sub("Fecha ", "", h.string)
                match = re.search('(?P<fecha>\d+/\d+/\d+ \d+:\d+)', rowdata.pop(0).string)
                fecha = datetime.datetime.strptime(match.group('fecha'), "%d/%m/%Y %H:%M") if match else None
                self.events.append(dict(tipo=tipo, hito=hito, fecha=fecha))
        return True
Example #21
def main(date):
    dishes = []

    menus = BeautifulSoup(get_website(), 'html5lib').find_all(
        'table',
        class_='Liste')  #Note: Only html5lib can fix this broken html code.
    days = BeautifulSoup(get_website(),
                         'html5lib').find_all('div', class_='KopfLeiste_o')

    if len(menus) != len(days):
        return ['Leider verstehe ich den Speiseplan heute nicht.']

    date = date.replace(datetime.datetime.now().strftime('%d.%m.%Y'), 'Heute')
    date = date.replace((datetime.date.today() +
                         datetime.timedelta(days=1)).strftime('%d.%m.%Y'),
                        'Morgen')

    for day in days:
        m = menus.pop(0)
        if date in day.find('div').string:
            for dish in m.find_all('tr'):
                if dish.find(class_='Speise'):

                    title = dish.find('td', class_='Speise').text.strip()
                    price = dish.find('td', class_='PreisG').text.strip()

                    #                    annotation = dish.find('td', class_='Nr').text.strip()
                    annotation = MEAT  #default
                    #is it a well-balanced meal and recommended for healthy eating?
                    if str(dish.find(
                            'td',
                            class_='PreisG')).find("apfel-klein.png") != -1:
                        annotation = "%s%s" % (annotation, WELLBALANCEDMEAL)

                    this_dish = '%s %s: *%s*' % (annotation, title, price)
                    dishes.append(this_dish)

    return beautify_menu(dishes) or [
        'Leider kenne ich keinen Speiseplan für diesen Tag.'
    ]
Example #22
    def get_corp_name(self, response):
        # print(response.text)
        tables = BeautifulSoup(response.text,
                               'lxml').find_all("table",
                                                background='images/zj08.gif')
        tr = tables.pop()

        # for td in tds:

        teams = tr.find_all("tr")
        for team in teams:
            # print(team)
            houseDeal = team.find_all("td")
            # for deal in houseDeal:
            #  print(deal.text)
            item = Xinsa1Item()
            item['corp_name'] = houseDeal[0].text.strip()
            item['book_num'] = houseDeal[1].text
            item['order_num'] = houseDeal[2].text
            item['amount'] = houseDeal[3].text
            item['ts'] = houseDeal[4].text
            print(item)
            yield item
Example #23
def getDom(pageurl, charset):

    if charset is None:
        charset = 'utf-8'

    soup = BeautifulSoup(pageurl, 'html.parser', from_encoding=charset)

    # remove specific tags: head, img, script, style and input
    [
        body.extract()
        for body in soup(['head', 'img', 'script', 'style', 'input'])
    ]  #

    # remove HTML comments
    for element in soup(text=lambda text: isinstance(text, Comment)):
        element.extract()
        pass

    # extract the text from soup and store it as a list of strings
    soup = soup.text.strip().lstrip().rstrip().split()

    # get the current date so invalid data (the current system time) can be removed from the soup list
    currentDate = time.strftime('%Y-%m-%d', time.localtime(time.time()))
    # normalize the date format, e.g. 2017-04-20 ----> 2017-4-20
    currentDate1 = currentDate[0:5] + currentDate[6:]
    # get the current time so invalid data (the current system time) can be removed from the soup list
    currentTime = time.strftime('%H:%M', time.localtime(time.time()))
    # truncate the time, e.g. 23:58 ----> 23:5, to allow for clock drift while the program runs
    currentTime1 = currentTime[0:4]

    # remove invalid strings from the soup list to reduce noise
    # remove the copyright block that follows 'Copyright'
    # remove invalid years/months such as "1999" and time strings of the form "2001-2007"
    # remove invalid timestamps such as "最后登录: 2017-04-20 23:55" (last login)
    # remove strings that match the current system time
    re0 = re.compile(r'.*Copyright.*')
    re1 = re.compile(r'.*((19\d{2}\D)|(\d{4}-\d{4}\D)).*')
    re2 = re.compile(r'(^|.*)注册.*(\d{2,4}(-|/))?\d{1,2}(-|/)\d{1,2}$')
    re3 = re.compile(r'.*(' + currentDate1 + '|' + currentDate + ').*')
    re4 = re.compile(r'.*' + currentTime1 + '\d.*')
    re5 = re.compile(r'^最后.*(\d{2,4}(-|/))?\d{1,2}(-|/)\d{1,2}.*')
    for item in soup:

        # remove strings that end with ":" or the full-width ":"
        if item.endswith(":"):
            soup.pop(soup.index(item))
            pass
        if item.endswith(":"):
            soup.pop(soup.index(item))
            pass
        #剔除soup数组中"|"和“»”和“›”文本字符串
        if '|' in soup:
            soup.pop(soup.index('|'))
            pass
        if '>' in soup:
            soup.pop(soup.index('>'))
            pass
        if '»' in soup:
            soup.pop(soup.index('»'))
            pass
        if '›' in soup:
            soup.pop(soup.index('›'))
            pass
        if re0.match(item):
            CopyrightIndex = soup.index(item) - 5
            while CopyrightIndex <= len(soup) - 1:
                popItem = soup.pop(CopyrightIndex)
                pass
            pass
        # remove irregular time strings
        if re1.match(item):
            # get the index timeIndex of the text matched by re1
            timeIndex = soup.index(item)
            # remove the irregular time data so it is not visited again
            soup.pop(timeIndex)
            pass

        if re2.match(item):
            # get the index timeIndex of the text matched by re2
            timeIndex = soup.index(item)
            # remove the irregular time data so it is not visited again
            soup.pop(timeIndex)
        pass
        # remove the current system time
        if re3.match(item):
            if item in soup:
                if re4.match(soup[soup.index(item) + 1]):
                    # remove the current system time
                    soup.pop(soup.index(item) + 1)
                    # remove the current system date
                    soup.pop(soup.index(item))
                    pass
                pass
            pass
        if re5.match(item):
            if item in soup:
                # get the index timeIndex of the text matched by re5
                timeIndex = soup.index(item)
                # remove the irregular time data so it is not visited again
                soup.pop(timeIndex)
                pass
            pass
        pass
    # return the preprocessed soup list
    return soup
    pass
Example #24
                    'User-Agent': 'Mozilla/5.0',
                    'Authorization': "Bearer {0}".format(token)
                }).text
        except Exception:
            continue
    # Creates a BeautifulSoup object with the retrieved HTML, then does find to get result set
    listings = BeautifulSoup(
        response,
        'html.parser',
        parse_only=SoupStrainer("script",
                                attrs={'type':
                                       'text/javascript'})).find_all("script")

    if listings:
        product_listings = []
        listings.pop(0)
        for listing in listings:
            try:
                result = listing.contents[0].split('\r\n')
                this_listing = {}
                # the string manipulation of these items assumes standard format where the desired item appears after a colon
                # and is formatted as "<desired item>", html unescape takes care of escape sequences, however since the
                # content is in a string format it leaves behind the leading \\, so this also assumes that no strings will
                # purposefully have a \\ in them, and removes all instances of \\ from strings
                for item in result:
                    if item.find('"set_name":') > 0:
                        this_listing['set_name'] = html.unescape(
                            item.strip().split(':')[1].strip()[1:-2]).replace(
                                '\\', '')
                    elif item.find('"price":') > 0:
                        this_listing['price'] = float(
Example #25
        driver.switch_to.alert.accept()
    except:
        continue

ActionChains(driver).move_by_offset(680, 290).click().perform()
ActionChains(driver).move_by_offset(740, 500).click().perform()
driver.get(
    driver.find_element_by_xpath(
        r'//*[@id="headDiv"]/ul/li[5]/ul/li[6]/a').get_attribute("href"))
driver.find_element_by_xpath(r'/html/body/h2/a').click()
Select(driver.find_element_by_xpath(r'//*[@id="ddlXN"]')).select_by_value(
    "2018-2019")
soup = BeautifulSoup(driver.page_source, "html.parser")
tbody = soup.find_all("tbody")
raw_items = BeautifulSoup(str(tbody[0]), "html.parser").find_all("tr")
raw_items.pop(0)
with open("ahu.json", "w", encoding="utf-8") as f:
    for item in raw_items:
        sp = BeautifulSoup(str(item), "html.parser")
        list = sp.find_all("td")
        course_code = str(list[1])
        course_code = course_code[4:11]
        course_name = str(list[2])
        course_name = course_name[findSubstring(course_name, ">", 2) + 1:-9]
        credit = str(list[6])
        credit = credit[findSubstring(credit, ">", 1) + 1:-5]
        teaching_time = str(list[8]).replace('\n', '').replace(' ', '')
        teaching_time = teaching_time[teaching_time.find("周"):-12]
        f.write(r'{"course code":"' + course_code + r'","course name":"' + course_name + r'","credit":"' \
                + credit + r'","teaching time":"' + teaching_time + r'"}' + '\n')
Example #26
async def on_message(message):
    if message.content.startswith("!cwk"):
        channel = client.get_channel(message.channel)
        global number
        try:
            message.content = int(message.content[4:])
        except ValueError:
            await message.channel.send("数値ではないので回数変更できませんっ!")
            return
        if message.content >= int(1001):
            await message.channel.send("1000以上の数値をやらせようとしないでくださいっ!")
            return
        else:
            number = message.content
            await message.channel.send("叩き起こすメンションの回数を" + str(number) +
                                       "に変更しましたっ!")
            return
    if message.content.startswith("!awk"):
        if "モデレーターさん" in [
                users_role.name for users_role in message.author.roles
        ]:
            conf_on()
            await message.channel.send("Wakeup可能ですっ!")
    if message.content.startswith("!dwk"):
        if "モデレーターさん" in [
                users_role.name for users_role in message.author.roles
        ]:
            conf_off()
            await message.channel.send("Wakeup無効ですっ!")
    if message.content.startswith("!whichwk"):
        if "on" in wu:
            await message.channel.send("Wakeup可能ですっ!")
        else:
            await message.channel.send("Wakeup無効ですっ!")
    if message.content.startswith('whoami'):
        channel = client.get_channel(message.channel)
        llip = ([
            l for l in (
                [
                    ip
                    for ip in socket.gethostbyname_ex(socket.gethostname())[2]
                    if not ip.startswith("127.")
                ][:1],
                [[(s.connect(('8.8.8.8', 53)), s.getsockname()[0], s.close())
                  for s in [socket.socket(socket.AF_INET, socket.SOCK_DGRAM)]
                  ][0][1]]) if l
        ][0][0])
        await message.channel.send("私は" + socket.gethostname() + "だよっ☆" +
                                   "\n" + "ローカルipは" + llip + "だよっ☆")
    if message.content.startswith("廃人"):
        channel = client.get_channel(message.channel)
        res = requests.get('https://status.slack.com/')
        soup = BeautifulSoup(res.text, 'html5lib')
        c = soup.find_all('p', class_="bold")
        s = soup.find_all('p', class_="tiny")
        c = c[1:4]
        s = s[5:8]
        d = str(c.pop(0)) + "\n" + str(s.pop(0))
        d = re.sub(r'<a(.+)</a>', "", d)
        d = re.sub(r'</p>', "", d)
        d = re.sub(r'<p class=\"tiny\">', "", d)
        d = d.replace("  ", "                        ")
        d = re.sub(r'<p class=\"bold\">', "", d)
        await message.channel.send(d)
        for i in range(2):
            d = rep(c, s)
            await message.channel.send(d)
    if message.content.startswith('オールデリート'):
        if "モデレーターさん" in [
                users_role.name for users_role in message.author.roles
        ]:
            id = "<@366844805470486528>"
            await message.channel.send(id + "宛。" + "緊急終了実行。")
            await message.channel.send("実行:" + "<@" + str(message.author.id) +
                                       ">")
            await client.logout()
            os.kill(os.getpid(), 11)
    if message.content.startswith("ipcall"):
        channel = client.get_channel(message.channel)
        res = requests.get("http://inet-ip.info/ip")
        await message.channel.send(res.text)
    if message.content.startswith("プロセスを殺す"):
        channel = client.get_channel(message.channel)
        id = "<@366844805470486528>"
        await message.channel.send(id + "_" + "要請によりプロセスを緊急終了します。")
        await message.channel.send("実行:" + "<@" + str(message.author.id) + ">")
        await client.logout()
        os.kill(os.getpid(), 11)
    if message.content.startswith("!whichfa"):
        await message.channel.send(fa)
    if message.content.startswith("!afa"):
        if "モデレーターさん" in [
                users_role.name for users_role in message.author.roles
        ]:
            fa_conf_on()
            await message.channel.send("再起動のブロックを解除しました。")
    if message.content.startswith("!dfa"):
        if "モデレーターさん" in [
                users_role.name for users_role in message.author.roles
        ]:
            fa_conf_off()
            await message.channel.send("再起動をブロックしました。")
    if message.content.startswith("フォースアゲイン"):
        channel = client.get_channel(message.channel)
        if "off" in fa:
            await message.channel.send(
                "作業中につき再起動をブロックしています。botが暴走している場合はモデレーターへメンションしてください。")
            tar = discord.utils.get(message.guild)
            print(tar)
            #dm = await tar.create_dm()
            #try:
            #await dm.send("FA失敗"+"\n"+"実行:"+message.author)
            #except discord.errors.Forbidden:
            #pass
        else:
            adminID = "<@366844805470486528>"
            SecondAdminID = "<@529644095027806208>"
            await message.channel.send(adminID + "\n" + SecondAdminID + "\n" +
                                       "再起動します")
            await message.channel.send("実行:" + "<@" + str(message.author.id) +
                                       ">")
            await client.logout()
            os.system("reboot")
    if message.content.startswith("今日の大空お天気"):
        channel = client.get_channel(message.channel)
        soup = requests.get("https://www.aikatsu.com/onparade/")
        soup = BeautifulSoup(soup.text, 'html5lib')
        check = soup.find_all("div", class_='txt_detail-date')
        if check == []:
            soup = soup.find_all('dd', class_="txt_detail")
            soup = str(soup.pop(0))
            soup = soup.replace('<dd class="txt_detail">',
                                '').replace('</dd>', '')
            print(soup)
            print(type(soup))
        else:
            check = soup.find_all("div", class_='txt_detail-date')
            soup = str(soup.pop(0))
            soup = re.sub(r'<br(.+)</p>', "", soup)
            soup = soup.replace('<p>', '').replace('</p>', '')
        await message.channel.send(soup)
    if message.content.startswith("金沢地方の遅れ"):
        channel = client.get_channel(message.channel)
        soup = requests.get("https://trafficinfo.westjr.co.jp/hokuriku.html")
        soup = BeautifulSoup(soup.text, 'html5lib')
        check = soup.find_all("p", class_='gaiyo')
        if check == []:
            soup = soup.find_all('strong')
            soup = str(soup.pop(0))
            soup = soup.replace('<strong>', '').replace('</strong>', '')
            await message.channel.send(soup)
        else:
            ls = soup.find_all('p', class_='gaiyo')
            while True:
                soup = ls.pop(0)
                soup = str(soup)
                soup = soup.replace('<p class="gaiyo">',
                                    '').replace('<br/>',
                                                '').replace('</p>', '')
                await message.channel.send(soup)
                if ls == []:
                    break
    if message.content.startswith("近畿地方の遅れ"):
        channel = client.get_channel(message.channel)
        soup = requests.get("https://trafficinfo.westjr.co.jp/kinki.html")
        soup = BeautifulSoup(soup.text, 'html5lib')
        check = soup.find_all("p", class_='gaiyo')
        if check == []:
            soup = soup.find_all('strong')
            soup = str(soup.pop(0))
            soup = soup.replace('<strong>', '').replace('</strong>', '')
            await message.channel.send(soup)
        else:
            ls = soup.find_all('p', class_='gaiyo')
            while True:
                soup = ls.pop(0)
                soup = str(soup)
                soup = soup.replace('<p class="gaiyo">',
                                    '').replace('<br/>',
                                                '').replace('</p>', '')
                await message.channel.send(soup)
                if ls == []:
                    break
    if message.content.startswith("プロセス把握"):
        channel = client.get_channel(message.channel)
        global response
        response = str(subprocess.check_output(['ps', "aux"]))
        print(response)
        response = response.replace(' ', '\n')
        response = response[:2000]
        await message.channel.send(response)
    if message.content.startswith("neofetch"):
        channel = client.get_channel(message.channel)
        responce = str(subprocess.check_output(["neofetch"]))
        responce = str(responce[:2000])
        await message.channel.send(responce)
    if message.content.startswith("naboon_chat"):
        channel = client.get_channel(message.channel)
        ID = "<@714406627603644489>"
        response = subprocess.check_output(
            ['ojichat', "なぼ"]).decode(encoding='utf-8').rstrip()
        response = ID + response
        print(response)
        await message.channel.send(response)
    if not len(message.attachments) == 0:
        if message.author.bot == True:
            return
        RN = None
        channel = client.get_channel(message.channel)
        await message.channel.send('受け付けました')
        filename = message.attachments[0].filename
        download_img(message.attachments[0].url, filename)
        file_path = filename
        read = decode(Image.open(file_path))
        try:
            path = read[0][0].decode('utf-8', 'ignore')
        except IndexError:
            await message.channel.send("Error! QRコードが検出されませんでした。")
            os.remove(filename)
            return
        print(path)
        print(type(path))
        if path is None:
            await message.channel.send("Error! QRコードが検出されませんでした。")
            os.remove(filename)
            return
        print(path)
        if "http://dcd.sc/n2" in path:
            target_url = path
            r = requests.get(target_url)
            soup = BeautifulSoup(r.text, 'html5lib')
            try:
                NR = soup.find("dd",
                               class_="cardNum").get_text() + " " + soup.find(
                                   "dd", class_="cardName").get_text()
                print(NR)
            except AttributeError:
                RN = "カード名取得失敗です。学生証を読み込んだ事またはリダイレクトの設定間違えだと思われます。"
        elif "http://dcd.sc/j2" in path:
            target_url = path
            r = requests.get(target_url)
            soup = BeautifulSoup(r.text, 'html5lib')
            try:
                NR = soup.find(
                    "div", class_="dress-detail-title clearfix").get_text()
                print(NR)
            except AttributeError:
                RN = "カード名取得失敗です。学生証を読み込んだ事またはリダイレクトの設定間違えだと思われます。"
        elif "http://dcd.sc/n3" in path or "http://dcd.sc/n1" in path:
            NR = "学生証です。"
            print(NR)
        elif "http://dcd.sc/n0" in path:
            NR = "アイドルカードまたはフルコーデカードです。"
            print(NR)
        path = get_shortenURL(path)
        print(path)
        if path == "error":
            await message.channel.send(
                "Error! おそらくKyashなどのアプリ内でのみ使えるQRを送信しようとしていませんか?")
            os.remove(filename)
            path = card = r = None
            return
        else:
            await message.channel.send(NR)
            await message.channel.send(path)
            os.remove(filename)
            path = card = r = None
    if message.content.startswith('pid'):
        channel = client.get_channel(message.channel)
        await message.channel.send(pid)
    if message.content.startswith('!kill'):
        if str(pid) in message.content:
            if message.author.bot:
                return
            else:
                id = "<@366844805470486528>"
                await message.channel.send(id + "宛。" + "緊急終了実行。")
                await message.channel.send("実行:" + "<@" +
                                           str(message.author.id) + ">")
                await client.logout()
                os.kill(os.getpid(), 11)
        else:
            return
    if message.content.startswith("wakeup"):
        channel = client.get_channel(message.channel)
        for mem in message.mentions:
            for i in range(int(number)):
                a = int(mem.id)
                print(a)
                await message.channel.send("<@" + str(a) + ">" + "さん起きて!!!")
    if message.content.startswith("ski"):
        o = []
        lis = []
        ua = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6)" \
             "AppleWebKit/537.36 (KHTML, like Gecko)" \
             "Chrome/60.0.3112.113"
        TU = "https://www.happo-one.jp/gelande/lift/"

        O = requests.get(TU, headers={"User-Agent": ua})
        RE = BeautifulSoup(O.text, "html.parser")
        c = RE.find_all("td")
        for d in c:
            o.append(d.text)
        for i in range(20):
            for i in range(5):
                if o == []:
                    break
                a = o.pop(0)
                b = o.pop(0)
                c = o.pop(0)
                d = o.pop(0)
                e = o.pop(0)
                A = a + " " + b + " " + c + " " + d + " " + e
                A = A.replace("last lift up", "")
                A = A.replace("last lift down", "")
                A = A.replace("        ", "  ")
                A = A.replace("  ", " ")
                lis.append(A)
        await message.channel.send("[リフト名][運転開始時刻][運転終了時刻][備考]")
        for lis in lis:
            await message.channel.send(lis)
Example #27
    tree = BeautifulSoup(i['html'], "lxml")
    good_html = tree.prettify()

    title = []
    elements = BeautifulSoup(good_html,
                             "html.parser").find_all(class_="startup-link")
    for m, v in enumerate(elements):
        if m % 2 == 0:
            title.append((elements[m]['title']).encode('utf-8'))

    location = []
    elements = BeautifulSoup(
        good_html,
        "html.parser").find_all(class_="column hidden_column location")
    if k == 0:
        elements.pop(0)

    for i in elements:

        newEle = i.find_all(class_="value")

        for j in newEle:
            newEle2 = j.find_all(class_="tag")
            if len(newEle2) == 0:
                location.append("-")
            else:
                location.append(((newEle2[0].text).encode('utf-8')).strip())

    market = []
    elements = BeautifulSoup(
        good_html,
Example #28
 #print(list2_of_course)
 #print(type(list2_of_course))
 listofcourse = list()
 list_of_course = list_of_course.find_all("option")
 #convert beautifulsoup results to readable list
 while len(list_of_course) > 0:
     #if list_of_course[0].value=="":
     #print(list_of_course[0].string)
     stringtemp = list_of_course[0].string
     if stringtemp.find("20") == -1 and stringtemp.find("---") == -1:
         stringtemp = stringtemp.replace("\n", "")
         if stringtemp != "" and len(stringtemp) > 1:
             listofcourse.append(stringtemp)
     #else:
     #print(stringtemp)
     list_of_course.pop(0)
 print(listofcourse)
 print(len(listofcourse))
 a = 0
 for i in range(len(listofcourse)):
     print(i)
     #if browser.is_element_present_by_text(listofcourse[i]) :
     print(listofcourse[i])
     browser.find_option_by_text(listofcourse[i]).first.click()
     browser.find_by_value('Load Class Schedule').first.click()
     #browser.windows.current=browser.windows[1]
     while len(browser.windows) > 1:
         for ii in browser.windows:
             if ii.url == "https://wish.wis.ntu.edu.sg/webexe/owa/AUS_SCHEDULE.main_display1":
                 browser.windows.current = ii
                 html_page = browser.html
Example #29
import os
import pandas
import datetime
import requests
from bs4 import BeautifulSoup

Response = requests.get("http://www.boxofficemojo.com/yearly/")
RowData = BeautifulSoup(Response.text, "lxml").find(
    "table", attrs={"cellspacing": "1"}).find_all("tr")
ColumnName = RowData.pop(0)
ColumnName = [Item.text for Item in ColumnName]
RowData = [list(Item.stripped_strings) for Item in RowData]
DataFrame = pandas.DataFrame(RowData, columns=ColumnName)
PathResult = os.path.abspath("Results")
if not os.path.exists(PathResult):
    os.makedirs(PathResult)
FileName = os.path.join(PathResult, "boxofficemojo.csv")
DataFrame.to_csv(FileName, index=False)
TextTarget("Save csv to {}".format(FileName))


after_one_week = datetime.datetime.now() + datetime.timedelta(weeks=1)
TextTarget("The date after one week - {}".format(after_one_week.strftime("%Y/%m/%d")))

DataForm = {"StartStation": "977abb69-413a-4ccf-a109-0272c24fd490", "EndStation": "9c5ac6ca-ec89-48f8-aab0-41b738cb1814",
            "SearchDate": after_one_week.strftime("%Y/%m/%d"), "SearchTime": "14:00", "SearchWay": "DepartureInMandarin", "RestTime": "", "EarlyOrLater": ""}

Response = requests.post(
    "https://www.thsrc.com.tw/tw/TimeTable/SearchResult", data=DataForm)

RowData = BeautifulSoup(Response.text, "lxml").table.find_all(
Example #30
def processStock(broswer, url, stockCode, stockName):
    # start timing the request
    time_request = time.time()
    # fetch the page
    broswer.get(url)
    print(stockName, '[', stockCode, ']', ':Url-Request Time Cost', time.time() - time_request, 's')
    # start timing the data handling
    time_start = time.time()
    # switch to the iframe so elements can be located reliably
    try:
        broswer.switch_to.frame('dataifm')
    except Exception as ex:
        print(ex)
        return {};
    # get all the table types of the financial statements
    sideNav = broswer.find_elements_by_xpath('//*[@class="newTab"]/li')

    # the big object to insert into mongodb (all financial statements of one stock)
    insertData = {
        "name": stockName,
        "code": stockCode,
        "tables": [],
    }

    # loop over every statement, starting with the key-indicators table
    for report_type in range(0, len(sideNav)):

        # name of the current statement
        report_statement_type = sideNav[report_type].text
        print('Current Finance Report Statement:', report_statement_type)

        # the current link in the side navigation
        sheet_href = sideNav[report_type].find_element_by_tag_name('a')

        html = BeautifulSoup(broswer.page_source, 'lxml')
        left_div = html.select(".left_thead")
        # the individual indicators
        indicators = BeautifulSoup(str(left_div), 'lxml').select('th')
        indicators.pop(0)
        indicatorsNum = len(indicators)
        # for indicator in indicators:
        # print(indicator.text)

        # the data <table>
        # contains the reporting periods
        # and the financial figures under each period
        right_div = html.select('.data_tbody')
        data_and_periods = BeautifulSoup(str(right_div), 'lxml')
        periods_table = data_and_periods.select('.top_thead')
        periods = BeautifulSoup(str(periods_table[0]), 'lxml').find_all('div', class_='td_w')

        # the data table
        data_table = data_and_periods.select('.tbody')
        # all the data cells of the whole table
        dataGrid = BeautifulSoup(str(data_table), 'lxml').select('tr')

        # process the data for storage in the database
        data = []
        # process each row
        for y in range(0, len(indicators)):
            for x in range(0, len(periods)):
                # text of the current table cell
                cell_text = (dataGrid[y].contents[x].text).strip()
                # the default unit is ''
                unit = ''
                # the default value is the float 0.0
                value = 0.0
                if cell_text != '--' and cell_text != '':
                    lastChar = list(cell_text)[-1]
                    if lastChar == '亿' or lastChar == '万' or lastChar == '%':
                        unit = lastChar
                        lastTwoChars = cell_text[len(cell_text) - 2:len(cell_text)]
                        if lastTwoChars == '万亿' or lastTwoChars == '千亿':
                            value = float(cell_text[0:-2])
                            unit = lastTwoChars
                        else:
                            value = float(cell_text[0:-1])
                obj = {
                    "period": periods[x].text,
                    "indicator": indicators[y].text,
                    "text": cell_text,
                    "value": value,
                    "unit": unit
                }
                data.append(obj)
        # insert the current column

        # the statement object to insert into mongodb
        table = {
            "name": report_statement_type,
            "indicators": list(map(lambda x: x.text, indicators)),
            "periods": list(map(lambda x: x.text, periods)),
            "data": data
        }
        # add the current statement's data to the big object
        insertData['tables'].append(table)

        # by default, assume the table has not refreshed yet
        refreshFlag = False

        # try to switch to the next table
        try:
            # if this is not the last tab, switch tables!
            if report_type < len(sideNav) - 1:
                # click the next link in the navigation bar
                sideNav[report_type + 1].find_element_by_tag_name('a').click()
                time.sleep(0.1)
                while not refreshFlag:
                    new_left_div = BeautifulSoup(broswer.page_source, 'lxml').select(".left_thead")
                    nextIndicatorSum = BeautifulSoup(str(new_left_div), 'lxml').select('th')
                    refreshFlag = indicatorsNum != len(nextIndicatorSum)
        except Exception as ex:
            print('出现问题了:', ex)
        finally:
            print(sheet_href.text, '输出完毕')

    # print the closing summary
    print(stockName, '[', stockCode, ']', ':Data-Handle Time Cost', time.time() - time_start, 's')
    print("=======================================")
    return insertData;
Example #31
    HREF = tag.get('href')

    if HREF != "/html/web.config" and HREF != "/":
        yearLinks.append(str(HREF))

for year in yearLinks:

    publicationYear = year.split('/')[2]

    if not os.path.exists("output/" + publicationYear):
        os.makedirs("output/" + publicationYear)

    months = BeautifulSoup(
        urllib2.urlopen(archiveRoot + year).read(), "html5lib").find_all('a')
    months.pop(0)

    for month in months:

        publicationMonth = month.get_text()

        if not os.path.exists("output/" + publicationYear + "/" +
                              publicationMonth):
            os.makedirs("output/" + publicationYear + "/" + publicationMonth)

        days = BeautifulSoup(
            urllib2.urlopen(archiveRoot + month.get('href')).read(),
            "html5lib").find_all('a')
        days.pop(0)

        for day in days:
Example #32
     for url in quchong:
         print(url)
         xiazai_jiumei(url)
 urls = []
 conn = pymysql.connect(host='192.168.0.131',user='******',passwd='123456',db='mypydb',charset='utf8')
 cur = conn.cursor()
 cur.execute("select url from jiumei")
 results = cur.fetchall()
 cur.close()
 conn.close()
 result = list(results)
 for r in result:
     urls.append("%s"%r)
 urls = list(set(urls))
 while urls:
     url = urls.pop()
     print("重新下载:%s"%url)
     xiazai_jiumei_sql(url)
     try:
         conn = pymysql.connect(host='192.168.0.131',user='******',passwd='123456',db='mypydb',charset='utf8')
         cur = conn.cursor()
         cur.execute("select url from jiumeim")
         results = cur.fetchall()
         cur.execute("truncate jiumeim")
         cur.close()
         conn.close()
         result = list(results)
         for r in result:
             urls.append("%s"%r)
         urls = list(set(urls))
     except:
Example #33
 def parse_results_rows(self, results_rows: BeautifulSoup) -> List[EMLODoc]:
     """Input is a list of HTML table rows with EMLO results, output is a list of EMLODoc objects."""
     header_row = results_rows.pop(0)
     headers = parse_results_header(header_row)
     return [self.make_emlo_doc(results_row, headers) for results_row in results_rows]
Example #34
	
	HREF = tag.get('href')
	
	if HREF != "/html/web.config" and HREF != "/":
		yearLinks.append( str(HREF) )


for year in yearLinks:
	
	publicationYear = year.split('/')[2]
	
	if not os.path.exists("output/" + publicationYear):
		os.makedirs("output/" + publicationYear)
		
	months = BeautifulSoup(urllib2.urlopen(archiveRoot + year).read(), "html5lib").find_all('a')
	months.pop(0)
	
	for month in months:
		
		publicationMonth = month.get_text()
	
		if not os.path.exists("output/" + publicationYear + "/" + publicationMonth):
			os.makedirs("output/" + publicationYear + "/" + publicationMonth)
			
		days = BeautifulSoup(urllib2.urlopen(archiveRoot + month.get('href')).read(), "html5lib").find_all('a')
		days.pop(0)
		
		for day in days:
			
			publicationDay = day.get_text()
			
Example #35
    def compile_paragraph(cls, paragraph, rels_soup, stringset, is_rtl=False):
        text_elements = paragraph.find_all(cls.TEXT_ELEMENT_TAG)
        if not text_elements:
            return

        txid = paragraph.attrs.get('txid')

        if not txid:
            return

        if stringset.get(txid, None) is None:
            return

        translation_string = stringset[txid].string
        escaped_translation_string = cls._escape_xml(translation_string)

        translation_soup = BeautifulSoup(
            u'<wrapper>{}</wrapper>'.format(escaped_translation_string),
            'xml',
        ).find_all(text=True)

        added_hl_text_elements = defaultdict(list)
        deleted_hl_text_elements = defaultdict(list)
        empty_text_element = None
        elements_for_removal = []
        last_element = None

        leading_spaces = 0

        # First of all try to replace each element translation
        # this is the happiest path
        if is_rtl:
            cls.set_rtl_orientation(paragraph)

        for index, text_element in enumerate(text_elements):
            text = six.text_type(text_element.text)

            # detect text elements that contain no text
            # and remove leading whitespace from the next string
            if not text.strip():
                leading_spaces = len(text) - len(text.strip())
                empty_text_element = text_element
                continue

            last_element = text_element

            hyperlink_url = cls.get_hyperlink_url(text_element, rels_soup)

            # the text parts of the translation are fewer than the
            # text parts of the document, so we will just remove
            # any exceeding part from the document
            if len(translation_soup) == 0:
                elements_for_removal.append(text_element)
                continue
            else:
                translation_part = translation_soup.pop(0)
                translation = six.text_type(translation_part)
                translation_hyperlink_url = cls.get_translation_hyperlink(
                    translation_part)

                if not translation[:leading_spaces].strip():
                    translation = translation[leading_spaces:]
                    leading_spaces = 0
                else:
                    if empty_text_element:
                        elements_for_removal.append(empty_text_element)
                        empty_text_element = None

                text_element.clear()
                text_element.insert(0, translation)

            # Edit in place hyperlink url
            if hyperlink_url and translation_hyperlink_url:
                cls.set_hyperlink_url(text_element, rels_soup,
                                      translation_hyperlink_url)
            else:
                if hyperlink_url:
                    deleted_hl_text_elements[hyperlink_url]\
                        .append(text_element)
                elif translation_hyperlink_url:
                    added_hl_text_elements[translation_hyperlink_url]\
                        .append(text_element)

        # the text parts of the translation are more than the
        # text parts of the document, so we will compress the
        # remaining translation parts into one string
        if len(translation_soup) > 0:
            translation = last_element.contents[0] + \
                          "".join([six.text_type(t) for t in translation_soup])
            last_element.clear()
            last_element.insert(0, translation)

        if len(added_hl_text_elements) == len(deleted_hl_text_elements)\
                and len(added_hl_text_elements) > 0:
            cls.swap_hyperlink_elements(added_hl_text_elements,
                                        deleted_hl_text_elements)

        for text_elements in six.itervalues(deleted_hl_text_elements):
            for text_element in text_elements:
                cls.remove_hyperlink(text_element)

        for url, text_elements in six.iteritems(added_hl_text_elements):
            for text_element in text_elements:
                cls.create_hyperlink_url(text_element, rels_soup, url)

        for element in elements_for_removal:
            cls.remove_text_element(element)
Example #36
    def compile(self, template, stringset, **kwargs):
        stringset = {string.string_hash: string for string in stringset}
        docx = DocxFile(template)
        soup = BeautifulSoup(docx.get_document(), 'xml')
        rels_soup = BeautifulSoup(docx.get_document_rels(), 'xml')

        for paragraph in soup.find_all('w:p'):
            text_elements = paragraph.find_all('w:t')
            if not text_elements:
                continue

            txid = paragraph.attrs.get('txid')

            if not txid:
                continue

            if stringset.get(txid, None) is None:
                continue

            translation = stringset[txid].string

            translation_soup = BeautifulSoup(
                u'<wrapper>{}</wrapper>'.format(translation),
                'xml').find_all(text=True)

            leading_spaces = 0

            for index, text_element in enumerate(text_elements):
                text = six.text_type(text_element.text)
                # detect text elements that contain no text
                # and remove leading whitespace from the next string
                if not text.strip():
                    leading_spaces = len(text) - len(text.strip())
                    continue
                else:
                    hyperlink_url = self.get_hyperlink_url(
                        text_element, rels_soup)
                    # the text parts of the translation are fewer than the
                    # text parts of the document, so we will just remove
                    # any exceeding part from the document
                    if len(translation_soup) == 0:
                        if hyperlink_url:
                            text_element.find_parent('w:hyperlink').decompose()
                        else:
                            text_element.decompose()
                        continue
                    translation_part = translation_soup.pop(0)
                    translation = six.text_type(translation_part)
                    if not translation[:leading_spaces].strip():
                        translation = translation[leading_spaces:]
                    leading_spaces = 0

                # the text parts of the translation are more than the
                # text parts of the document, so we will compress the
                # remaining translation parts into one string
                if index == len(text_elements) - 1 and len(
                        translation_soup) > 0:
                    translation = "".join(
                        [translation] +
                        [six.text_type(t) for t in translation_soup])

                if hyperlink_url:
                    # attempt to find a parent containing `href` attribute
                    # in order to extract the potential modified url.
                    self.set_hyperlink_url(
                        text_element, rels_soup,
                        getattr(
                            translation_part.find_parent(attrs={'href': True}),
                            'attrs', {}).get('href', hyperlink_url))
                text_element.clear()
                text_element.insert(0, translation)

        docx.set_document(six.text_type(soup))
        docx.set_document_rels(six.text_type(rels_soup))

        result = docx.compress()
        docx.delete()
        return result