def scrape_pacific_plants_html(link):
    if re.search('http', link) is None:
        link = 'http://www.hear.org/pier/' + link
    r = requests.get(link)
    html = BeautifulSoup(r.text, 'lxml')
    # check Australian table format
    question_index = 0
    answer_index = -1
    aus = False
    try:
        table = html.find_all('table')[1]
    except:
        table = html.find_all('table')[0]
        aus = True
        question_index = 2
    rows = table.find_all('tr')
    header = rows[0]
    rows = rows[1:-1]
    # check for score column
    if parse_html(str(header.find_all('td')[-1])) == 'Score':
        answer_index -= 1
    feature_dict = dict()
    for row in rows:
        data = row.find_all('td')
        # end of Australian table
        if aus and parse_html(str(data[3])) == 'Outcome:':
            break
        # check if feature row
        try:
            feature_id = int(round(float(parse_html(str(data[question_index]))) * 100))
        except:
            continue
        answer = parse_html(str(data[answer_index])).lower()
        if feature_id in EDGE_CASES:
            try:
                feature_dict[feature_id] = int(answer)
            except:
                feature_dict[feature_id] = 'NA'
        else:
            if answer in VAL_LOOKUP:
                feature_dict[feature_id] = VAL_LOOKUP[answer]
            else:
                feature_dict[feature_id] = 'NA'
    return feature_dict

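# Hedged usage sketch: the relative path below is hypothetical, and parse_html,
# EDGE_CASES and VAL_LOOKUP are assumed to be defined elsewhere in this module.
# features = scrape_pacific_plants_html('wra/pacific/some_species_wra.htm')
# features.get(201)  # -> a coded answer from VAL_LOOKUP, an int, or 'NA'
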
def searchGenders(browser: mc.Browser, gender: str, page=1) -> None:
    url = "https://www.goodreads.com/shelf/show/" + str(gender) + "?page=" + str(page)
    res = browser.open(url)
    html = res.read()
    html = bs4(html, "html.parser")
    # find the pagination block and take the highest numbered page
    pageCount = html.select("div[max_num_pages]")
    pageCount = pageCount[0].select(":not(:last-child)")
    maxPage = int(pageCount[len(pageCount) - 1].get_text())
    with open("links.html", "a") as linkdata:
        for i in range(1, maxPage + 1):
            print("===> getting page {}\n".format(i))
            url = "https://www.goodreads.com/shelf/show/" + str(gender) + "?page=" + str(i)
            res = browser.open(url)
            html = res.read()
            html = bs4(html, "html.parser")
            booklinks = html.find_all('a', {'class': 'bookTitle'})
            for link in booklinks:
                linkdata.write("<a href='" + str(url) + str(link['href']) + "' ></a>\n")
    print("Ready!")

def parse_me(rsp):
    html = bs4(rsp.text, 'html.parser')
    apts = html.find_all('li', attrs={'class': 'result-row'})
    results = []
    apt_titles = [apt.find('a', attrs={'class': 'hdrlnk'}).text for apt in apts]
    #apt_sizes = [re.sub(' +', ' ', apt.findAll(attrs={'class': 'housing'})[0].text.replace("\n", "").strip()) for apt in apts]
    apt_prices = find_prices(apts)
    apt_times = [apt.find('time')['datetime'] for apt in apts]
    apt_links = [apt.find('a', attrs={'class': 'hdrlnk'})['href'] for apt in apts]
    apt_loc = [apt.find('a', attrs={'class': 'hdrlnk'}).text for apt in apts]
    apt_img = [apt.find('img') for apt in apts]

    apt_titles = sanitize(apt_titles)
    #apt_sizes = sanitize(apt_sizes)
    apt_times = sanitize(apt_times)
    apt_loc = sanitize(apt_loc)

    # create a dataframe to hold all info
    data = np.array([apt_titles, apt_img, apt_prices, apt_times, apt_links, apt_loc])
    col_names = ["Title", "Image", "Price", "Posted On", "link", "Location"]
    pd.set_option('max_colwidth', 2000)
    dataframe = pd.DataFrame(data.T, columns=col_names)
    dataframe.set_index('Title')
    results.append(dataframe)
    results = pd.concat(results, axis=0)
    return results

def listen():
    link_list = []
    link_list_send = []
    send_list = []
    url_cl_base = "http://sfbay.craigslist.org"
    while True:
        rsp = requests.get(url_base, params=params)
        html = bs4(rsp.text, 'html.parser')
        result = parse_me(rsp)
        result.head()
        result.to_html("index.html")
        stylesheet = "style.css"
        js = 'cl.js'
        with open("./index.html", "a") as file:
            file.write("<link rel='stylesheet' type='text/css' href='" + stylesheet + "' >")
            file.write("<script src='https://code.jquery.com/jquery-3.3.1.min.js' integrity='sha256-FgpCb/KJQlLNfOu91ta32o/NMZxltwRo8QtmkMRdAu8=' crossorigin='anonymous'></script>")
            file.write("<script src='" + js + "'></script>")
        apts = html.find_all('li', attrs={'class': 'result-row'})
        for apt in apts:
            title = apt.find_all('a', attrs={'class': 'hdrlnk'})[0]
            name = ''.join([i for i in title.text])
            link = title.attrs['href']
            cur_price = apt.find('span', {'class': 'result-price'}).text
            cur_price = sanitize(cur_price)
            cur_price = ''.join(cur_price)
            loc = apt.find('span', attrs={'class': 'result-hood'})
            if loc is None:
                loc = ""
            else:
                loc = loc.text
            if link not in link_list and link not in link_list_send:
                #print "found new listing"
                with open('./home/critter/out.log', 'w') as f:
                    print('found new listing', url_cl_base, link, file=f)
                link_list_send.append(link)
                send_list.append(name + '\n -$$ ' + cur_price + ' $$ - \nLOCATION:: ' + loc + "\n" + url_cl_base + link)
        if len(link_list_send) > 0:
            #g = creds.Creds()
            #print("sending mail")
            f = open('./home/critter/out.log', 'w')
            #print >> f, 'sending mail to:: ' + my_email
            f.close()
            msg = "\n\n".join(send_list)
            # m = email.message.Message()
            # m.set_payload(msg)
            #g.gmailSend(m, my_email)
            link_list += link_list_send
            link_list_send = []
            send_list = []
        sleep_time = np.random.randint(150, 155)
        time.sleep(sleep_time)

def search(bookname):
    # search for audiobooks
    # param page: int, e.g. 1
    # param searchword: str; 56听书 requires the search term to be submitted GBK-encoded, e.g. %D5%C2%D3%E3
    path = '/search.asp?page=1&searchword=' + urllib.request.quote(bookname.encode('gb2312'))
    html = getHtmlContent(path)
    searchList = html.find_all(class_='list-ov-tw')
    pageCount = html.find(class_='cate-pages').find_all('li')[1].text.split('/')
    nowPageCount = pageCount[0]   # current page
    allPageCount = pageCount[1]   # total number of pages
    bookList = []                 # search results
    for searchItem in searchList:
        bookUrl = searchItem.find(class_='bt').find('a').attrs['href']
        bookImg = searchItem.find('img').attrs['original']
        bookName = searchItem.find(class_='bt').text
        bookAuthor = searchItem.find_all(class_='zz')[0].text + ' ' + searchItem.find_all(class_='zz')[1].text
        bookContent = searchItem.find(class_='nr').text
        book = Book(bookUrl, bookImg, bookName, bookAuthor, bookContent, nowPageCount, allPageCount)
        bookList.append(book)
    return bookList

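# Hedged, self-contained sketch of the GBK/GB2312 quoting described in the comments
# above (the query string here is just a placeholder): quote() percent-encodes the
# raw GB2312 bytes, which is the form the site expects in `searchword`.
# import urllib.request
# searchword = urllib.request.quote('示例'.encode('gb2312'))
# path = '/search.asp?page=1&searchword=' + searchword
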
def get_flights(html):
    flights = html.find_all('li', 'flight')
    result = []
    for f in flights:
        info = str(f.find('div', 'bottom'))
        info = info.replace('<div class="bottom">', '')
        info = info.replace("</div>", '')
        result.append(info)
    return result

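# Hedged note: the replace() calls above strip the wrapping <div> with string surgery;
# bs4's Tag.decode_contents() returns the inner markup directly, so a sketch of the
# same intent would be:
# info = f.find('div', 'bottom').decode_contents()
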
def grab_all_reviews(url, card_id):
    resp = requests.get(url, PARAMETERS)
    code = BeautifulSoup(resp.text.encode('utf-8', 'ignore'), 'html.parser')
    pagination_element = code.find('td', {'class': 'pagination-right'})
    last_page = None
    if pagination_element is None:
        return
    last_page = int(code.find('td', {'class': 'pagination-right'}).find_all('b')[1].text)
    current_page = 0
    while current_page < last_page:
        current_page = current_page + 1
        PARAMETERS['pg'] = current_page
        response = requests.get(url, PARAMETERS)
        html = BeautifulSoup(response.text, 'html5lib')
        reviews = html.find_all('div', {'class': 'review'})
        titles = html.find_all('h5', {'class': 'review-title'})
        review_count = 0
        for review in reviews:
            review_count = review_count + 1
            rating = review.find('span').get('ck-stars').split()[0]
            review_str = str(review)
            start = review_str.find('review-title')
            end = review_str.find('</h5')
            title = ''
            if start != -1 and end != -1:
                title_list = review_str[start:end].split('\n')
                title = title_list[1].strip()
            text = review.find('div', {'class': 'readmoreInner'})
            paragraphs = text.find_all('p')
            review_text = '\n\n'.join([paragraph.text for paragraph in paragraphs])
            print(rating, '\n', title, '\n', review_text, '\n')
            REVIEWS.append(('', title, review_text, rating, card_id))
        print(len(REVIEWS))
        sleep(3)

def trueAltalsUrl(altalsId):
    url = webUrl + '/a/?id=' + str(altalsId)
    try:
        r = http.request('Get', url, headers=headers)
    except IOError:
        print('', end="\r")
    try:
        soup = BeautifulSoup(r.data.decode("utf8"), "lxml")
    except:
        return
    else:
        html = soup.find('div', class_='tuji')
    if not html:
        return
    altalsName = html.find('h1').text.replace('\\', ' ').replace('\\', ' ').\
        replace('/', ' ').replace(':', ' ').replace('*', ' ').replace('?', ' ').\
        replace('<', ' ').replace('>', ' ').replace('|', ' ').replace('"', ' ')
    p = html.find_all('p')
    orgid = str(p[0].a['href']).replace('/x/?id=', '')
    altalsCount = str(p[2].text.replace('图片数量:', '').replace('P', ''))
    models = []
    classs = []
    grilIds = ''
    classIds = ''
    for m in p[1].find_all('a'):
        h = m['href'].replace('/t/?id=', '')
        models.append(h)
    for c in p[3].find_all('a'):
        h = c['href'].replace('/s/?id=', '')
        classs.append(h)
    for m in models:
        grilIds += m + ','
    for c in classs:
        classIds += c + ','
    grilIds = grilIds.rstrip(',')
    classIds = classIds.rstrip(',')
    if not grilIds:
        grilIds = '-1'
    try:
        saveAltals(altalsId, orgid, classIds, grilIds, altalsName, altalsCount)
        try:
            imgHtml = soup.find('div', id='kbox').find_all('img')
            imgs = []
            for k in imgHtml:
                imgUrl = k['data-src']
                imgs.append(imgUrl)
            return imgs
        except IOError:
            return False
    except IOError:
        return False

def get_bussinessanalysis_EM(code):
    """
    Core themes (核心题材) for a stock from Eastmoney.
    --------------------------
    code: the stock code, a 6-digit string
    """
    if code[0] in ['6', '9']:
        code = 'sh' + code
    if code[0] in ['0', '2', '3']:
        code = 'sz' + code
    url = 'http://f10.eastmoney.com/f10_v2/BusinessAnalysis.aspx?code={0}'.format(code)
    #url = 'http://emweb.securities.eastmoney.com/CoreConception/Index?type=web&code={0}#'.format(code)
    r = requests.get(url, headers=hds())
    text = r.text
    html = BeautifulSoup(r.text, 'lxml')
    try:
        mainbs = html.find('div', attrs={"class": "section first"})
        mainbs = mainbs.text
    except:
        mainbs = None
    try:
        article = html.find('div', attrs={"class": "article"})
        article = article.text
    except:
        article = None
    try:
        ts = html.find_all('table')
        df = pd.DataFrame()
        for t in ts:
            tbl = str(t)
            #tbl = tbl.replace('<td class="tips-fieldnameL" rowspan="8">按产品分类</td>', '')
            #tbl = tbl.replace('<td class="tips-fieldnameL" rowspan="3">按地区分类</td>', '')
            #tbl = tbl.replace('<td class="tips-fieldnameL" rowspan="5">按行业分类</td>', '')
            dd = pd.read_html(tbl)[0]
            df = df.append(dd)
    except:
        df = None
    return mainbs, df, article

def get_sectors():
    # set up browser
    chrome_options = webdriver.ChromeOptions()
    prefs = {"profile.managed_default_content_settings.images": 2}
    chrome_options.add_experimental_option("prefs", prefs)
    browser = webdriver.Chrome(options=chrome_options)
    # browser = webdriver.Chrome()

    # open web page
    browser.implicitly_wait(10)  # wait for web page to load
    url = 'https://www.thestar.com.my/business/marketwatch/'
    browser.get(url)
    r = browser.page_source
    html = BeautifulSoup(r, 'html.parser')
    # print(html)
    browser.close()

    # sector elements
    htmlPart = html.find(class_=re.compile("stocks"))
    linkPart = [
        x.get_attribute_list('id')
        for x in htmlPart.find_all('a', {"id": True})
    ]
    for i in range(len(linkPart)):
        sector_elements.extend(linkPart[i])
    # print(linkPart)
    # print(sector_elements)
    # print(len(sector_elements))

    # sector_list
    sector = html.find_all('strong')
    for i in sector:
        sector_list.append(i.text.strip(':'))
        # print(i.text)
    # print(sector_list)

    # sector_name_list
    sector_n = html.select('div.text a')
    for i in sector_n:
        sector_name_list.append(i.text)
    # print(sector_name_list)
    return

def extract_next_links(rawDatas):
    outputLinks = list()
    '''
    rawDatas is a list of objs -> [raw_content_obj1, raw_content_obj2, ....]

    Each obj is of type UrlResponse declared at L28-42 datamodel/search/datamodel.py

    the return of this function should be a list of urls in their absolute form
    Validation of link via is_valid function is done later (see line 42).
    It is not required to remove duplicates that have already been downloaded.
    The frontier takes care of that.

    Suggested library: lxml
    '''
    #global subdomain_track
    #global outlink_track
    for resp in rawDatas:
        url = urlparse(resp.url)
        '''
        if url.netloc not in subdomain_track:
            subdomain_track[url.netloc] = 1
        else:
            subdomain_track[url.netloc] += 1
        if not url.query:
            resp.bad_url = true
        '''
        html = BeautifulSoup(resp.content, "lxml")
        html_links = html.find_all('a')
        for link in html_links:
            if link.get('href') is None:
                continue
            outputLinks.append(link.get('href'))
        '''
        if len(html_links) > len(outlink_track):
            outlink_track = (resp.url, len(html_links))
        '''
    return outputLinks

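# Hedged sketch: the docstring above asks for links "in their absolute form", while the
# loop appends the raw href values. One way to resolve them against the page they came
# from (assuming urllib.parse is available; the helper name is illustrative):
from urllib.parse import urljoin

def _to_absolute(base_url, hrefs):
    # resolve each possibly-relative href against the URL of the page it was found on
    return [urljoin(base_url, h) for h in hrefs if h is not None]

# _to_absolute('http://example.com/a/', ['/b.html', 'c.html'])
# -> ['http://example.com/b.html', 'http://example.com/a/c.html']
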
def get_price(self, html, html_class=''):
    # collect the text of every <span> carrying the given CSS class
    span_tags = html.find_all('span', attrs={'class': html_class})
    normal_price = []
    for span_tag in span_tags:
        normal_price.append(str(span_tag.contents[0]).strip())
    return normal_price

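# Hedged usage sketch (the markup and the `scraper` instance are illustrative only):
# soup = BeautifulSoup('<span class="price">$10</span>', 'html.parser')
# scraper.get_price(soup, html_class='price')  # -> ['$10']
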
def getGame():
    time.sleep(7)

    # Re-get the source so that we can look for any "Continue" buttons
    html = BeautifulSoup(browser.page_source, 'lxml')
    try:
        spans = html.find_all('span')
        # Check for a "Continue" button for the 18+ games. If we find it, click it
        for span in spans:
            if span.get_text().upper() == 'CONTINUE':
                # Get the xpath
                xpath = xpath_soup(span)
                # Use the xpath to grab the browser element (so we can click it)
                geez_dad_im_not_a_kid_anymore = browser.find_element_by_xpath(xpath)
                geez_dad_im_not_a_kid_anymore.click()
                break
        time.sleep(7)
    except:
        print()

    # Get the source again, just in case we came from clicking the potential "Continue" button
    html = BeautifulSoup(browser.page_source, 'lxml')

    # Get all the button tags so we can see whether we need to grab the game or leave
    buttons = html.find_all('button')
    for button in buttons:
        if button.get_text().upper() == 'OWNED':
            break
        if button.get_text().upper() == 'GET':
            # Get the xpath of this button
            xpath = xpath_soup(button)
            # Use the xpath to grab the browser element (so we can click it)
            browser_element = browser.find_element_by_xpath(xpath)
            browser_element.click()
            time.sleep(4)

            # Re-get the source (again)
            html = BeautifulSoup(browser.page_source, 'lxml')

            # Get all the spans so we can get the "Place Order" button
            spans = html.find_all('span')
            for span in spans:
                if span.get_text().upper() == 'PLACE ORDER':
                    # Get the xpath
                    xpath = xpath_soup(span)
                    # Use the xpath to grab the browser element (so we can click it)
                    purchase_button_element = browser.find_element_by_xpath(xpath)
                    # Create object and add it to the list
                    purchase_button_element.click()
                    break

            # If the EULA prompt shows up, click it
            try:
                browser.find_element_by_xpath('''//span[contains(text(),'I Agree')]''')
            except:
                print()
            else:
                EU_Refund_and_Right_of_Withdrawal_Information = browser.find_element_by_xpath('''//span[contains(text(),'I Agree')]''')
                EU_Refund_and_Right_of_Withdrawal_Information.click()
            time.sleep(2)

            # We only want the first one (usually the game), so leave
            break

html = BeautifulSoup(browser.page_source, 'lxml')

# Check for, and close, the cookies banner
try:
    browser.find_element_by_xpath('''/html/body/div/div/div[4]/header/div/button/span''')
except:
    print()
else:
    cookies = browser.find_element_by_xpath('''/html/body/div/div/div[4]/header/div/button/span''')
    cookies.click()
    time.sleep(2)

# Get all the span tags to make sure we get every available game
spans = html.find_all('span')

# Create a list for all the game dictionaries
games = []
for span in spans:
    if span.get_text().upper() == 'FREE NOW':
        # Get the xpath
        xpath = xpath_soup(span)
        # Use the xpath to grab the browser element (so we can click it)
        browser_element = browser.find_element_by_xpath(xpath)
        # Create object and add it to the list
        games.append({'xpath': xpath, 'element': browser_element})

def poi_page_link(source):
    html = BeautifulSoup(source, 'lxml')
    table = html.find_all(class_='table table-bordered table-hover')
    links = table[0].find_all('a')
    links = [x['href'] for x in links]
    return links

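# Hedged usage sketch (the URL is a placeholder): fetch a listing page and pull the
# hrefs out of its first bordered table.
# source = requests.get('https://example.com/poi-list').text
# links = poi_page_link(source)
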
def get_americanlife_info(epno, throwException=True, extraStuff=None, verify=True):
    """
    Returns a tuple of title, year given the episode number for This American Life.
    """
    # first see if this episode of this american life exists...
    if extraStuff is None:
        resp = requests.get(
            'http://www.thisamericanlife.org/radio-archives/episode/%d' % epno,
            verify=verify)
    else:
        resp = requests.get(
            'http://www.thisamericanlife.org/radio-archives/episode/%d/%s' % (epno, extraStuff),
            verify=verify)
    if resp.status_code != 200:
        raise ValueError(
            "Error, could not find This American Life episode %d, because could not open webpage." % epno)
    enc = resp.headers['content-type'].split(';')[-1].split('=')[-1].strip().upper()
    if enc not in ('UTF-8', ):
        # decode explicitly from the declared charset (Python 3 form of the original unicode() call)
        html = BeautifulSoup(resp.content.decode(enc), 'lxml')
    else:
        html = BeautifulSoup(resp.text, 'lxml')
    elem_info_list = list(
        filter(
            lambda elem: 'class' in elem.attrs and 'top-inner' in elem.attrs['class']
            and 'clearfix' in elem.attrs['class'],
            html.find_all('div')))
    if len(elem_info_list) != 1:
        if throwException:
            raise ValueError(" ".join([
                "Error, cannot find date and title for This American Life episode #%d," % epno,
                "because could not get proper elem from HTML source."
            ]))
        else:
            return None
    elem_info = max(elem_info_list)
    date_list = list(
        filter(
            lambda elem: 'class' in elem.attrs and 'date' in elem.attrs['class'],
            elem_info.find_all('div')))
    if len(date_list) != 1:
        if throwException:
            raise ValueError(
                "Error, cannot find date and title for This American Life episode #%d." % epno)
        else:
            return None
    date_s = max(date_list).text.strip()
    date_act = datetime.datetime.strptime(date_s, '%b %d, %Y').date()
    year = date_act.year
    title_elem_list = list(
        filter(
            lambda elem: 'class' in elem.attrs and 'node-title' in elem.attrs['class'],
            elem_info.find_all('h1')))
    if len(title_elem_list) != 1:
        raise ValueError(
            "Error, cannot find date and title for This American Life episode #%d." % epno)
    title = max(title_elem_list).text.strip()
    title = titlecase.titlecase(':'.join(title.split(':')[1:]).strip())
    return title, year

def parse_object(object):
    repeat = True
    for city_data in cities:
        if (repeat == False):
            break
        url = object
        city = city_data[0]
        city_id = city_data[1]
        city_name = city_data[2]
        alternative_name = color = price_old = name = ram = rom = material = price_current = name_shop = address_shop = None
        url = url + "/shopdirections?cityId=" + city
        r = request(url)
        html = get_html(r)
        log.info("Parsing %s", url)
        try:
            count_shops = int(
                html.xpath('.//li[@class="c-tabs__menu-item active"]/a/span/text()')[0])
            print(count_shops)
        except:
            count_shops = None
        if ((count_shops != None) and (count_shops > 0)):
            span_showcase = html.xpath('.//span[@class="fl-embedded-wrapper"]')
            if (span_showcase != []):
                print("Here")
                continue
            div_old = html.xpath('.//div[@class="c-pdp-price__old"]')
            if (div_old != []):
                price_old = div_old[0].text
                price_old = ''.join(filter(str.isdecimal, price_old))
                price_old = float(price_old)
                print("Old price:", price_old)
            price_current = html.xpath(
                './/div[@class="c-pdp-price__current sel-product-tile-price"]/text()')[0]
            price_current = ''.join(filter(str.isdecimal, price_current))
            price_current = float(price_current)
            print("Current price:", price_current)
            code = html.xpath('.//p[@class="c-product-code"]')[0].text
            print("Product code:", code)
            brand = html.xpath("//ul[@class='c-breadcrumbs__list']")[0].xpath("li")[-1].xpath("a/span/text()")[0]
            print("Brand", brand)

            # Parse the item's specifications
            url_new = url + "/specification?ssb_block=descriptionTabContentBlock&cityId=" + city
            r = request(url_new)
            html = html_bs4(r)
            div_characteristic = html.find_all('div', {
                "class": "product-details-tables-holder sel-characteristics-table"
            })[0]
            characteristic_h3 = div_characteristic.find_all("h3")
            characteristic_table = div_characteristic.find_all("table")
            count_parameters = 5
            count_current = 0
            color_repeat = False
            for i, val in enumerate(characteristic_h3):
                if (count_current < count_parameters):
                    # if (count_current < count_parameters):
                    #     if (val.text == "Корпус"):
                    #         tr = characteristic_table[i].tbody.find_all("td")
                    #         for i, value in enumerate(tr):
                    #             if (value.span.get_text().strip() == "Материал корпуса"):
                    #                 material = tr[i + 1].span.get_text().strip()
                    #                 if (material.find("/") != -1):
                    #                     material = material.replace("/", ",")
                    #                 material = material.title()
                    #                 count_current += 1
                    #                 print(material)
                    if (count_current < count_parameters):
                        if (val.text == "Серия модели"):
                            tr = characteristic_table[i].tbody.find_all("td")
                            for i, value in enumerate(tr):
                                if (value.span.get_text().strip() == "Серия"):
                                    name = brand + " " + tr[i + 1].span.get_text().strip()
                                    count_current += 1
                    if (count_current < count_parameters):
                        if (val.text == "Цвет, размеры и вес"):
                            tr = characteristic_table[i].tbody.find_all("td")
                            for i, value in enumerate(tr):
                                if (value.span.get_text().strip() == "Цвет"):
                                    color = tr[i + 1].span.get_text().strip()
                                    if (color.find("/") != -1):
                                        color_repeat = True
                                    else:
                                        color = color.capitalize()
                                        if (color.find("Золотистый") != -1):
                                            color = "Золотой"
                                        if (color.find("Серый") != -1):
                                            color = "Серый"
                                    count_current += 1
                                    print("Color:", color)
                    if (count_current < count_parameters):
                        if (val.text == "Модель"):
                            tr = characteristic_table[i].tbody.find_all("td")
                            for i, value in enumerate(tr):
                                if (value.span.get_text().strip() == "Модель"):
                                    alternative_name = tr[i + 1].span.get_text().strip()
                                    alternative_name = alternative_name.upper()
                                    count_current += 1
                                    print("Alternative name:", alternative_name)
                    if (count_current < count_parameters):
                        if (val.text == "Память"):
                            tr = characteristic_table[i].tbody.find_all("td")
                            for i, value in enumerate(tr):
                                if (value.span.get_text().strip() == "Оперативная память (RAM)"):
                                    ram = tr[i + 1].span.get_text().strip()
                                    ram = ram.upper()
                                    index_ram = ram.find(" ")
                                    ram = float(ram[:index_ram]) * 1024
                                    count_current += 1
                                    print("RAM:", ram)
                                if (value.span.get_text().strip() == "Встроенная память (ROM)"):
                                    rom = tr[i + 1].span.get_text().strip()
                                    rom = rom.upper()
                                    index_rom = rom.find(" ")
                                    rom = float(rom[:index_rom]) * 1024
                                    count_current += 1
                                    print("ROM:", rom)
                    if (count_current < count_parameters):
                        if (val.text == "Память"):
                            tr = characteristic_table[i].tbody.find_all("td")
                            for i, value in enumerate(tr):
                                if (value.span.get_text().strip() == "Оперативная память (RAM)"):
                                    ram = tr[i + 1].span.get_text().strip()
                                    ram = ram.upper()
                                    index_ram = ram.find(" ")
                                    ram = float(ram[:index_ram]) * 1024
                                    count_current += 1
                                    print("Ram:", ram)
                                if (value.span.get_text().strip() == "Встроенная память (ROM)"):
                                    rom = tr[i + 1].span.get_text().strip()
                                    rom = rom.upper()
                                    index_rom = rom.find(" ")
                                    rom = float(rom[:index_rom]) * 1024
                                    count_current += 1
                                    print("Rom", rom)
                    if (color_repeat):
                        if (count_current < count_parameters):
                            if (val.text == "Служебная информация"):
                                tr = characteristic_table[i].tbody.find_all("td")
                                for i, value in enumerate(tr):
                                    if (value.span.get_text().strip() == "Базовый цвет"):
                                        color = tr[i + 1].span.get_text().strip()
                                        if (color.find("/") != -1):
                                            print()
                                            #exit(0)
                                        else:
                                            color = color.capitalize()
                                            if (color.find("Золотистый") != -1):
                                                color = "Золотой"
                                            if (color.find("Серый") != -1):
                                                color = "Серый"
                                        count_current += 1
                                        print("Color", color)
                else:
                    break

            # request the list of shops
            log.info(
                "%s - Name: %s | Color: %s | Alternative name: %s | Current price: %s | Old price: %s",
                code, name, color, alternative_name, price_current, price_old)
            url_shops = 'https://www.mvideo.ru/sitebuilder/blocks/browse/product-detail/tabs/availableStoresTable.jsp?productId=' + code + '&tab=list&cityId=' + city + '&ajax=true&json=true&take-today=&page=1&viewAll=true'
            #url = 'https://www.mvideo.ru/sitebuilder/blocks/browse/product-detail/tabs/availableStoresTable.jsp?productId=30028814&tab=list&cityId=CityCZ_2246&ajax=true&json=true&take-today=&page=1&viewAll=true'
            r = request_json(url_shops)
            log.info("Started parsing shops | %s", url)
            html = get_html_json(r["storeList"])
            li_items = html.cssselect('li.store-locator-list-item')
            for li in li_items:
                city_id = object[2]
                div_pickup = li.cssselect('div.pickup')[0]
                try:
                    hours = div_pickup.cssselect('p>span')[0].text.strip()
                except IndexError:
                    continue
                try:
                    div_stock = li.cssselect('div.stock>i')[0].get('class')
                except IndexError:
                    continue
                if ((div_stock.find("ico-stock-level-out-of") != -1)
                        or (div_stock.find("ico-stock-level-showcase") != -1)):
                    continue
                if (hours.find("Через час") != -1):
                    div_name_shop = li.cssselect('div.name')[0]
                    name_shop = str(div_name_shop.cssselect('h3>a')[0].text.strip())
                    split_name_shop = name_shop.split(", ")
                    address_shop = str(div_name_shop.cssselect('p')[0].text.strip())
                    if (address_shop.find("обл.") != -1):
                        address_shop = address_shop.replace("обл.", "область")
                    if (address_shop.find("МО") != -1):
                        address_shop = address_shop.replace("МО", "Московская область")
                    if (address_shop.find("р-н") != -1):
                        address_shop = address_shop.replace("р-н", "район")
                    if (address_shop.find("Всеволжский район") != -1):
                        address_shop = address_shop.replace("Всеволжский район", "Всеволожский район")
                    print(address_shop)
                    print(name_shop)
                    try:
                        count = li.cssselect('i.ico')[0].get("data-title")
                    except:
                        count = None
                    if ((count != None) and (count != "Привезем под заказ")
                            and (name_shop.find("Пункт выдачи") == -1)
                            and (address_shop.find("Пункт выдачи") == -1)):
                        print("1:", name_shop, "2:", address_shop)
                        # Check whether this shop already exists for these parameters
                        company = "М.Видео"
                        sql = """SELECT id FROM shop WHERE name=%s and address=%s and chain_stores_id=%s"""
                        data = [(name_shop, address_shop, 1)]
                        row = select_request_db(sql, data)
                        if (len(row) == 1):
                            shop_id = row[0]
                        else:
                            print("Error")
                            exit(0)
                        print("Shop ID", shop_id)
                        sql = """SELECT product.id from product where product.name=%s and product.color = %s and product.ram = %s and product.rom=%s"""
                        data = [(name, color, ram, rom)]
                        print(data)
                        row = select_request_db(sql, data)
                        print("Response", row)
                        if (len(row) == 1):
                            product_id = row[0]
                            print("PRODUCT VARIATION_ID", product_id)
                            try:
                                sql = """INSERT INTO price(shop_id, product_id, price_current, price_old, status_sales,url) VALUES (%s, %s, %s, %s, %s,%s)"""
                                data = [(shop_id, product_id, price_current, price_old, 1, url)]
                                date = insert_request_db(sql, data)
                                global count_succes
                                count_succes += 1
                                print("Number of objects added:", count_succes)
                            except Exception as e:
                                print("Error", e)
                                print("Failing data:", data)
                                print("Here")
                                exit(0)
                        else:
                            global count_lose
                            count_lose = count_lose + len(cities)
                            print("Number of objects not added:", count_lose)
                            repeat = False
                            break

def create_resort_list():
    '''
    Gets all of the necessary information from the resorts from onthesnow.com
    Returns a list of dictionaries
    '''
    page = requests.get(starting_url)
    page.raise_for_status()
    html = bs4.BeautifulSoup(page.text, 'html5lib')

    # creates a dictionary where the key is an id for the resort
    # and the entry is another dictionary containing all of the info
    resort_dictionary = {}

    # getting the rating of the resorts (out of 5 stars)
    rating_tags = html.find_all('b', class_='rating_small')
    rating_list = []
    for i in rating_tags:
        b_tag = i.find_all('b')
        rating = b_tag[1].text
        rating_list.append(rating)

    # getting the name and the links of the resorts
    resort_links = html.find_all('div', class_='name')
    resort_name_and_link = []
    for resort in resort_links:
        entry = []
        a_tag = resort.find('a')
        # name of the resort
        resort_name = a_tag['title']
        entry.append(resort_name)
        # link to the resort
        resort_url = a_tag['href']
        resort_url = 'http://www.' + limiting_domain + resort_url
        entry.append(resort_url)
        resort_name_and_link.append(entry)

    # adding the resort name, link, and rating to the dictionary
    for i in range(len(resort_name_and_link)):
        resort = resort_name_and_link[i][0]
        resort_dictionary[i] = create_dictionary()
        resort_dictionary[i]['Resort Name'] = resort
        link = resort_name_and_link[i][1]
        resort_dictionary[i]['link'] = link
        rate = rating_list[i]
        resort_dictionary[i]['Rating'] = rate

    # Looking at all of the individual ski resorts main page and obtaining
    # general info about each resort such as night skiing, terrain parks, etc.
    for resort in resort_dictionary:
        page = requests.get(resort_dictionary[resort]['link'])
        page.raise_for_status()
        html = bs4.BeautifulSoup(page.text, 'html5lib')

        # info: state
        region_info = html.find('div', class_='resort_header_inner_wrap')
        state = region_info.find('a')
        resort_dictionary[resort]['State'] = state['title']

        # info: num runs, % beginner, % intermediate, % advanced, % expert,
        # num terrain parks, whether they have night skiing
        terrain_table = html.find_all('p')
        terrain_info = []
        for p_tag in terrain_table:
            if p_tag.has_attr('class'):
                entry = []
                if 'label' in p_tag['class']:
                    entry.append(p_tag.text)
                if 'value' in p_tag['class']:
                    entry.append(p_tag.text)
                if entry != []:
                    terrain_info.append(entry)

        # entering the terrain_info into the dictionary
        for i in range(0, len(terrain_info), 2):
            info = terrain_info[i][0]
            if info in resort_dictionary[resort]:
                if info == 'Night Skiing':
                    resort_dictionary[resort]['Night Skiing'] = True
                else:
                    value = terrain_info[i + 1]
                    value = re.search('[0-9]+', value[0]).group()
                    resort_dictionary[resort][info] = value

        # info: open/close dates, elevation, num lifts, lift tickets
        resort_overview = html.find('div', class_='resort_overview resort_box module')
        td_tags = resort_overview.find_all('td')
        resort_overview_list = []
        for i in td_tags:
            class_name = i.find('span', class_='ovv_t t2')
            if class_name == None:
                resort_overview_list.append(i.text)

        # adding this info (resort_overview_list) to the dictionary
        open_close = resort_overview_list[0]
        open_close = re.findall('[0-9]+/[\s]*[0-9]+', open_close)
        open_close = open_close[0] + '-' + open_close[1]
        resort_dictionary[resort]['Open/Close Dates'] = open_close
        resort_dictionary[resort]['Elevation'] = resort_overview_list[1]
        resort_dictionary[resort]['Number Lifts'] = resort_overview_list[2]
        if 'N/A' in resort_overview_list[3]:
            resort_dictionary[resort]['Min Ticket Price'] = 'N/A'
            resort_dictionary[resort]['Max Ticket Price'] = 'N/A'
        else:
            tic_price = resort_overview_list[3]
            tic_price = re.findall('[0-9]+.[0-9]+', tic_price)
            resort_dictionary[resort]['Min Ticket Price'] = tic_price[0]
            resort_dictionary[resort]['Max Ticket Price'] = tic_price[1]

        # obtaining the average snowfall and adding to dictionary
        important_dates = html.find('div', id='resort_impdates')
        important_dates = important_dates.find_all('li')
        for date in important_dates:
            text = date.text
            if 'Average Snowfall' in text:
                average_snowfall = re.search('[0-9]+', text).group()
                resort_dictionary[resort]['Average Snowfall'] = average_snowfall

        # obtaining the town, address, and zip code of the resort
        resort_contact = html.find('div', id='resort_contact')
        address_info = resort_contact.find_all('p')
        address = address_info[1].text
        resort_dictionary[resort]['Address'] = address
        city = address_info[2].text
        zip_code = re.search('[0-9]+', city).group()
        # if the zip code starts with a 0
        if len(zip_code) == 4:
            resort_dictionary[resort]['Zip Code'] = '0' + zip_code
        else:
            resort_dictionary[resort]['Zip Code'] = zip_code
        # if a town is available for the resort
        if re.search('[A-Za-z]+', city) != None:
            town = re.search('[A-Za-z]+', city).group()
            resort_dictionary[resort]['City'] = town
        else:
            resort_dictionary[resort]['City'] = ''

        # obtaining the latitude and longitude of the resort
        lat, lon = latitude_and_longitude(resort_dictionary[resort]['Address'],
                                          resort_dictionary[resort]['City'],
                                          resort_dictionary[resort]['State'],
                                          resort_dictionary[resort]['Zip Code'])
        resort_dictionary[resort]['Latitude'] = lat
        resort_dictionary[resort]['Longitude'] = lon

'''--------------- getting homepage content for [urls], and storing relevant articles by [companies] in {articles} ---------------'''
problematic_urls = []
articles = []
for website in urls:
    web_content = requests.get(website)
    if (web_content.status_code == 403):
        problematic_urls.append(website)
        continue
    html = BeautifulSoup(web_content.text, "lxml")
    for link in html.find_all('a'):
        title = link.get_text().strip()
        for c in companies:
            if (c.lower() in title.lower()) and (c.lower() != title.lower()) and ('google+'.lower() != title.lower()):
                obj = {
                    'company': c,
                    'title': link.get_text().strip().split('\t')[0].split('\n')[0],
                    'url': link.get('href').split('?')[0],
                    'source': website,
                }
                articles.append(obj)

'''--------------- saving articles from {articles} to csv file ---------------'''