Example #1
def parse_one_page(html):
    headers = {
        'User-Agent':
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36',
        'Upgrade-Insecure-Requests': '1',
        'Referer':
        'http://search.people.com.cn/cnpeople/news/getNewsResult.jsp',
        'Host': 'search.people.com.cn',
        'Origin': 'http://search.people.com.cn',
        'Accept':
        'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Cache-Control': 'max-age=0'
    }

    requests.adapters.DEFAULT_RETRIES = 5
    s.keep_alive = False
    page = s.get(html, headers=headers).content  # 'html' here is actually the page URL
    soup = bfs(page, 'lxml')
    data = soup.select('p')
    head = soup.select('h1')
    #print(head.get_text())
    #write_to_file(head.get_text())
    for i in data:
        if not i.select('a'):  # keep only paragraphs that contain no links
            print(i.get_text())
Example #2
def get_maoyan_ten_movie(url_name):
    """获取十部电影的链接"""
    response = requests.get(url_name, headers=header)  # headers参数是为了尽量模拟浏览器的功能

    # check that the content can actually be fetched
    # print(response.text)
    print(f"Status code: {response.status_code}")

    # parse the content with BeautifulSoup
    soup = bfs(response.text, 'html.parser')

    # why couldn't this be iterated when written as a generator???
    # for divtag in soup.find_all('div', attrs={'class': 'movie-item-hover'}):
    #     for atag in divtag.find_all('a'):
    #         yield f"https://maoyan.com{atag.get('href')}"

    # ten_movie_url = []
    # for divtag in soup.find_all('div', attrs={'class': 'movie-item-hover'}):
    #     for atag in divtag.find_all('a'):
    #         ten_movie_url.append(f"https://maoyan.com{atag.get('href')}")

    # use a list comprehension instead
    ten_movie_url = [
        f"https://maoyan.com{atag.get('href')}" for divtag in soup.find_all(
            'div', attrs={'class': 'movie-item-hover'})[0:10]
        for atag in divtag.find_all('a')
    ]
    # print(ten_movie_url)
    return ten_movie_url
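The commented-out generator version above only fails if its result is then treated like a list (indexed or iterated more than once). A minimal sketch of a working generator variant, assuming the same module-level `header` dict and the same 'movie-item-hover' markup; the function name is hypothetical:

import requests
from bs4 import BeautifulSoup as bfs

def iter_maoyan_movie_urls(url_name):
    """Hypothetical generator variant of get_maoyan_ten_movie."""
    response = requests.get(url_name, headers=header)  # `header` assumed to exist, as above
    soup = bfs(response.text, 'html.parser')
    for divtag in soup.find_all('div', attrs={'class': 'movie-item-hover'})[:10]:
        for atag in divtag.find_all('a'):
            yield f"https://maoyan.com{atag.get('href')}"

# A generator can only be consumed once; wrap it in list() if it needs to be
# indexed or iterated repeatedly:
# ten_movie_url = list(iter_maoyan_movie_urls(url_name))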
Example #3
def getLinks(articleUrl):
    html = urlopen("http://en.wikipedia.org" + articleUrl)
    data = bfs(html, "html.parser")
    title = data.find("h1").get_text()
    content = data.find("div", {"id": "mw-content-text"}).find("p").get_text()
    store(title, content)
    return data.find("div", {"id": "bodyContent"}).findAll("a", href=reMatch)
Example #4
def getContent(html):
    global pics

    soup = bfs(html, 'html.parser')
    # fall back through heading levels until one is found
    title = soup.h1
    if title is None:
        title = soup.h2
    if title is None:
        title = soup.h3
    if title is None:
        title = soup.h4
    title = title.text if title is not None else 'None'

    # drop script and style blocks before extracting the visible text
    for script in soup.findAll('script'):
        script.extract()
    for style in soup.findAll('style'):
        style.extract()
    content = soup.get_text().replace('\n', '').replace('\u3000', '').replace('\xa0', '')

    pic_urls = pics.findall(html)

    return content, title, pic_urls
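getContent depends on a module-level compiled pattern named pics that pulls image URLs out of the raw HTML. The actual pattern is not shown in this collection; a hypothetical definition could look like this:

import re

# Hypothetical definition of the `pics` pattern used by getContent above:
# capture the src attribute of every <img> tag in the raw HTML string.
pics = re.compile(r'<img[^>]+src=["\']([^"\']+)["\']', re.IGNORECASE)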
Example #5
def get_information(url_name):
    for url_value in get_maoyan_ten_movie(url_name):
        # print(f"url地址:{url_value}")
        response_info = requests.get(url_value, headers=header)
        soup_info = bfs(response_info.text, 'html.parser')

        pd_list = []
        for tag in soup_info.find_all('div',
                                      attrs={'class':
                                             'movie-brief-container'}):
            # movie title
            movie_name = tag.find('h1', attrs={'class': 'name'}).text
            print(f"Movie title: {movie_name}")
            pd_list.append(f"Movie title: {movie_name}\n")

            movie_tag = tag.find_all('li', attrs={'class': 'ellipsis'})
            # genre
            # print(type(movie_tag[0].text))
            movie_type = " ".join(movie_tag[0].text.split("\n")).strip(" ")
            print(f'Genre: {movie_type}')
            pd_list.append(f'Genre: {movie_type}\n')

            # release date
            movie_time = movie_tag[2].text
            print(f'Release date: {movie_time}')
            pd_list.append(f'Release date: {movie_time}\r\n')
        with open('./homework1.csv', 'a+', encoding='utf-8') as movieinfo:
            movieinfo.write("".join(pd_list))
Example #6
def quotify(word):
    word_list = word.split(' ')
    if len(word_list) <= 2:
        pass
    else:
        word = word_list[-1]
    page = requests.get('https://www.brainyquote.com/search_results?q=' + word)
    soup = bfs(page.text, 'html.parser')
    try:
        word = word.lower()
        quotelist = soup.find(id='quotesList')
        quotes = quotelist.find_all(title='view quote')
        authors = quotelist.find_all(title='view author')
        choice = int(len(quotes) * (random.random())) - 1
        quote = quotes[choice].contents[0]
        author = authors[choice].contents[0]
        if len(word_list) <= 2 and (word not in quote):
            return quotify(word_list[-1])
        first = quote[:quote.lower().find(word)]
        keyword = quote[quote.lower().find(word):quote.lower().find(word) +
                        len(word)]
        second = quote[quote.lower().find(word) + len(word):]
        result = [first, keyword, second, author]
        #print('"%s"\n-"%s"'%(result['quote'],result['author']))
    except Exception as e:
        print(str(e))
        result = 0

    return result
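quotify returns either [first, keyword, second, author] or 0 on failure. A usage sketch that mirrors the commented-out print above (the search word is hypothetical):

# Hypothetical usage: reassemble the quote around the matched keyword.
parts = quotify('serendipity')
if parts:
    first, keyword, second, author = parts
    print(f'"{first}{keyword.upper()}{second}"\n- {author}')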
Example #7
    def get_activity_datetime(self, activity_id):
        """
        :param activity_id: String
        :return: datetime object.
        """
        url = "{site}/user/{profile}/activity/{activity_id}".format(
            site=self._runkeeper.site,
            profile=self._runkeeper.profile_username,
            activity_id=activity_id)
        try:
            activity_datetime_session = self.session.get(url)
        except:
            raise EndpointConnectionError

        soup = bfs(activity_datetime_session.text, "html.parser")
        form = soup.find('div', {'class': 'micro-text activitySubTitle'})

        activity_datetime = [
            date_params.split('-')[0].rstrip() for date_params in form
        ]
        activity_datetime = (''.join(activity_datetime))
        activity_datetime = datetime.strptime(activity_datetime,
                                              '%a %b %d %H:%M:%S %Z %Y')

        return activity_datetime
Example #8
def getHtml(uri):
    updateTime = []
    result = []

    url = HOST + uri

    res = requests.get(url, timeout=TIMEOUT)
    soup = bfs(res.text, "lxml")

    titleRes = soup.find_all(class_="detail_title")
    btnRes = soup.find_all(class_="dict_dl_btn")
    updateRes = soup.find_all(class_="show_content")
    pageNums = soup.find_all(["span", "default"])
    maxPage = 0
    for nums in pageNums:
        if not nums.text.isdigit(): continue
        if int(nums.text) > maxPage:
            maxPage = int(nums.text)

    for upt in updateRes:
        if upt.contents[0].find('-') < 0: continue
        updateTime.append(upt.contents)
    for tit, btn, upt in zip(titleRes, btnRes, updateTime):
        for tc, bc, uc in zip(tit.children, btn.children, upt):
            result.append({"title": tc.contents[0].strip(), "link": bc.attrs["href"], "updateTime": uc})

    return result
Example #9
    def __init__(self, url, page, signInUrl, email, password):
        self.url = url
        self.targetPage = urljoin(url, page)

        #Start Session
        self.session = rq.Session()

        #Get page
        self.getRequest = self.session.get(self.targetPage)

        #Check ok response
        self.getRequest.raise_for_status()

        self.bfs = bfs(self.getRequest.text, features='html5lib')

        #Sign in
        loginUrl = urljoin(url, signInUrl)
        self.session.get(loginUrl)
        csrfToken = self.session.cookies.get_dict()['csrftoken']
        form_data = {
            'email': email,
            'password': password,
            'csrfmiddlewaretoken': csrfToken
        }
        self.session.post(loginUrl,
                          data=form_data,
                          headers={'Referer': loginUrl})
Example #10
def get_citation_needed_count(url):
    """
    This gets the count of citations on a wiki page

    Args:
        url (str): Needs to be a valid url

    Returns:
        str: a message with the number of citations needed
    """

    res = requests.get(url)

    content = res.content

    soup = bfs(content, 'html.parser')

    first_el = soup.find(id='mw-content-text')

    find_cites = first_el.find_all(
        class_='noprint Inline-Template Template-Fact')

    citations = len(find_cites)

    print(f'Number of citations needed: {citations}\n')

    return f'Number of citations needed: {citations}'
Example #11
def getLinks(pageUrl):
    global page
    global reTest1
    try:
        html = urlopen(r"http://en.wikipedia.org" + pageUrl)
        soup = bfs(html, "html.parser")
    except Exception as e:
        print(e)
        return None

    try:
        print(soup.h1.get_text())  # print the page's h1 heading
    except Exception as e:
        print(e)

    try:
        result = soup.find("div", {
            "id": "bodyContent"
        }).findAll("a", href=reTest1)
    except Exception as e:
        print(e)
        return None
    for link in result:
        if 'href' in link.attrs:
            if link.attrs['href'] not in page:
                newPage = link.attrs['href']
                print(newPage)
                page.add(newPage)
                getLinks(newPage)
    return None
Example #12
    def getPhotoComments(self):
        '''
        :return: the photo's user comments as a string, or None if there are none;
        '''
        comment_str = ''
        comments_rest = self.flickr.photos.comments.getList(
            photo_id=self.photoId, format='rest')
        comments_lxml = bfs(comments_rest, 'lxml', from_encoding='utf8')

        comments = comments_lxml.find_all('comment')
        for comment in comments:
            try:
                # commentSearch = re.search(r'>.+?<',str(comment)).group(0)
                commentS = '{' + re.sub(
                    r'\[.+?\]|<[^>]+>|\s|\n|&.+?;|www\..+?;', ' ',
                    str(comment)) + '}'
                comment_str = comment_str + commentS

            except:

                continue
        if comment_str == '':

            return None
        else:

            return comment_str
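The regex stripping above largely re-implements what BeautifulSoup already provides. A minimal alternative sketch using Tag.get_text(), wrapped in a hypothetical helper that takes the same comments result set:

def comments_to_str(comments):
    """Hypothetical simplification: let BeautifulSoup extract the comment text."""
    comment_str = ''
    for comment in comments:
        text = comment.get_text(' ', strip=True)
        if text:
            comment_str += '{' + text + '}'
    return comment_str or None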
Example #13
def categoryChoice(response, url):
    soup = bfs(response.text, features="html.parser")
    catList = makeCategoryList(response, url)
    print("Vous pouvez choisir de scrapper une catégorie particulière.")
    print(
        "Si vous répondez 'non' à la question ci-dessous, la totalité du site sera scrappé."
    )
    choice = input(
        "Souhaitez-vous selectionner une catégorie (oui/non) : ").lower()
    if choice == "oui":
        print(50 * "-")
        print(
            "Choisissez la catégorie à scrapper parmis les catégories suivantes :"
        )
        print(50 * "-")
        for catNum, catName in enumerate(catList.keys()):
            print(catNum + 1, ":", catName)
        print(50 * "-")
        catChoice = input(
            "Nom de la catégorie que vous souhaitez scrapper : ").capitalize()
        if catChoice in catList.keys():
            print(f"La catégorie {catChoice} va être scrappée !")
            return catList[catChoice]
        else:
            print("Catégorie inconnue !")
            return False
    elif choice == "non":
        print("L'ensemble du site va être scrappé !")
        return catList["Books"]
    else:
        print("Choix invalide")
        return False
Example #14
def get_citations_needed_report(url: str) -> str:
    """
    Gives you the elements in on the page that needed cites

    Args:
        url (str): Valid URL on wikipedia

    Returns:
        str: All the paragraphs that need cites.
    """

    res = requests.get(url)

    content = res.content

    soup = bfs(content, 'html.parser')

    first_el = soup.find(id='mw-content-text')

    p_tag = first_el.find_all('p')

    show_which = ''

    for p in p_tag:
        if 'citation needed' in p.text:
            show_which += p.text + '\n'

    print(show_which.strip())
    return show_which
Example #15
def ipvoid():
    url = "https://www.ipvoid.com/ip-blacklist-check/"
    session = requests.Session()
    for ip in range(len(df.index)):
        current_ip = df.loc[ip, 'IP'].strip()

        try:
            pay_load = {"ip": current_ip}
            request = session.post(url, data=pay_load)
            soup = bfs(request.content, "html5lib")

            # print('configuring out {}'.format(df.loc[ip,'IP']))

            if len(soup.select('span.label.label-danger')) != 0:
                result = soup.select('span.label.label-danger')[0].get_text()
            elif len(soup.select('span.label.label-warning')) != 0:
                result = soup.select('span.label.label-warning')[0].get_text()
            else:
                result = soup.select('span.label.label-success')[-1].get_text()

            df.loc[ip, 'IPVOID'] = result

        except:
            df.loc[ip, 'IPVOID'] = 'NA'

        time.sleep(10)
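ipvoid works against a module-level pandas DataFrame df with an 'IP' column. A hypothetical setup and invocation, assuming the addresses come from a CSV file (both file names are placeholders):

import pandas as pd

# Hypothetical setup for the module-level DataFrame assumed by ipvoid():
# 'ips.csv' is a placeholder input file with a single 'IP' column.
df = pd.read_csv('ips.csv')
df['IPVOID'] = ''

ipvoid()
df.to_csv('ips_checked.csv', index=False)  # placeholder output path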
Example #16
def parse_one_page(html):
    page = s.get(html).content
    soup = bfs(page, 'lxml')
    data = soup.select('p')
    for i in data:
        if not i.select('a'):
            print(i.get_text())
            write_to_file(i.get_text())
Example #17
def walkto(URL, find):
	r = requests.get(URL)
	data = r.text
	soup = bfs(data,"lxml")
	texts = []
	# texts
	for txts in soup.find_all(find):
		texts.append(txts.getText())
	return texts
Example #18
def get_links():
    html = requests.get('http://myjorney.tistory.com/category/%EC%BD%94%EB%94%A9/PYTHON%20%EA%B8%B0%EB%B3%B8%EB%AC%B8%EB%B2%95').text
    soup = bfs(html, 'html.parser')

    data = []
    for url in soup.select('#body > ul > li:nth-child(1) > a'):
        data.append(url['href'])

    return data
Example #19
def retrieve_project_urls(url, css_sel='a.list-proj-name'):
    """
    Retrieve the list of url for projects or proposals,
    based on the url fof the page that lists them
    """
    r = requests.get(url)
    soup = bfs(r.text, 'lxml')
    link_tags = soup.select(css_sel)
    return [x.attrs['href'][3:] for x in link_tags]
Example #20
def retrieve_project_urls(url, css_sel='a.list-proj-name'):
    """
    Retrieve the list of url for projects or proposals,
    based on the url fof the page that lists them
    """
    r = requests.get(url)
    soup = bfs(r.text, 'lxml')
    link_tags = soup.select(css_sel)
    return [x.attrs['href'][3:] for x in link_tags]
Example #21
def getUrls(originUrl, html):
    data = bfs(html, features="html5lib")

    urls = []
    for elem in data.find_all('a', href=re.compile('.+')):
        href = elem['href']
        url = validateHref(href, originUrl)
        if url and url not in urls:
            urls.append(url)
    return urls
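getUrls relies on a validateHref helper that is not shown in this collection. A hypothetical version built on urllib.parse, returning a parsed URL object (so the result still supports geturl()) or None:

from urllib.parse import urljoin, urlparse

def validateHref(href, originUrl):
    """Hypothetical stand-in: resolve href against its page and keep only http(s) links."""
    resolved = urlparse(urljoin(originUrl, href))
    if resolved.scheme in ('http', 'https'):
        return resolved
    return None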
Example #22
def bfs_process(path, file_name):
    split_file = split_file_part(path, file_name)
    content = []

    for part in split_file:
        soup = bfs(part, 'html.parser')
        cnt = ''.join(soup.content.string)  # text inside the part's <content> tag
        content.append(cnt)

    return content
Example #23
def crawl_document(link):
	response = urllib2.urlopen(link)
	out = response.read()
	soup = bfs(out)
	buff = []
	paragraphs = soup.find_all("p", class_=False)
	for paragraph in paragraphs:
		content = str(paragraph).strip("<p>").strip("</p>")
		buff.append(content)
	text = "".join(buff)
	print text
Example #24
def scraper_securite_routiere(urlPage):

    hdr = {'User-Agent': 'Mozilla/5.0'}
    req = urllib2.Request(urlPage, headers=hdr)
    page1 = urllib2.urlopen(req)

    # use BeautifulSoup to parse the page
    soup = bfs(page1, 'html.parser')

    indicateurs = soup.find('div', attrs={'class': 'cadre_avec_fleches'})

    for i, link in enumerate(indicateurs.findAll('a')):

        downloadUrl = homepageUrl + link.get('href')

        if downloadUrl.endswith('.pdf'):
            #print downloadUrl
            lesUrls.append(downloadUrl)
            filenames.append(indicateurs.select('a')[i].attrs['href'])
        else:
            # scrape the second page
            req2 = urllib2.Request(downloadUrl, headers=hdr)
            page2 = urllib2.urlopen(req2)
            soup2 = bfs(page2, 'html.parser')
            #print soup2
            for cpt, lien in enumerate(soup2.find_all('a')):
                if lien.get('href').endswith('pdf'):
                    downloadlink = homepageUrl + lien.get('href')
                    lesUrls.append(downloadlink)
                    filenames.append(downloadlink)

    names_urls = zip(filenames, lesUrls)

    for name, url in names_urls:
        url = urllib2.quote(url.encode('utf8'), ':/')
        rq = urllib2.Request(url, headers=hdr)
        res = urllib2.urlopen(rq)
        # rfind returns the last occurrence of a character in a string
        pdf = open(name[name.rfind('/') + 1:], 'wb')
        pdf.write(res.read())
        pdf.close()
Example #25
def parse_youtube():
    url = 'https://www.youtube.com/results?search_query=개발자'

    html = req.get(url).text
    soup = bfs(html, 'html.parser')

    data = {}

    for tag in soup.select('li > div > div > div.yt-lockup-content > h3 > a'):
        data[tag.text] = 'https://www.youtube.com/' + tag['href']

    return data
Example #26
def map_image(detail_url):
    # parse the latitude/longitude from the DiningCode restaurant detail URL
    html = req.get(detail_url).text
    soup = bfs(html, 'html.parser')

    soup_lat = soup.select('#hdn_lat')  # latitude
    soup_lng = soup.select('#hdn_lng')  # longitude

    if soup_lat is not None and len(
            soup_lat) > 0 and soup_lng is not None and len(soup_lng) > 0:

        latitude = soup_lat[0]['value']
        longitude = soup_lng[0]['value']

        real_latitude = float(latitude)
        real_longitude = float(longitude)

        # use folium to create a map HTML file with the restaurant marked
        food_location = [real_latitude, real_longitude]
        map = folium.Map(location=food_location, zoom_start=25)
        folium.Marker(food_location, popup='destination').add_to(map)
        map.save('./location.html')
        map

        # use selenium to capture the map HTML file as a static image
        browser = webdriver.Chrome(
            'C:/Users/yurim/Desktop/chromedriver.exe')  # path to chromedriver must be provided
        browser.get(
            'C:/Users/yurim/Documents/GitHub/capstone-capyou/code/complete_code/location.html'
        )  # path to the map HTML file
        browser.save_screenshot('restaurant_location.png')
        #time.sleep(2)
        #browser.quit()  # commented out so the interactive map window stays open
        # the HTML window closes automatically on the next search or on exit; closing it manually before then raises an error

        # reply via the slackbot with the image file saved above
        map_image_file = {
            'file':
            ('restaurant_location.png', open('restaurant_location.png',
                                             'rb'), 'png')
        }

        map_image_file_detail = {
            "filename": "restaurant_location.png",
            "token": token,
            "channels": ['#general']
        }
        r = req.post("https://slack.com/api/files.upload",
                     params=map_image_file_detail,
                     files=map_image_file)

    else:
        return
Example #27
    def create_new_activity(self, activity_type, activity_file=None):
        activity_type = activity_type.upper()
        url = '{site}/new/activity'.format(site=self.site)

        with open(activity_file, 'r') as myfile:
            data_str = myfile.read().replace('\n', '')
        files = {
            'trackFile': (activity_file, open(activity_file,
                                              'rb'), 'multipart/form-data')
        }
        try:
            new_activity_form = self.session.get(url)
        except:
            raise EndpointConnectionError

        soup = bfs(new_activity_form.text, "html.parser")
        activities_form = soup.find_all('li', {'class': 'activityTypeItem'})
        activity_types = [
            act_type.attrs['data-value'] for act_type in activities_form
        ]
        hidden_elements = self.__get_hidden_elements('new/activity')

        if not activity_types:
            raise NoActivityTypesFound

        if activity_type not in activity_types:
            raise ActivityTypeUnknown

        hidden_elements['activityType'] = activity_type
        hidden_elements.update(self.__populate_activity_gpx(activity_file))

        file_hidden_elements = {k: v for k, v in hidden_elements.items()}
        file_hidden_elements['trackFile'] = data_str
        file_hidden_elements['heartRateGraphJson'] = ''
        file_hidden_elements['route'] = ''
        file_hidden_elements['averageHeartRate'] = ''
        file_hidden_elements['hrmFile'] = ''
        file_hidden_elements['activityViewableBy'] = ''
        file_hidden_elements['calories'] = ''
        file_hidden_elements['notes'] = ''

        if activity_file.endswith('.gpx'):
            file_hidden_elements['uploadType'] = '.gpx'
        else:
            raise UnknownFileType
        try:
            if self.upload_activity(activity_file):
                new_activity_post = self.session.post(
                    url, data=file_hidden_elements, files=files)
                return new_activity_post
        except Exception as e:
            raise ErrorUploadingTrack(e)
Example #28
def getEndPage(uri):
    url = HOST + uri

    res = requests.get(url, timeout=TIMEOUT)
    soup = bfs(res.text, "lxml")

    pageNums = soup.find_all(["span", "default"])
    maxPage = 0
    for nums in pageNums:
        if not nums.text.isdigit(): continue
        if int(nums.text) > maxPage:
            maxPage = int(nums.text)
    return maxPage + 1
Example #29
def walkto2(URL):
	r = requests.get(URL)
	data = r.text
	soup = bfs(data,"lxml")
	doors = []
	texts = []
	# links
	for link in soup.find_all('a'):
		doors.append(link.get('href'))
	# texts
	for txts in soup.find_all('p'):
		texts.append(txts.getText())
	return doors, texts
Example #30
def get_url():
    arr = {}
    for n in range(59, 1000):
        soup = bfs(requests.get(url + format(n) + "/").content,
                   "html.parser").find("div", {
                       "id": "wallwindow"
                   }).findChildren()
        for img in soup:
            if (not "WP_" in img.get('src')): pass
            else:
                arr[n] = img.get('src')
                for value in arr:
                    print value, arr[value]
Example #31
def get_docs(address):
    tr = tar.open(address, "r:gz", encoding="latin-1")

    for member in tr.getmembers():
        tr_file = tr.extractfile(member)
        if tr_file is not None:
            content = tr_file.read()
            text = content.decode('utf-8', 'ignore')
            docs = text.split("</REUTERS>")
            for doc in docs:
                filtered = bfs(doc, features="html.parser").get_text()
                yield filtered
    return 3
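A hedged usage sketch for get_docs, assuming a local Reuters-21578 style archive (the path is a placeholder):

# Hypothetical usage: stream the first few plain-text documents out of the
# archive without loading the whole corpus into memory.
for i, doc in enumerate(get_docs('reuters21578.tar.gz')):
    print(doc[:200])
    if i == 2:
        break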
Example #32
def getting_html(site, headers):
    try:
        session = requests.Session()
        request = session.get(site, headers=headers)
        status = request.status_code
        page_html = bfs(request.text, "lxml")
        if status == 200:

            return getting_url(page_html)

    except Exception as name_error:
        print("Error!", name_error)
        print(sys.exc_info()[1])
Example #33
def get_content(link):
    request_headers = {
        'User-Agent':
        ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 '
         '(KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'),
        'Referer':
        'http://myjorney.tistory.com',  # news home
    }
    abs_link = 'http://myjorney.tistory.com' + link
    html = req.get(abs_link, headers=request_headers).text
    soup = bfs(html, 'html.parser')
    # you could do something useful with the fetched data here,
    # but for now just take a quick look at it.
    print(soup.select('#head > h2 > a')[0].text)  # print the first heading link
Example #34
def crawl_index(filename="sampleB.html"):
	soup = bfs(open(filename))
	mainContent = soup.find("div", class_="main-inner")
	dates = mainContent.find_all("p")
	links = mainContent.find_all("ul")
	for idx, date in enumerate(dates):
		# bs4 will also extract "\n" from the text
		string = date.find("strong").string.split(" / ")[0]
		day, year = string.split(", ")
		print "start crawling for date %s year %s...."%(day, year)
		currentLinks = links[idx]
		a = currentLinks.find_all('a')
		for link in a:
			link = "http://www.cdc.gov/mmwr/preview/" + link.get("href")
			print "link %s for the %s year %s"%(link, day, year)
Example #35
def getLinks(historicalIndexLink):
	try:
		response = urllib2.urlopen(historicalIndexLink)
		content = response.read()
		soup = bfs(content)
		mainContent = soup.find("div", class_="main-inner")
		links = mainContent.find_all("a") #traverse the tree find all <a>
		hrefs = [link.get("href") for link in links if link.get("href") is not None]
		hrefs = map(lambda x: "http://www.cdc.gov" + x, hrefs)
		return hrefs
	except urllib2.HTTPError as e1:
		print "please check your url for %s"%(historicalIndexLink,), e1.reason
		return [] 
	except urllib2.URLError as e2:
		print "please check your link for %s"%(historicalIndexLink), e2.reason
		return []
Example #36
    def create_new_activity(self, activity_type, activity_file=None):
        activity_type = activity_type.upper()
        url = '{site}/new/activity'.format(site=self.site)

        with open(activity_file, 'r') as myfile:
            data_str = myfile.read().replace('\n', '')
        files = {'trackFile': (activity_file, open(activity_file, 'rb'), 'multipart/form-data')}
        try:
            new_activity_form = self.session.get(url)
        except:
            raise EndpointConnectionError

        soup = bfs(new_activity_form.text, "html.parser")
        activities_form = soup.find_all('li', {'class': 'activityTypeItem'})
        activity_types = [act_type.attrs['data-value'] for act_type in activities_form]
        hidden_elements = self.__get_hidden_elements('new/activity')

        if not activity_types:
            raise NoActivityTypesFound

        if activity_type not in activity_types:
            raise ActivityTypeUnknown

        hidden_elements['activityType'] = activity_type
        hidden_elements.update(self.__populate_activity_gpx(activity_file))

        file_hidden_elements = {k: v for k, v in hidden_elements.items()}
        file_hidden_elements['trackFile'] = data_str
        file_hidden_elements['heartRateGraphJson'] = ''
        file_hidden_elements['route'] = ''
        file_hidden_elements['averageHeartRate'] = ''
        file_hidden_elements['hrmFile'] = ''
        file_hidden_elements['activityViewableBy'] = ''
        file_hidden_elements['calories'] = ''
        file_hidden_elements['notes'] = ''

        if activity_file.endswith('.gpx'):
            file_hidden_elements['uploadType'] = '.gpx'
        else:
            raise UnknownFileType
        try:
            if self.upload_activity(activity_file):
                new_activity_post = self.session.post(url, data=file_hidden_elements, files=files)
                return new_activity_post
        except Exception as e:
            raise ErrorUploadingTrack(e)
Example #37
    def profile_username(self):
        """
        Get profile username or ID once logged in by using Session object
        :return: str
        """
        if not self.__profile_username:
            url = "{site}/home".format(site=self.site)
            try:
                home = self.session.get(url)
            except:
                raise EndpointConnectionError

            soup = bfs(home.text, "html.parser")
            profile_url = soup.find('a', {'href': re.compile('/user/[a-zA-Z]|[0-9]/profile')})

            try:
                self.__profile_username = profile_url.attrs['href'].split('/')[2]
            except IndexError:
                raise ProfileNotFound

        return self.__profile_username
Example #38
    def get_activity_datetime(self, activity_id):
        """
        :param activity_id: String
        :return: datetime object.
        """
        url = "{site}/user/{profile}/activity/{activity_id}".format(site=self._runkeeper.site,
                                                                    profile=self._runkeeper.profile_username,
                                                                    activity_id=activity_id)
        try:
            activity_datetime_session = self.session.get(url)
        except:
            raise EndpointConnectionError

        soup = bfs(activity_datetime_session.text, "html.parser")
        form = soup.find('div', {'class': 'micro-text activitySubTitle'})

        activity_datetime = [date_params.split('-')[0].rstrip() for date_params in form]
        activity_datetime = (''.join(activity_datetime))
        activity_datetime = datetime.strptime(activity_datetime, '%a %b %d %H:%M:%S %Z %Y')

        return activity_datetime
Example #39
    def __get_hidden_elements(self, endpoint):
        """
        Retrieve all <hidden> parameters from requested form
        :return: dict
        """
        url = "{site}/{endpoint}".format(site=self.site,
                                         endpoint=endpoint)
        try:
            endpoint_form = self.session.get(url)
        except:
            raise EndpointConnectionError

        soup = bfs(endpoint_form.text, "html.parser")

        try:
            form = soup.find_all('input', {'type': 'hidden'})
        except:
            raise HiddenElementsNotFound

        hidden_elements = {element.attrs['name']: element.attrs['value'] for element in form}

        return hidden_elements
Example #40
__Date__ = '11/10/2015'
"""
From "http://www.practicepython.org/":
16) Use the BeautifulSoup and requests Python packages to print out a list of all
the article titles on the New York Times homepage.
http://www.nytimes.com/
"""
from bs4 import BeautifulSoup as bfs
import requests
nyurl = 'http://www.nytimes.com/'
html = requests.get(nyurl)
soup = bfs(html.text, "html.parser")
for story_heading in soup.find_all(class_="story-heading"):
    if story_heading.a:
        print(story_heading.a.text.replace("\n", " ").strip())
    else:
        print(story_heading.contents[0].strip())
Example #41
def get_page_links(volumePage):
	soup = bfs(volumePage)
	mainContent = soup.find("div", class_="mSyndicate")
	links = mainContent.find("a")